In [2]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [3]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [4]:
# file_name = "Project_CodeNet_Python800.tar.gz"
# data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# # Download tar archive to local disk
# with open(file_name, "wb") as f:
#     f.write(requests.get(data_url).content)
    
# # Extract contents of archive to local disk
# if os.path.exists("data"):
#     shutil.rmtree("data")    
# with tarfile.open(file_name) as tfile:
#     tfile.extractall()

In [5]:
# file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
# len(file_paths)

In [6]:
# file_paths[0:10]

In [7]:
lits = json.load(open("literals.json"))

def process_string(token, special_chars={" ": "U+0020", ",": "U+002C"}):
    str_quote_options = ["'''", '"""', "'", '"']
    start_quote = ""
    end_quote = ""
    qualifier_regex = r"^[a-z]+"
    qualifier_match = re.search(qualifier_regex, token)
    # string qualifiers like 'r' for regex, 'f' for formatted string, 'b' for bytes, 'u' for unicode, etc (or combination of them)
    qualifier = "" if not qualifier_match else qualifier_match[0]
    # token string without qualifiers
    token_string = re.sub(qualifier_regex, "", token)
    # string literal without quotes
    str_lit = token_string
    for q in str_quote_options:
        if token_string.startswith(q):
            start_quote = q
            str_lit = str_lit[len(q) :]
            if token_string.endswith(q):
                end_quote = q
                str_lit = str_lit[: -len(q)]
            break
    # if start_quote in str_quote_options[:2]:
    #     return ""
    for sc in special_chars:
        str_lit = str_lit.replace(sc, special_chars[sc])
    return (
        f"{qualifier}{start_quote}<STR_LIT:{str_lit}>{end_quote}"
        if str_lit in lits['str']
        else f"{qualifier}{start_quote}<STR_LIT>{end_quote}"
    )

def py_tokenize():
    file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
    wf = open(os.path.join(os.getcwd(), f"full_corpus.txt"), 'w')
    local_corpus = []
    for path in file_paths:
        try:
            code = open(path).read()
            token_gen = tokenize(BytesIO(bytes(code, "utf8")).readline)
            out_tokens = []
            prev_eol = False
            for toknum, tokval, _, _, _ in token_gen:
                tokval = " ".join(tokval.split())
                if toknum == STRING:
                    add_token = process_string(tokval)
                    out_tokens.append(add_token)
                    prev_eol = False
                elif toknum == NUMBER:
                    if tokval in lits['num']:
                        out_tokens.append(f"<NUM_LIT:{tokval}>")
                    else:
                        out_tokens.append(f"<NUM_LIT>")
                    prev_eol = False
                elif toknum in [NEWLINE, NL]:
                    if not prev_eol:
                        out_tokens.append("<EOL>")
                        prev_eol = True
                elif toknum in [COMMENT, INDENT, ENCODING, ENDMARKER] or len(tokval) == 0:
                    continue
                else:
                    out_tokens.append(tokval)
                    prev_eol = False
            if out_tokens[0] == "<EOL>":
                out_tokens = out_tokens[1:]
            if out_tokens[-1] == "<EOL>":
                out_tokens = out_tokens[:-1]
        except Exception:
            out_tokens = []
#         local_corpus.extend((" ".join(out_tokens)).split('<EOL>'))
#         out_tokens = ["<s>"] + out_tokens + ["</s>"]
        out = " ".join(out_tokens)
        local_corpus.append(out)
        wf.write(out+"\n")
    print(f"Full Corpus is done")
    wf.close()
    return local_corpus

def read_corpus():
    corpus = py_tokenize()
    full_corpus = ''.join(corpus)
    corpus_new = []
    for code in corpus:
        corpus_new.extend(code.split('<EOL>'))
        
    return pd.DataFrame(corpus_new)

In [8]:
# train_corpus = read_corpus()

In [9]:
# train_corpus[0:20]

In [10]:
corpus = open('full_corpus.txt', encoding='utf8').readlines()

In [11]:
text_corpus = []
for code in corpus:
    text_corpus.extend(code.split(' <EOL> '))

In [12]:
text_corpus[0:10]

['e = enumerate',
 'n , * a = map ( int , open ( <NUM_LIT:0> ) . read ( ) . split ( ) )',
 'd = [ <NUM_LIT:0> ]',
 'for j , ( a , i ) in e ( sorted ( ( a , i ) for i , a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ) , d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1> , j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ]',
 'print ( max ( d ) )\n',
 'N = int ( input ( ) )',
 'A = list ( map ( int , input ( ) . split ( ) ) )',
 'table = [ ]',
 'for i , a in enumerate ( A ) :',
 'table . append ( [ a , i ] )']

In [13]:
train_sent = text_corpus[0:int(0.8*len(text_corpus))]
test_sent = text_corpus[int(0.8*len(text_corpus)):]

In [14]:
test_sent[0:10]

['exit ( )',
 'elif <NUM_LIT:0> < int ( S [ <NUM_LIT:2> : ] ) <= <NUM_LIT:12> :',
 "print ( '<STR_LIT>' )",
 'exit ( )',
 'else :',
 "print ( '<STR_LIT>' )",
 'exit ( )\n',
 'S = input ( )',
 'x = int ( S [ : <NUM_LIT:2> ] )',
 'y = int ( S [ <NUM_LIT:2> : <NUM_LIT:4> ] )']

In [15]:
import pickle
with open("train_sent.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(train_sent))
with open("test_sent.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(test_sent))

In [16]:
import torch
print (torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


In [17]:
torch.cuda.set_device(0)
print(torch.cuda.current_device())

0


In [18]:
from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling, RobertaTokenizer

In [19]:
# tokenizer = OpenAIGPTTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

In [20]:
model = OpenAIGPTLMHeadModel.from_pretrained('congcongwang/gpt2_medium_fine_tuned_coder')

You are using a model of type gpt2 to instantiate a model of type openai-gpt. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at congcongwang/gpt2_medium_fine_tuned_coder were not used when initializing OpenAIGPTLMHeadModel: ['transformer.h.6.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.21.attn.masked_bias', 'transformer.h.23.attn.masked_bias', 'transformer.h.22.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.20.attn.masked_bias', 'transformer.ln_f.weight', 'transformer.h.5.attn.masked_bias', 'transformer.ln_f.bias', 'transformer.wpe.weight', 'transformer.h.13.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.17.attn.masked_bias', 'transfor

In [21]:
use_cuda = torch.cuda.is_available()
torch.cuda.empty_cache()
device = torch.device("cuda" if use_cuda else "cpu")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3], dim=0)
if use_cuda:
    model = model.cuda()

In [22]:
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))

vocabulary size: 50265, max sequence length: 512


In [39]:
inputs = tokenizer(train_sent[0], return_tensors="pt")
print(inputs)

{'input_ids': tensor([[    0,   242,  5457, 41949,   877,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [40]:
tokenizer.convert_ids_to_tokens([    0,   242,  5457, 41949,   877,     2])

['<s>', 'e', 'Ġ=', 'Ġenumer', 'ate', '</s>']

In [25]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [26]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train_sent.txt',
    overwrite_cache=True,
    block_size=19)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test_sent.txt',
    overwrite_cache=True,
    block_size=19)



In [27]:
!set os.environ["WANDB_DISABLED"] = "true"

In [28]:
training_args = TrainingArguments(
    output_dir = '/scratch1/rgoli/gpt_model', 
    overwrite_output_dir = True, 
    per_device_train_batch_size = 64, 
    per_device_eval_batch_size = 64, 
    learning_rate = 5e-4, 
    num_train_epochs = 3,
)
# Initializing the trainer class object that will do the training
# here the data collator will generate the batch of size 64 of train and test data
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)


In [29]:
# Training the model for 3 epochs
trainer.train()

***** Running training *****
  Num examples = 2425650
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 28428
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohangoli[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
500,4.4392
1000,4.2931
1500,4.276
2000,4.2707
2500,4.2625
3000,4.25
3500,4.2429
4000,4.2383
4500,4.2195
5000,4.2157


Saving model checkpoint to /scratch1/rgoli/gpt_model/checkpoint-500
Configuration saved in /scratch1/rgoli/gpt_model/checkpoint-500/config.json
Model weights saved in /scratch1/rgoli/gpt_model/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model/checkpoint-1000
Configuration saved in /scratch1/rgoli/gpt_model/checkpoint-1000/config.json
Model weights saved in /scratch1/rgoli/gpt_model/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model/checkpoint-1500
Configuration saved in /scratch1/rgoli/gpt_model/checkpoint-1500/config.json
Model weights saved in /scratch1/rgoli/gpt_model/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model/checkpoint-2000
Configuration saved in /scratch1/rgoli/gpt_model/checkpoint-2000/config.json
Model weights saved in /scratch1/rgoli/gpt_model/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model/checkpoint-2500
Configuration save

TrainOutput(global_step=28428, training_loss=4.174119551339128, metrics={'train_runtime': 11140.435, 'train_samples_per_second': 653.202, 'train_steps_per_second': 2.552, 'total_flos': 0.0, 'train_loss': 4.174119551339128, 'epoch': 3.0})

In [30]:
trainer.save_model('./saved')

Saving model checkpoint to ./saved
Configuration saved in ./saved/config.json
Model weights saved in ./saved/pytorch_model.bin


In [31]:
# Checkpoint save for each 3000 steps instead of 500 steps

# training_args = TrainingArguments(
#     output_dir='./saved',
#     overwrite_output_dir=True,
#     num_train_epochs=1,
#     per_device_train_batch_size=8,
#     logging_steps=3000,
#     save_steps=3000,
#     save_total_limit=2,
#     seed=1,
#     fp16=True
# )

# trainer.train()

# trainer.save_model('./saved')

## For more details https://discuss.huggingface.co/t/loading-model-from-checkpoint-after-error-in-training/758/4

In [32]:
## Multi-Device GPU training on PyTorch
# https://pythonmana.com/2021/07/20210707185846843x.html

#  Load model

# model = nn.DataParallel(model)
# model = model.cuda()

# Directly, of course device_ids It's fine too ：

# net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
# model = model.cuda()

# Load data

# inputs = inputs.cuda()
# labels = labels.cuda()

In [33]:
# Evaluating on Test data
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 603781
  Batch size = 256


{'eval_loss': 4.8302130699157715,
 'eval_runtime': 296.9818,
 'eval_samples_per_second': 2033.057,
 'eval_steps_per_second': 7.943,
 'epoch': 3.0}

In [35]:
generator = pipeline('text-generation', tokenizer='microsoft/codebert-base-mlm', model='saved')

loading configuration file saved/config.json
Model config OpenAIGPTConfig {
  "_name_or_path": "congcongwang/gpt2_medium_fine_tuned_coder",
  "activation_function": "gelu_new",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "vocab_size": 50260
}

loading configuration file saved/config.json
Model 

In [37]:
'''
generating  next word in 3 possible ways
1. Greedy Search : chooses the best possible next word based on highest probability from 1 hypothesis
2. Beam Search : chooses the high probability next word from n hypothesis
3. Random Sampling : chooses random next word from possible hypothesis , however as the temperature is set high , it will
   ignore low probability words.
'''

print(generator('print', max_length=5)[0]['generated_text'])
print(generator('print', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('print' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


print actresses Mueller nerv Mueller


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


print Maiden IntegNormally aloud
print Previously rising aloud assured


In [None]:
print(generator('print', max_length=5)[0]['generated_text'])
print(generator('print', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('print' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

In [38]:
print(generator('for i in', max_length=5)[0]['generated_text'])
print(generator('for i in', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('for i in' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in agg regards


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in202 plagued
for i in plagued Jonathan


In [None]:
def predict_next():
    from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,\
    TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling
    import re 
    from nltk.tokenize import word_tokenize
    
    
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    generator = pipeline('text-generation', tokenizer='openai-gpt', model='gpt_model') 
    while(True):
        text = input('Enter the text: ')
        length= len(tokenizer.encode(text, return_tensors='pt')[0])
        
        max_length = length+1
    
        print('Next Word: ')
        print(generator(text , max_length=max_length)[0]['generated_text'].split(' ')[-1])
        print(generator(text , max_length=max_length , num_beams = 5)[0]['generated_text'].split(' ')[-1])
        print(generator(text , max_length=max_length , do_sample=True,temperature = 0.7)[0]['generated_text'].split(' ')[-1])

In [None]:
predict_next()

In [31]:
import wandb
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [23]:
torch.cuda.empty_cache()

In [80]:
import gc
gc.collect()

1274

In [1]:
!nvidia-smi

Sat Nov 27 15:11:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1A:00.0 Off |                    0 |
| N/A   33C    P0    39W / 300W |     23MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:1C:00.0 Off |                    0 |
| N/A   31C    P0    40W / 300W |     23MiB / 16160MiB |      0%      Default |
|       

In [27]:
# !nvidia-smi --gpu-reset