In [1]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [36]:
file_name = "Project_CodeNet_Python800.tar.gz"
data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# Download tar archive to local disk
with open(file_name, "wb") as f:
    f.write(requests.get(data_url).content)
    
# Extract contents of archive to local disk
if os.path.exists("data"):
    shutil.rmtree("data")    
with tarfile.open(file_name) as tfile:
    tfile.extractall()

In [4]:
# file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
# len(file_paths)

In [5]:
# file_paths[0:10]

In [6]:
lits = json.load(open("literals.json"))

def process_string(token, special_chars={" ": "U+0020", ",": "U+002C"}):
    str_quote_options = ["'''", '"""', "'", '"']
    start_quote = ""
    end_quote = ""
    qualifier_regex = r"^[a-z]+"
    qualifier_match = re.search(qualifier_regex, token)
    # string qualifiers like 'r' for regex, 'f' for formatted string, 'b' for bytes, 'u' for unicode, etc (or combination of them)
    qualifier = "" if not qualifier_match else qualifier_match[0]
    # token string without qualifiers
    token_string = re.sub(qualifier_regex, "", token)
    # string literal without quotes
    str_lit = token_string
    for q in str_quote_options:
        if token_string.startswith(q):
            start_quote = q
            str_lit = str_lit[len(q) :]
            if token_string.endswith(q):
                end_quote = q
                str_lit = str_lit[: -len(q)]
            break
    # if start_quote in str_quote_options[:2]:
    #     return ""
    for sc in special_chars:
        str_lit = str_lit.replace(sc, special_chars[sc])
    return (
        f"{qualifier}{start_quote}<STR_LIT:{str_lit}>{end_quote}"
        if str_lit in lits['str']
        else f"{qualifier}{start_quote}<STR_LIT>{end_quote}"
    )

def py_tokenize():
    file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
    wf = open(os.path.join(os.getcwd(), f"full_corpus.txt"), 'w')
    local_corpus = []
    for path in file_paths:
        try:
            code = open(path).read()
            token_gen = tokenize(BytesIO(bytes(code, "utf8")).readline)
            out_tokens = []
            prev_eol = False
            for toknum, tokval, _, _, _ in token_gen:
                tokval = " ".join(tokval.split())
                if toknum == STRING:
                    add_token = process_string(tokval)
                    out_tokens.append(add_token)
                    prev_eol = False
                elif toknum == NUMBER:
                    if tokval in lits['num']:
                        out_tokens.append(f"<NUM_LIT:{tokval}>")
                    else:
                        out_tokens.append(f"<NUM_LIT>")
                    prev_eol = False
                elif toknum in [NEWLINE, NL]:
                    if not prev_eol:
                        out_tokens.append("<EOL>")
                        prev_eol = True
                elif toknum in [COMMENT, INDENT, ENCODING, ENDMARKER] or len(tokval) == 0:
                    continue
                else:
                    out_tokens.append(tokval)
                    prev_eol = False
            if out_tokens[0] == "<EOL>":
                out_tokens = out_tokens[1:]
            if out_tokens[-1] == "<EOL>":
                out_tokens = out_tokens[:-1]
        except Exception:
            out_tokens = []
#         local_corpus.extend((" ".join(out_tokens)).split('<EOL>'))
#         out_tokens = ["<s>"] + out_tokens + ["</s>"]
        out = " ".join(out_tokens)
        local_corpus.append(out)
        wf.write(out+"\n")
    print(f"Full Corpus is done")
    wf.close()
    return local_corpus

def read_corpus():
    corpus = py_tokenize()
    full_corpus = ''.join(corpus)
    corpus_new = []
    for code in corpus:
        corpus_new.extend(code.split('<EOL>'))
        
    return pd.DataFrame(corpus_new)

In [7]:
# train_corpus = read_corpus()

In [8]:
# train_corpus[0:20]

In [9]:
corpus = open('full_corpus.txt', encoding='utf8').readlines()

In [12]:
corpus[0]

'e = enumerate <EOL> n , * a = map ( int , open ( <NUM_LIT:0> ) . read ( ) . split ( ) ) <EOL> d = [ <NUM_LIT:0> ] <EOL> for j , ( a , i ) in e ( sorted ( ( a , i ) for i , a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ) , d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1> , j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ] <EOL> print ( max ( d ) )\n'

In [13]:
# text_corpus = []
# for code in corpus:
#     text_corpus.extend(code.split(' <EOL> '))

In [14]:
# text_corpus[0:10]

In [15]:
train_sent = corpus[0:int(0.8*len(corpus))]
test_sent = corpus[int(0.8*len(corpus)):]

In [17]:
test_sent[0:2]

['N = int ( input ( ) ) <EOL> count = <NUM_LIT:0> <EOL> for i in range ( <NUM_LIT:1> , N + <NUM_LIT:1> ) : <EOL> a = str ( i ) <EOL> b = len ( a ) <EOL> if b % <NUM_LIT:2> != <NUM_LIT:0> : <EOL> count += <NUM_LIT:1> <EOL> else : <EOL> count += <NUM_LIT:0> <EOL> print ( count )\n',
 'import math <EOL> N = int ( input ( ) ) <EOL> print ( sum ( [ int ( math . log10 ( x ) ) % <NUM_LIT:2> == <NUM_LIT:0> for x in range ( <NUM_LIT:1> , N + <NUM_LIT:1> ) ] ) )\n']

In [18]:
import pickle
with open("train_sent_byCode.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(train_sent))
with open("test_sent_byCode.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(test_sent))

In [19]:
import torch
print (torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


In [20]:
torch.cuda.set_device(0)
print(torch.cuda.current_device())

0


In [21]:
from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling, RobertaTokenizer

In [41]:
# tokenizer = OpenAIGPTTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
# tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

from custom_tokenise import BPE_token
from pathlib import Path
import os

paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.py"))
# paths[1:5]
tokenizer = BPE_token()# train the tokenizer model
tokenizer.bpe_train(paths)

# # saving the tokenized data in our specified folder 
# save_path = 'tokenized_data'
# tokenizer.save_tokenizer(save_path)

TypeError: Can't convert <tokenizers.trainers.BpeTrainer object at 0x149c00211d90> to Sequence

In [27]:
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
use_cuda = torch.cuda.is_available()
torch.cuda.empty_cache()
device = torch.device("cuda" if use_cuda else "cpu")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3], dim=0)
if use_cuda:
    model = model.cuda()

In [29]:
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))

vocabulary size: 40478, max sequence length: 512


In [32]:
inputs = tokenizer(train_sent[0], return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  243,   303,   496,    16,  1209,   746,   295,     5,  1156,   290,
           247,   240,   269,   246,   303,  5126,   276,  1786,   240,  1189,
           276,   295, 16324,   279,  2321,   271,   286,   290,   275,   239,
          1456,   276,   275,   239,  4620,   276,   275,   275,   295,     5,
          1156,   290,   248,   303,   293,   295, 16324,   279,  2321,   271,
           286,   290,   294,   295,     5,  1156,   290,   562,   266,   240,
           276,   246,   240,   249,   275,   500,   243,   276,  9840,   276,
           276,   246,   240,   249,   275,   562,   249,   240,   246,   500,
           243,   276,   246,   275,   275,   293,   271,   271,   260,   295,
         16324,   279,  2321,   271,   277,   290,   294,   275,   271,   248,
           303,   293,   248,   293,   295, 16324,   279,  2321,   271,   286,
           290,   294,   306,   246,   269, 12491,   276,   247,   260,   266,
           260,   249,   260,   295, 1

In [33]:
tokenizer.convert_ids_to_tokens([  243,   303,   496,    16,  1209,   746,   295,     5,  1156,   290,
           247,   240,   269,   246,   303,  5126,   276,  1786,   240,  1189,
           276,   295, 16324,   279,  2321,   271,   286,   290,   275,   239,
          1456,   276,   275,   239,  4620,   276,   275,   275,   295,     5,
          1156,   290,   248,   303,   293,   295, 16324,   279,  2321,   271,
           286,   290,   294,   295,     5,  1156,   290,   562,   266,   240,
           276,   246,   240,   249,   275,   500,   243,   276,  9840,   276,
           276,   246,   240,   249,   275,   562,   249,   240,   246,   500,
           243,   276,   246,   275,   275,   293,   271,   271,   260,   295,
         16324,   279,  2321,   271,   277,   290,   294,   275,   271,   248,
           303,   293,   248,   293,   295, 16324,   279,  2321,   271,   286,
           290,   294,   306,   246,   269, 12491,   276,   247,   260,   266,
           260,   249,   260,   295, 16324,   279,  2321,   271,   277,   290,
           275,   294,   306,   293,  2942,   276,   248,   293,   265,   294,
           306,   246,   269, 12491,   276,   247,   260,   266,   306,   265,
           260,   249,   260,   295, 16324,   279,  2321,   271,   277,   290,
           275,   240,   248,   293,   265,   260,   295, 16324,   279,  2321,
           271,   277,   290,   294,   306,   246,   269, 12491,   276,   249,
           260,   265,   306,   295, 16324,   279,  2321,   271,   277,   290,
           275,   275,   562,   265,   500,  5855,   276,   295, 16324,   279,
          2321,   271,   277,   290,   240,   266,   306,   295, 16324,   279,
          2321,   271,   277,   290,   275,   294,   306,   293,   248,   293,
           266,   294,   306,   246,   269, 12491,   276,   249,   260,   266,
           275,   294,   295,     5,  1156,   290,  8230,   276,  2942,   276,
           248,   275,   275])

['e</w>',
 '=</w>',
 'en',
 'u',
 'mer',
 'ate</w>',
 '<</w>',
 'e',
 'ol</w>',
 '></w>',
 'n</w>',
 ',</w>',
 '*</w>',
 'a</w>',
 '=</w>',
 'map</w>',
 '(</w>',
 'int</w>',
 ',</w>',
 'open</w>',
 '(</w>',
 '<</w>',
 'num</w>',
 '_</w>',
 'lit</w>',
 ':</w>',
 '0</w>',
 '></w>',
 ')</w>',
 '.</w>',
 'read</w>',
 '(</w>',
 ')</w>',
 '.</w>',
 'split</w>',
 '(</w>',
 ')</w>',
 ')</w>',
 '<</w>',
 'e',
 'ol</w>',
 '></w>',
 'd</w>',
 '=</w>',
 '[</w>',
 '<</w>',
 'num</w>',
 '_</w>',
 'lit</w>',
 ':</w>',
 '0</w>',
 '></w>',
 ']</w>',
 '<</w>',
 'e',
 'ol</w>',
 '></w>',
 'for</w>',
 'j</w>',
 ',</w>',
 '(</w>',
 'a</w>',
 ',</w>',
 'i</w>',
 ')</w>',
 'in</w>',
 'e</w>',
 '(</w>',
 'sorted</w>',
 '(</w>',
 '(</w>',
 'a</w>',
 ',</w>',
 'i</w>',
 ')</w>',
 'for</w>',
 'i</w>',
 ',</w>',
 'a</w>',
 'in</w>',
 'e</w>',
 '(</w>',
 'a</w>',
 ')</w>',
 ')</w>',
 '[</w>',
 ':</w>',
 ':</w>',
 '-</w>',
 '<</w>',
 'num</w>',
 '_</w>',
 'lit</w>',
 ':</w>',
 '1</w>',
 '></w>',
 ']</w>',
 ')</w>',

In [25]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [26]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train_sent.txt',
    overwrite_cache=True,
    block_size=19)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test_sent.txt',
    overwrite_cache=True,
    block_size=19)



In [27]:
!set os.environ["WANDB_DISABLED"] = "true"

In [28]:
training_args = TrainingArguments(
    output_dir = 'gpt_model', 
    overwrite_output_dir = True, 
    per_device_train_batch_size = 64, 
    per_device_eval_batch_size = 64, 
    learning_rate = 5e-4, 
    num_train_epochs = 3,
)
# Initializing the trainer class object that will do the training
# here the data collator will generate the batch of size 64 of train and test data
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)


In [None]:
# Training the model for 3 epochs
trainer.train()

***** Running training *****
  Num examples = 2425650
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 28428
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohangoli[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
500,4.4485
1000,4.3012


Saving model checkpoint to gpt_model/checkpoint-500
Configuration saved in gpt_model/checkpoint-500/config.json
Model weights saved in gpt_model/checkpoint-500/pytorch_model.bin
Saving model checkpoint to gpt_model/checkpoint-1000
Configuration saved in gpt_model/checkpoint-1000/config.json
Model weights saved in gpt_model/checkpoint-1000/pytorch_model.bin


In [31]:
import wandb
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [23]:
torch.cuda.empty_cache()

In [80]:
import gc
gc.collect()

1274

In [28]:
!nvidia-smi

Sat Nov 27 12:55:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:02:00.0 Off |                    0 |
| N/A   18C    P0    28W / 250W |  12159MiB / 12198MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [27]:
# !nvidia-smi --gpu-reset