In [1]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [3]:
# file_name = "Project_CodeNet_Python800.tar.gz"
# data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# # Download tar archive to local disk
# with open(file_name, "wb") as f:
#     f.write(requests.get(data_url).content)
    
# # Extract contents of archive to local disk
# if os.path.exists("data"):
#     shutil.rmtree("data")    
# with tarfile.open(file_name) as tfile:
#     tfile.extractall()

In [4]:
# file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
# len(file_paths)

In [5]:
# file_paths[0:10]

In [6]:
lits = json.load(open("literals.json"))

def process_string(token, special_chars={" ": "U+0020", ",": "U+002C"}):
    str_quote_options = ["'''", '"""', "'", '"']
    start_quote = ""
    end_quote = ""
    qualifier_regex = r"^[a-z]+"
    qualifier_match = re.search(qualifier_regex, token)
    # string qualifiers like 'r' for regex, 'f' for formatted string, 'b' for bytes, 'u' for unicode, etc (or combination of them)
    qualifier = "" if not qualifier_match else qualifier_match[0]
    # token string without qualifiers
    token_string = re.sub(qualifier_regex, "", token)
    # string literal without quotes
    str_lit = token_string
    for q in str_quote_options:
        if token_string.startswith(q):
            start_quote = q
            str_lit = str_lit[len(q) :]
            if token_string.endswith(q):
                end_quote = q
                str_lit = str_lit[: -len(q)]
            break
    # if start_quote in str_quote_options[:2]:
    #     return ""
    for sc in special_chars:
        str_lit = str_lit.replace(sc, special_chars[sc])
    return (
        f"{qualifier}{start_quote}<STR_LIT:{str_lit}>{end_quote}"
        if str_lit in lits['str']
        else f"{qualifier}{start_quote}<STR_LIT>{end_quote}"
    )

def py_tokenize():
    file_paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.*"))
    wf = open(os.path.join(os.getcwd(), f"full_corpus.txt"), 'w')
    local_corpus = []
    for path in file_paths:
        try:
            code = open(path).read()
            token_gen = tokenize(BytesIO(bytes(code, "utf8")).readline)
            out_tokens = []
            prev_eol = False
            for toknum, tokval, _, _, _ in token_gen:
                tokval = " ".join(tokval.split())
                if toknum == STRING:
                    add_token = process_string(tokval)
                    out_tokens.append(add_token)
                    prev_eol = False
                elif toknum == NUMBER:
                    if tokval in lits['num']:
                        out_tokens.append(f"<NUM_LIT:{tokval}>")
                    else:
                        out_tokens.append(f"<NUM_LIT>")
                    prev_eol = False
                elif toknum in [NEWLINE, NL]:
                    if not prev_eol:
                        out_tokens.append("<EOL>")
                        prev_eol = True
                elif toknum in [COMMENT, INDENT, ENCODING, ENDMARKER] or len(tokval) == 0:
                    continue
                else:
                    out_tokens.append(tokval)
                    prev_eol = False
            if out_tokens[0] == "<EOL>":
                out_tokens = out_tokens[1:]
            if out_tokens[-1] == "<EOL>":
                out_tokens = out_tokens[:-1]
        except Exception:
            out_tokens = []
#         local_corpus.extend((" ".join(out_tokens)).split('<EOL>'))
#         out_tokens = ["<s>"] + out_tokens + ["</s>"]
        out = " ".join(out_tokens)
        local_corpus.append(out)
        wf.write(out+"\n")
    print(f"Full Corpus is done")
    wf.close()
    return local_corpus

def read_corpus():
    corpus = py_tokenize()
    full_corpus = ''.join(corpus)
    corpus_new = []
    for code in corpus:
        corpus_new.extend(code.split('<EOL>'))
        
    return pd.DataFrame(corpus_new)

In [7]:
# train_corpus = read_corpus()

In [8]:
# train_corpus[0:20]

In [9]:
corpus = open('full_corpus.txt', encoding='utf8').readlines()

In [10]:
corpus[0]

'e = enumerate <EOL> n , * a = map ( int , open ( <NUM_LIT:0> ) . read ( ) . split ( ) ) <EOL> d = [ <NUM_LIT:0> ] <EOL> for j , ( a , i ) in e ( sorted ( ( a , i ) for i , a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ) , d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1> , j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ] <EOL> print ( max ( d ) )\n'

In [11]:
# text_corpus = []
# for code in corpus:
#     text_corpus.extend(code.split(' <EOL> '))

In [12]:
# text_corpus[0:10]

In [13]:
train_sent = corpus[0:int(0.8*len(corpus))]
test_sent = corpus[int(0.8*len(corpus)):]

In [14]:
test_sent[0:2]

['N = int ( input ( ) ) <EOL> count = <NUM_LIT:0> <EOL> for i in range ( <NUM_LIT:1> , N + <NUM_LIT:1> ) : <EOL> a = str ( i ) <EOL> b = len ( a ) <EOL> if b % <NUM_LIT:2> != <NUM_LIT:0> : <EOL> count += <NUM_LIT:1> <EOL> else : <EOL> count += <NUM_LIT:0> <EOL> print ( count )\n',
 'import math <EOL> N = int ( input ( ) ) <EOL> print ( sum ( [ int ( math . log10 ( x ) ) % <NUM_LIT:2> == <NUM_LIT:0> for x in range ( <NUM_LIT:1> , N + <NUM_LIT:1> ) ] ) )\n']

In [15]:
import pickle
with open("train_sent_byCode.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(train_sent))
with open("test_sent_byCode.txt", "w") as fp:   #Pickling
    fp.write('\n'.join(test_sent))

In [16]:
import torch
print (torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


In [17]:
torch.cuda.set_device(0)
print(torch.cuda.current_device())

0


In [18]:
from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling, RobertaTokenizer, GPT2Tokenizer, GPT2LMHeadModel

In [22]:
# tokenizer = OpenAIGPTTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
# tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained('/scratch1/rgoli/gpt_model_nov28/checkpoint-6000/',local_files_only=True)

# from custom_tokenise import BPE_token
# from pathlib import Path
# import os

# paths = glob.glob(os.path.join(os.getcwd(),"Project_CodeNet_Python800/**/*.py"))
# # paths[1:5]
# tokenizer = BPE_token()# train the tokenizer model
# tokenizer.bpe_train(paths)

# # # saving the tokenized data in our specified folder 
# # save_path = 'tokenized_data'
# # tokenizer.save_tokenizer(save_path)

In [24]:
# model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# model = GPT2LMHeadModel.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('/scratch1/rgoli/gpt_model_nov28/checkpoint-6000/',local_files_only=True)

Some weights of the model checkpoint at /scratch1/rgoli/gpt_model_nov28/checkpoint-6000/ were not used when initializing GPT2LMHeadModel: ['module.transformer.h.1.ln_2.weight', 'module.transformer.h.11.ln_1.bias', 'module.transformer.h.1.attn.c_proj.weight', 'module.transformer.h.4.attn.c_attn.weight', 'module.transformer.h.6.mlp.c_proj.bias', 'module.transformer.ln_f.weight', 'module.transformer.h.8.attn.c_attn.weight', 'module.transformer.h.10.ln_1.bias', 'module.transformer.h.4.attn.bias', 'module.transformer.h.4.attn.c_attn.bias', 'module.transformer.h.10.attn.c_attn.weight', 'module.transformer.h.9.ln_2.weight', 'module.transformer.h.7.attn.c_attn.weight', 'module.transformer.h.11.mlp.c_fc.weight', 'module.transformer.h.2.ln_1.bias', 'module.transformer.h.3.attn.c_attn.weight', 'module.transformer.h.8.ln_1.weight', 'module.transformer.h.6.attn.c_attn.bias', 'module.transformer.h.6.mlp.c_fc.bias', 'module.transformer.h.9.mlp.c_fc.bias', 'module.transformer.h.7.mlp.c_proj.bias', 'mo

In [25]:
use_cuda = torch.cuda.is_available()
torch.cuda.empty_cache()
device = torch.device("cuda" if use_cuda else "cpu")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3], dim=0)
if use_cuda:
    model = model.cuda()

In [26]:
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))

vocabulary size: 50257, max sequence length: 1024


In [27]:
inputs = tokenizer(train_sent[0], return_tensors="pt")
print(inputs)

{'input_ids': tensor([[   68,   796, 27056,   378,  1279,    36,  3535,    29,   299,   837,
          1635,   257,   796,  3975,   357,   493,   837,  1280,   357,  1279,
         41359,    62,    43,  2043,    25,    15,    29,  1267,   764,  1100,
           357,  1267,   764,  6626,   357,  1267,  1267,  1279,    36,  3535,
            29,   288,   796,   685,  1279, 41359,    62,    43,  2043,    25,
            15,    29,  2361,  1279,    36,  3535,    29,   329,   474,   837,
           357,   257,   837,  1312,  1267,   287,   304,   357, 23243,   357,
           357,   257,   837,  1312,  1267,   329,  1312,   837,   257,   287,
           304,   357,   257,  1267,  1267,   685,  1058,  1058,   532,  1279,
         41359,    62,    43,  2043,    25,    16,    29,  2361,  1267,  1058,
           288,   796,   685,   288,   685,  1279, 41359,    62,    43,  2043,
            25,    15,    29,  2361,  1343,   257,  1635,  2352,   357,   299,
           532,   474,   532,  1312,  

In [28]:
print(tokenizer.convert_ids_to_tokens([   68,   796, 27056,   378,  1279,    36,  3535,    29,   299,   837,
          1635,   257,   796,  3975,   357,   493,   837,  1280,   357,  1279,
         41359,    62,    43,  2043,    25,    15,    29,  1267,   764,  1100,
           357,  1267,   764,  6626,   357,  1267,  1267,  1279,    36,  3535,
            29,   288,   796,   685,  1279, 41359,    62,    43,  2043,    25,
            15,    29,  2361,  1279,    36,  3535,    29,   329,   474,   837,
           357,   257,   837,  1312,  1267,   287,   304,   357, 23243,   357,
           357,   257,   837,  1312,  1267,   329,  1312,   837,   257,   287,
           304,   357,   257,  1267,  1267,   685,  1058,  1058,   532,  1279,
         41359,    62,    43,  2043,    25,    16,    29,  2361,  1267,  1058,
           288,   796,   685,   288,   685,  1279, 41359,    62,    43,  2043,
            25,    15,    29,  2361,  1343,   257,  1635,  2352,   357,   299,
           532,   474,   532,  1312,   532,  1279, 41359,    62,    43,  2043,
            25,    16,    29,  1267,  2361,  1343,   685,  3509,   357,   288,
           685,   479,  2361,  1343,   257,  1635,  2352,   357,   299,   532,
           474,  1343,   479,   532,  1312,   532,  1279, 41359,    62,    43,
          2043,    25,    16,    29,  1267,   837,   288,   685,   479,   532,
          1279, 41359,    62,    43,  2043,    25,    16,    29,  2361,  1343,
           257,  1635,  2352,   357,  1312,   532,   479,  1343,  1279, 41359,
            62,    43,  2043,    25,    16,    29,  1267,  1267,   329,   479,
           287,  2837,   357,  1279, 41359,    62,    43,  2043,    25,    16,
            29,   837,   474,  1343,  1279, 41359,    62,    43,  2043,    25,
            16,    29,  1267,  2361,  1343,   685,   288,   685,   474,  2361,
          1343,   257,  1635,  2352,   357,  1312,   532,   474,  1267,  2361,
          1279,    36,  3535,    29,  3601,   357,  3509,   357,   288,  1267,
          1267,   198]))

['e', 'Ġ=', 'Ġenumer', 'ate', 'Ġ<', 'E', 'OL', '>', 'Ġn', 'Ġ,', 'Ġ*', 'Ġa', 'Ġ=', 'Ġmap', 'Ġ(', 'Ġint', 'Ġ,', 'Ġopen', 'Ġ(', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '0', '>', 'Ġ)', 'Ġ.', 'Ġread', 'Ġ(', 'Ġ)', 'Ġ.', 'Ġsplit', 'Ġ(', 'Ġ)', 'Ġ)', 'Ġ<', 'E', 'OL', '>', 'Ġd', 'Ġ=', 'Ġ[', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '0', '>', 'Ġ]', 'Ġ<', 'E', 'OL', '>', 'Ġfor', 'Ġj', 'Ġ,', 'Ġ(', 'Ġa', 'Ġ,', 'Ġi', 'Ġ)', 'Ġin', 'Ġe', 'Ġ(', 'Ġsorted', 'Ġ(', 'Ġ(', 'Ġa', 'Ġ,', 'Ġi', 'Ġ)', 'Ġfor', 'Ġi', 'Ġ,', 'Ġa', 'Ġin', 'Ġe', 'Ġ(', 'Ġa', 'Ġ)', 'Ġ)', 'Ġ[', 'Ġ:', 'Ġ:', 'Ġ-', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '1', '>', 'Ġ]', 'Ġ)', 'Ġ:', 'Ġd', 'Ġ=', 'Ġ[', 'Ġd', 'Ġ[', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '0', '>', 'Ġ]', 'Ġ+', 'Ġa', 'Ġ*', 'Ġabs', 'Ġ(', 'Ġn', 'Ġ-', 'Ġj', 'Ġ-', 'Ġi', 'Ġ-', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '1', '>', 'Ġ)', 'Ġ]', 'Ġ+', 'Ġ[', 'Ġmax', 'Ġ(', 'Ġd', 'Ġ[', 'Ġk', 'Ġ]', 'Ġ+', 'Ġa', 'Ġ*', 'Ġabs', 'Ġ(', 'Ġn', 'Ġ-', 'Ġj', 'Ġ+', 'Ġk', 'Ġ-', 'Ġi', 'Ġ-', 'Ġ<', 'NUM', '_', 'L', 'IT', ':', '1', '>', 'Ġ)', 'Ġ,

In [29]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [30]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train_sent.txt',
    overwrite_cache=True,
    block_size=128)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test_sent.txt',
    overwrite_cache=True,
    block_size=128)



In [31]:
!set os.environ["WANDB_DISABLED"] = "true"

In [32]:
training_args = TrainingArguments(
    output_dir = '/scratch1/rgoli/gpt_model_nov28', 
    overwrite_output_dir = True, 
    per_device_train_batch_size = 24, 
    per_device_eval_batch_size = 24, 
    learning_rate = 5e-4, 
    num_train_epochs = 3,
    save_steps=3000,
    logging_steps=3000,
    save_total_limit=2
)
# Initializing the trainer class object that will do the training
# here the data collator will generate the batch of size 64 of train and test data
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)


In [33]:
# Training the model for 3 epochs
trainer.train()

***** Running training *****
  Num examples = 322156
  Num Epochs = 3
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 10068
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohangoli[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
3000,0.9912
6000,0.6364
9000,0.551


Saving model checkpoint to /scratch1/rgoli/gpt_model_nov28/checkpoint-3000
Configuration saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-3000/config.json
Model weights saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model_nov28/checkpoint-6000
Configuration saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-6000/config.json
Model weights saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to /scratch1/rgoli/gpt_model_nov28/checkpoint-9000
Configuration saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-9000/config.json
Model weights saved in /scratch1/rgoli/gpt_model_nov28/checkpoint-9000/pytorch_model.bin
Deleting older checkpoint [/scratch1/rgoli/gpt_model_nov28/checkpoint-3000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10068, training_loss=0.7045274263911651, metrics={'train_runtime': 3577.9642, 'train_samples_per_second': 270.117, 'train_steps_per_second': 2.814, 'total_flos': 0.0, 'train_loss': 0.7045274263911651, 'epoch': 3.0})

In [34]:
trainer.save_model('./saved_nov28')

Saving model checkpoint to ./saved_nov28
Configuration saved in ./saved_nov28/config.json
Model weights saved in ./saved_nov28/pytorch_model.bin


In [35]:
# Evaluating on Test data
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 80189
  Batch size = 96


{'eval_loss': 0.758241593837738,
 'eval_runtime': 125.4956,
 'eval_samples_per_second': 638.979,
 'eval_steps_per_second': 6.662,
 'epoch': 3.0}

In [39]:
generator = pipeline('text-generation', tokenizer=tokenizer, model='saved_nov28')

loading configuration file saved_nov28/config.json
Model config GPT2Config {
  "_name_or_path": "/scratch1/rgoli/gpt_model_nov28/checkpoint-6000/",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading configurati

In [40]:
'''
generating  next word in 3 possible ways
1. Greedy Search : chooses the best possible next word based on highest probability from 1 hypothesis
2. Beam Search : chooses the high probability next word from n hypothesis
3. Random Sampling : chooses random next word from possible hypothesis , however as the temperature is set high , it will
   ignore low probability words.
'''

print(generator('print', max_length=5)[0]['generated_text'])
print(generator('print', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('print' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


printESH slapping Carolhor
printhorFolderfarmfarm
print jammed Meanwhilehor cling


In [41]:
print(generator('for i in', max_length=5)[0]['generated_text'])
print(generator('for i in', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('for i in' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in entail negligible


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i initable cling
for i in After affiliate


In [60]:
import json

extra_tokens=json.loads(open('literals.json').read())['str']

extra_tokens.extend([
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
    "<EOL>",
    "<NUM_LIT>",
    "<STR_LIT>"
])

In [91]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./").glob("**/*byCode.txt")]

paths[0:2]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=2000, min_frequency=10, special_tokens=extra_tokens)

# Save files to disk
!mkdir -p code_tokenizer
tokenizer.save_model("code_tokenizer", "nov28")

['code_tokenizer/nov28-vocab.json', 'code_tokenizer/nov28-merges.txt']

In [99]:
# from custom_tokenise import BPE_token
# import os
# # the folder 'text' contains all the files
# paths = [str(x) for x in Path("./").glob("**/*byCode.txt")]
# tokenizer = BPE_token()
# # train the tokenizer model
# tokenizer.bpe_train(paths)
# # saving the tokenized data in our specified folder 
# # save_path = 'code_tokenizer'
# # # tokenizer.save_tokenizer(save_path)

# # tokenizer.save_tokenizer("code_tokenizer")
# # tokenizer = GPT2Tokenizer.from_pretrained("code_tokenizer")

In [100]:
# input_ids

In [108]:
from tokenizers.trainers import BpeTrainer
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
# We initialize our trainer, giving him the details about the vocabulary we want to generate
trainer = BpeTrainer(vocab_size=2000, show_progress=True, initial_alphabet=ByteLevel.alphabet(),min_frequency=10, special_tokens=extra_tokens)
tokenizer.train(trainer=trainer, files=paths)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

!mkdir -p code_tokenizer_bpe
tokenizer.model.save("code_tokenizer_bpe", None)
# tokenizer.save_vocabulary()
# tokenizer.save_pretrained()

Trained vocab size: 2000


['code_tokenizer_bpe/vocab.json', 'code_tokenizer_bpe/merges.txt']

In [104]:
# from custom_tokenise import BPE_token
# from pathlib import Path
# import os
# # the folder 'text' contains all the files
# paths = [str(x) for x in Path("./").glob("**/*byCode.txt")]
# tokenizer = BPE_token()
# # train the tokenizer model
# tokenizer.bpe_train(paths)
# # saving the tokenized data in our specified folder 
# save_path = 'tokenized_data'
# tokenizer.save_tokenizer(save_path)

In [75]:
# # tokenizer = Tokenizer.from_file("./code_tokenizer-vocab.json")
# tokenizer = ByteLevelBPETokenizer(
#     "code_tokenizer/nov28-vocab.json",
#     "code_tokenizer/nov28-merges.txt",
# )

In [109]:
tokenizer = GPT2Tokenizer.from_pretrained("code_tokenizer_bpe")

Didn't find file code_tokenizer_bpe/added_tokens.json. We won't load it.
Didn't find file code_tokenizer_bpe/special_tokens_map.json. We won't load it.
Didn't find file code_tokenizer_bpe/tokenizer_config.json. We won't load it.
Didn't find file code_tokenizer_bpe/tokenizer.json. We won't load it.
loading file code_tokenizer_bpe/vocab.json
loading file code_tokenizer_bpe/merges.txt
loading file None
loading file None
loading file None
loading file None
file code_tokenizer_bpe/config.json not found


In [112]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

Assigning </s> to the eos_token key of the tokenizer
Assigning <s> to the bos_token key of the tokenizer
Assigning <unk> to the unk_token key of the tokenizer
Assigning <pad> to the pad_token key of the tokenizer
Assigning <mask> to the mask_token key of the tokenizer


In [113]:
single_string = ''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

In [114]:
string_tokenized[0:3]

[87, 397, 767]

In [115]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [116]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [118]:
num_epoch = 10
history = model.fit(dataset, epochs=num_epoch)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


ResourceExhaustedError:  OOM when allocating tensor with shape[12,12,99,99] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node tfgp_t2lm_head_model/transformer/h_._5/attn/Softmax (defined at /home/rgoli/software/venv/tf1_gpu/lib/python3.7/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py:119) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_19220]

Errors may have originated from an input operation.
Input Source operations connected to node tfgp_t2lm_head_model/transformer/h_._5/attn/Softmax:
 tfgp_t2lm_head_model/transformer/h_._5/attn/sub_2 (defined at /home/rgoli/software/venv/tf1_gpu/lib/python3.7/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py:112)

Function call stack:
train_function
