In [4]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [5]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
with open("train_sent.txt", "r") as fp: 
    train_sent = fp.read().splitlines() 
with open("test_sent.txt", "r") as fp:
    test_sent = fp.read().splitlines() 
with open("full_corpus.txt", "r") as fp:
    full_corpus = fp.read().splitlines() 

In [7]:
%timeit
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel,GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('code-tokenizer-scratch/',local_files_only=True)
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = TFGPT2LMHeadModel.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [8]:
import torch

In [9]:
%timeit
use_cuda = torch.cuda.is_available()
torch.cuda.empty_cache()
device = torch.device("cuda" if use_cuda else "cpu")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3], dim=0)
if use_cuda:
    model = model.cuda()

# device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],1: [8, 9, 10, 11, 12, 13, 14, 15], 2: [16, 17, 18, 19, 20, 21, 22, 23], 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
# model.parallelize(device_map)

In [10]:
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))

vocabulary size: 25000, max sequence length: 1000000000000000019884624838656


In [11]:
inputs = tokenizer(train_sent[0], return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 87, 306, 677]]), 'attention_mask': tensor([[1, 1, 1]])}


In [12]:
tokenizer(train_sent[0])

{'input_ids': [87, 306, 677], 'attention_mask': [1, 1, 1]}

In [14]:
tokenizer.convert_ids_to_tokens([87, 306, 677])

['e', 'Ġ=', 'Ġenumerate']

In [15]:
tokenizer.decode([87, 306, 677])

'e = enumerate'

In [16]:
from transformers import TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [18]:
%timeit
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train_sent.txt',
    overwrite_cache=True,
    block_size=24)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test_sent.txt',
    overwrite_cache=True,
    block_size=24)



In [19]:
%timeit
!set os.environ["WANDB_DISABLED"] = "true"

In [20]:
%timeit
training_args = TrainingArguments(
    output_dir = 'dec4_gpt', 
    overwrite_output_dir = True, 
#     per_device_train_batch_size = 64, 
#     per_device_eval_batch_size = 64, 
    learning_rate = 5e-4, 
    save_steps=1000,
    logging_steps=3000,
    save_total_limit=2,
    num_train_epochs = 1,
)

In [21]:
%timeit
# Initializing the trainer class object that will do the training
# here the data collator will generate the batch of size 64 of train and test data
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

In [22]:
!nvidia-smi

Tue Dec  7 00:24:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   27C    P0    35W / 250W |   1933MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
%timeit
trainer.train()

***** Running training *****
  Num examples = 1592077
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 199010
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohangoli[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss
3000,1.6935
6000,1.434
9000,1.3739
12000,1.3411
15000,1.3103
18000,1.2954
21000,1.2762
24000,1.263
27000,1.2461
30000,1.2404


Saving model checkpoint to dec4_gpt/checkpoint-1000
Configuration saved in dec4_gpt/checkpoint-1000/config.json
Model weights saved in dec4_gpt/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [dec4_gpt/checkpoint-133000] due to args.save_total_limit
Saving model checkpoint to dec4_gpt/checkpoint-2000
Configuration saved in dec4_gpt/checkpoint-2000/config.json
Model weights saved in dec4_gpt/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [dec4_gpt/checkpoint-134000] due to args.save_total_limit
Saving model checkpoint to dec4_gpt/checkpoint-3000
Configuration saved in dec4_gpt/checkpoint-3000/config.json
Model weights saved in dec4_gpt/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [dec4_gpt/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to dec4_gpt/checkpoint-4000
Configuration saved in dec4_gpt/checkpoint-4000/config.json
Model weights saved in dec4_gpt/checkpoint-4000/pytorch_model.bin
Deleting older checkpoint [dec4_gpt/ch

TrainOutput(global_step=199010, training_loss=1.1492351851135778, metrics={'train_runtime': 10021.905, 'train_samples_per_second': 158.86, 'train_steps_per_second': 19.858, 'total_flos': 1.9499860988928e+16, 'train_loss': 1.1492351851135778, 'epoch': 1.0})

In [24]:
trainer.save_model('./saved_dec4_gpt_c')

Saving model checkpoint to ./saved_dec4_gpt_c
Configuration saved in ./saved_dec4_gpt_c/config.json
Model weights saved in ./saved_dec4_gpt_c/pytorch_model.bin


In [26]:
# Evaluating on Test data
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 396534
  Batch size = 8


{'eval_loss': 1.239752173423767,
 'eval_runtime': 477.3567,
 'eval_samples_per_second': 830.687,
 'eval_steps_per_second': 103.836,
 'epoch': 1.0}

In [27]:
generator = pipeline('text-generation', tokenizer=tokenizer, model='saved_dec4_gpt_c')

loading configuration file saved_dec4_gpt_c/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading configuration file saved_dec4_gpt_c/config.json
Mo

In [28]:
print(generator('print', max_length=5)[0]['generated_text'])
print(generator('print', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('print' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  next_indices = next_tokens // vocab_size
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


print ( solve ( )
print ( ans )
print ( "<STR_


In [29]:
print(generator('for i in ', max_length=5)[0]['generated_text'])
print(generator('for i in ', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('for i in ' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in  :


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in  :
for i in  :


In [30]:
print(generator('import ', max_length=5)[0]['generated_text'])
print(generator('import ', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('import ' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import import sys


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import import sys
import fp


In [37]:
torch.cuda.empty_cache()

In [33]:
!nvidia-smi

Tue Dec  7 12:19:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   27C    P0    34W / 250W |   4715MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [55]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel,GPT2LMHeadModel
from transformers import TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling
import torch

# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('code-tokenizer-scratch/',local_files_only=True)
generator = pipeline('text-generation', tokenizer=tokenizer, model='saved_dec4_gpt_c')

print(generator('print', max_length=5)[0]['generated_text'])
print(generator('print', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('print' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

print(generator('for i in ', max_length=5)[0]['generated_text'])
print(generator('for i in ', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('for i in ' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

print(generator('import ', max_length=5)[0]['generated_text'])
print(generator('import ', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('import ' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])

Didn't find file code-tokenizer-scratch/added_tokens.json. We won't load it.
loading file code-tokenizer-scratch/vocab.json
loading file code-tokenizer-scratch/merges.txt
loading file None
loading file code-tokenizer-scratch/special_tokens_map.json
loading file code-tokenizer-scratch/tokenizer_config.json
loading file code-tokenizer-scratch/tokenizer.json
loading configuration file saved_dec4_gpt_c/config.json
Model config GPT2Config {
  "_name_or_path": "saved_dec4_gpt_c",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  

print ( '<STR_
print ( "<STR_


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


print ( "<STR_
for i in cle


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in ations


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


for i in ils
import ���


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import cessimport
import �クク


In [72]:
from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
import torch
from torch import nn

tokenizer = AutoTokenizer.from_pretrained('code-tokenizer-scratch/',local_files_only=True)
model = AutoModelForCausalLM.from_pretrained('saved_dec4_gpt_c', local_files_only=True)

sequence = f"for i range"

inputs = tokenizer(sequence, return_tensors="pt")
input_ids = inputs["input_ids"]

# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]

# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)

# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)

generated = torch.cat([input_ids, next_token], dim=-1)

resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)


Didn't find file code-tokenizer-scratch/added_tokens.json. We won't load it.
loading file code-tokenizer-scratch/vocab.json
loading file code-tokenizer-scratch/merges.txt
loading file code-tokenizer-scratch/tokenizer.json
loading file None
loading file code-tokenizer-scratch/special_tokens_map.json
loading file code-tokenizer-scratch/tokenizer_config.json
loading configuration file saved_dec4_gpt_c/config.json
Model config GPT2Config {
  "_name_or_path": "saved_dec4_gpt_c",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  

for i range_
