In [1]:
# Import requirements

import os
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import dataset
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from dataset import CodeDataset
import glob


In [2]:
# Initialization

# Seed torch's global RNG so that repeated runs behave the same way.
SEED = 42
torch.manual_seed(SEED)

# The tokenizer and the language-model head are both loaded from the same
# pretrained 'gpt2' weights.
PRETRAINED_NAME = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)
model = GPT2LMHeadModel.from_pretrained(PRETRAINED_NAME)

# Hand the tokenizer and model to CodeDataset at class level.
# NOTE(review): presumably they are stored for tokenization / generation —
# confirm against the local `dataset` module.
CodeDataset.init(tokenizer, model)

In [3]:
# Recursively gather the path of every Python source file below './data'.
# The directory is nested: its sub-folders hold the target python files and
# the corresponding code snippets, so the whole tree is walked.
data_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('./data')
    for name in names
    if name.endswith('.py')
]

# `data` is the name the dataset cell consumes.
data = data_files
'Data len: {}'.format(len(data))

'Data len: 150000'

In [4]:
# Build the dataset from the collected file paths.
# NOTE: the name `dataset` is kept because a later cell calls
# `dataset.generate_code(...)`. Be aware that this assignment rebinds the
# `dataset` module imported from torch.utils.data in the imports cell.
dataset = CodeDataset(data)

# Hold out 10% of the files for evaluation. random_split draws from torch's
# global RNG, which the initialization cell seeds, so the split is repeatable
# when the cells are run in order.
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# One sample per batch.
# NOTE(review): presumably because files tokenize to different lengths and no
# padding collate_fn is supplied — confirm.
# Training batches are shuffled; the evaluation loader keeps a fixed order so
# per-batch test losses are comparable between runs.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)


In [5]:
# Peek at a single training batch to sanity-check what the dataset yields.
# The batch gets its own name so that `data` (the list of file paths that the
# dataset cell reads) is not overwritten with a tensor, which would break a
# re-run of the dataset cell.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    print(first_batch)


Token indices sequence length is longer than the specified maximum sequence length for this model (2500 > 1024). Running this sequence through the model will result in indexing errors


tensor([[37811,  2025,  2134,  ...,  6015,  3419,   198]])


In [6]:
# Optimizer over all model parameters, plus a standalone loss criterion.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# ignore_index=0 makes the criterion skip targets equal to 0.
# NOTE(review): the training and test cells in this notebook take the loss
# returned by the model itself and never call `loss_function` — confirm
# whether it is still needed.
loss_function = nn.CrossEntropyLoss(ignore_index=0)

In [7]:
# Pick the first CUDA device when one is available, otherwise the CPU.
# NOTE(review): `device` is only computed here; none of the cells shown move
# the model or the batches onto it — confirm whether that is intended.
cuda_ = "cuda:0"
if torch.cuda.is_available():
    device = torch.device(cuda_)
else:
    device = torch.device("cpu")

# Put the model in training mode; as the last expression of the cell this
# also displays the model architecture.
model.train()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [8]:

# Resume support: pick the most recently created checkpoint in ./checkpoints
# and restore the model weights from it. The training cell names checkpoints
# '<epoch>-<step>.pt'; the epoch and step are recovered from that name.


def parse_checkpoint_name(path):
    """Return ``(epoch, step)`` parsed from a '<epoch>-<step>...' file name.

    Args:
        path: Path of a candidate checkpoint file.

    Returns:
        A tuple of two ints taken from the first two '-'-separated fields of
        the file name (extension removed), or ``None`` when the name does not
        start with two integers, so that unrelated files can be skipped
        instead of crashing the resume logic.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    fields = stem.split('-')
    if len(fields) < 2:
        return None
    try:
        return int(fields[0]), int(fields[1])
    except ValueError:
        return None


# exist_ok avoids the separate existence check.
os.makedirs('./checkpoints', exist_ok=True)

file_list = glob.glob('./checkpoints/*')
print(file_list)

# Defaults for a fresh run: start at epoch 0, step 0.
checkpoint_path = ''
_epoch, _i = 0, -1
index = 0

# Only regular files whose names parse as '<epoch>-<step>' can be resumed from.
resumable = [
    path for path in file_list
    if os.path.isfile(path) and parse_checkpoint_name(path) is not None
]

if resumable:
    checkpoint_path = max(resumable, key=os.path.getctime)
    print(checkpoint_path)

    _epoch, _i = parse_checkpoint_name(checkpoint_path)

    # map_location='cpu' lets a checkpoint written on a GPU machine load on a
    # CPU-only one; load_state_dict then copies the values into the model's
    # existing parameters.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    print('Loaded Epoch {}s i {} checkpoint from {}'.format(_epoch, _i, checkpoint_path))
    # Training continues with the step after the one that was saved.
    index = _i + 1


['./checkpoints/0-0.pt']
./checkpoints/0-0.pt
Loaded Epoch 0s i 0 checkpoint from ./checkpoints/0-0.pt


In [9]:
# Fine-tune the model on the training split, one file per step, resuming from
# the epoch / step recovered by the checkpoint cell (`_epoch`, `index`).
NUM_EPOCHS = 10
# Sequences are truncated to this many tokens; the model's position embedding
# has 1024 entries (see the printed architecture), so longer inputs cannot be
# fed to it.
MAX_TOKENS = 1000
# A checkpoint is written every CHECKPOINT_EVERY steps. Each one goes to a new
# file named '<epoch>-<step>.pt', so files accumulate in ./checkpoints.
CHECKPOINT_EVERY = 10

steps_per_epoch = len(train_dataloader)

for epoch in range(_epoch, NUM_EPOCHS):
    # The resume offset applies only to the epoch that was interrupted; every
    # later epoch starts again at step 0.
    first_step = index if epoch == _epoch else 0

    # Pairing with range() bounds the step counter by the epoch length, so the
    # resumed epoch runs only its remaining steps. Batches are shuffled, so
    # the remaining steps draw a fresh random subset of the data.
    for i, batch in zip(range(first_step, steps_per_epoch), train_dataloader):
        batch = batch[:, :MAX_TOKENS]

        # With labels == inputs the model predicts each next token; a sequence
        # with fewer than two tokens has no next-token target, so skip it.
        if batch.size(1) < 2:
            continue

        optimizer.zero_grad()
        output = model(batch, labels=batch)
        # The first element of the model output is the language-modeling loss.
        loss = output[0]

        loss.backward()
        optimizer.step()

        # Save the model every CHECKPOINT_EVERY batches.
        if i % CHECKPOINT_EVERY == 0:
            torch.save(model.state_dict(), './checkpoints/{}-{}.pt'.format(epoch, i))

        print('Epoch: {}/{}'.format(epoch, NUM_EPOCHS),
                'Step: {}/{}'.format(i, steps_per_epoch),
                'Loss: {}'.format(loss.item()))


Epoch: 0/10 Step: 1/135000 Loss: 2.7234344482421875
Epoch: 0/10 Step: 2/135000 Loss: 2.135730266571045
Epoch: 0/10 Step: 3/135000 Loss: 1.9999090433120728
Epoch: 0/10 Step: 4/135000 Loss: 3.5905206203460693
Epoch: 0/10 Step: 5/135000 Loss: 2.7974772453308105
Epoch: 0/10 Step: 6/135000 Loss: 2.3403780460357666
Epoch: 0/10 Step: 7/135000 Loss: 1.8329054117202759
Epoch: 0/10 Step: 8/135000 Loss: 4.88843297958374
Epoch: 0/10 Step: 9/135000 Loss: 2.6794021129608154
Epoch: 0/10 Step: 10/135000 Loss: 2.762396812438965
Epoch: 0/10 Step: 11/135000 Loss: 1.750596284866333
Epoch: 0/10 Step: 12/135000 Loss: 1.7422051429748535
Epoch: 0/10 Step: 13/135000 Loss: 1.3562179803848267
Epoch: 0/10 Step: 14/135000 Loss: 4.167167663574219
Epoch: 0/10 Step: 15/135000 Loss: 1.390671968460083
Epoch: 0/10 Step: 16/135000 Loss: 1.8474671840667725
Epoch: 0/10 Step: 17/135000 Loss: 3.7322046756744385
Epoch: 0/10 Step: 18/135000 Loss: 2.188779592514038
Epoch: 0/10 Step: 19/135000 Loss: 1.7991576194763184
Epoch: 0/1

KeyboardInterrupt: 

In [None]:
# Switch the model to evaluation mode before testing, so the Dropout layers
# listed in the architecture printout are inactive.
model.eval()


Error: Kernel is dead

In [None]:
# Test the model: report the language-modeling loss on each held-out file and
# the mean over all of them.
# `epoch` is the last epoch reached by the training cell; it is used only to
# label the output.
test_losses = []

# Make sure dropout is disabled even if the previous cell was not run.
model.eval()

# No gradients are needed for evaluation; no_grad avoids building autograd
# graphs for every batch.
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        batch = batch[:, :1000]

        # A sequence with fewer than two tokens has no next-token target.
        if batch.size(1) < 2:
            continue

        output = model(batch, labels=batch)
        loss = output[0]
        test_losses.append(loss.item())
        print('Epoch: {}/{}, Test Loss: {}'.format(epoch, 10, loss.item()))

if test_losses:
    print('Mean Test Loss: {}'.format(sum(test_losses) / len(test_losses)))



In [None]:
# Persist the fine-tuned weights. exist_ok=True creates './out' when missing
# and is a no-op when it already exists, without a separate existence check.
os.makedirs('./out', exist_ok=True)
torch.save(model.state_dict(), './out/gpt2_code_generator.pt')


In [None]:
# Generate code starting from the prompt "import torch\n".
# `dataset` here is the CodeDataset instance created in the dataset cell (not
# the torch.utils.data module imported under the same name).
# NOTE(review): presumably `length` is the number of tokens to generate and
# `temperature` / `top_k` control sampling — confirm against CodeDataset.generate_code.
dataset.generate_code("import torch\n", length=100, temperature=0.7, top_k=50)
