In [None]:
"""
Loads a pretrained GPT-2 model and fine-tunes it to generate Python code.
1. Download and load a pretrained version of GPT-2 small.
2. Load data from the './data' directory.
3. Clean the data and use the GPT-2 tokenizer to prepare the data for training.
4. Split the data into training and validation sets.
5. Train the model in a self-supervised fashion on the following task:
   - Feed the model a portion of the code for an example and have it predict the next token.
6. Evaluate the model on the next token generation task, and report the metrics.
"""

import os
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import dataset
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from dataset import CodeDataset
import glob


In [None]:
# Initialization

# Seed torch's global RNG so that repeated runs draw the same random numbers.
SEED = 42
PRETRAINED_NAME = 'gpt2'
torch.manual_seed(SEED)

# Pretrained GPT-2: the tokenizer and the language-model head share one checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)
model = GPT2LMHeadModel.from_pretrained(PRETRAINED_NAME)

# Hand the tokenizer and model to CodeDataset through its `init` hook.
# NOTE(review): what `init` does with them is defined in the local `dataset` module — confirm there.
CodeDataset.init(tokenizer, model)

In [None]:
# Collect every Python source file under './data'.
# The directory is walked recursively because the target files sit in nested sub-folders.
# os.walk yields entries in a filesystem-dependent order, so the paths are sorted:
# the seeded train/validation split further down is only reproducible when the
# dataset is built from the same file order on every run and every machine.
data_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk('./data')
    for name in names
    if name.endswith('.py')
)

data = data_files
'Data len: {}'.format(len(data))

In [None]:
# Build the dataset from the collected file paths.
dataset = CodeDataset(data)

# 90% of the examples are used for training; the remainder is held out.
total_examples = len(dataset)
train_size = int(0.9 * total_examples)
test_size = total_examples - train_size
split_lengths = [train_size, test_size]
train_dataset, test_dataset = random_split(dataset, split_lengths)

# Both loaders yield one example per batch, in shuffled order.
loader_options = dict(batch_size=1, shuffle=True)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)


In [None]:
# Sanity check: print a single training batch and stop.
# The loop variable is deliberately not named `data`: that name holds the list of
# source-file paths used to build the dataset, and rebinding it here would make a
# later re-run of the dataset cell construct CodeDataset from a batch instead.
for sample_batch in train_dataloader:
    print(sample_batch)
    break


In [None]:
# Optimizer and loss setup.
# Adam over every model parameter with a fixed learning rate.
LEARNING_RATE = 1e-4
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Cross-entropy that skips targets equal to 0 (ignore_index=0).
# NOTE(review): the training loop in this notebook takes its loss from the model's
# own output (labels=batch), so this criterion looks unused — confirm before relying on it.
loss_function = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
# NOTE(review): in the cells visible here neither the model nor the batches are
# moved to `device` — confirm whether that happens elsewhere.
cuda_ = "cuda:0"
if torch.cuda.is_available():
    device = torch.device(cuda_)
else:
    device = torch.device("cpu")

# Switch the model to training mode (the call returns the model, which the cell displays).
model.train()


In [None]:

def parse_checkpoint_name(path):
    """Extract ``(epoch, step)`` from a checkpoint path named ``<epoch>-<step>.<ext>``.

    Args:
        path: Path of a file inside the checkpoint directory.

    Returns:
        A tuple ``(epoch, step)`` of ints, or ``None`` when the file name does
        not begin with two dash-separated integers.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    parts = stem.split('-')
    try:
        return int(parts[0]), int(parts[1])
    except (IndexError, ValueError):
        return None


# Resume from the most advanced checkpoint in ./checkpoints, if there is one.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs('./checkpoints', exist_ok=True)

file_list = glob.glob('./checkpoints/*')
print(file_list)

# Defaults for a fresh run: start at epoch 0, step 0, with nothing loaded.
checkpoint_path = ''
_epoch, _i = 0, -1
index = 0

# Rank checkpoints by the (epoch, step) encoded in their file names rather than by
# filesystem timestamps, which change when files are copied or touched. Files whose
# names do not follow the '<epoch>-<step>' pattern are ignored instead of crashing int().
candidates = [(parse_checkpoint_name(path), path) for path in file_list]
candidates = [(key, path) for key, path in candidates if key is not None]

if candidates:
    (_epoch, _i), checkpoint_path = max(candidates)
    print(checkpoint_path)

    # map_location='cpu' lets a checkpoint written on a GPU machine load on a CPU-only
    # one; load_state_dict then copies the values into the model's own parameters.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    print('Loaded Epoch {}s i {} checkpoint from {}'.format(_epoch, _i, checkpoint_path))
    # Training continues with the step after the one that was checkpointed.
    index = _i + 1


In [None]:
# Fine-tune the model on next-token prediction, resuming from (_epoch, index)
# when a checkpoint was loaded.

# An earlier evaluation run may have left the model in eval mode.
model.train()

for epoch in range(_epoch, 10):
    # Only the epoch being resumed has steps that were already trained; every later
    # epoch starts from step 0 again. (Offsetting enumerate() would merely shift the
    # counter — in every epoch — without skipping any batches.)
    first_step = index if epoch == _epoch else 0
    for i, batch in enumerate(train_dataloader):
        if i < first_step:
            # Skipped batches are still loaded, just not trained on.
            continue

        optimizer.zero_grad()
        # Truncate each sequence to its first 1000 tokens
        # (presumably to stay inside GPT-2's context window — confirm).
        batch = batch[:, :1000]
        # Passing the inputs as labels makes the model compute the language-modeling
        # loss itself; it is the first element of the output.
        output = model(batch, labels=batch)
        loss = output[0]

        loss.backward()
        optimizer.step()

        # Save the model every 10 batches, named '<epoch>-<step>.pt'.
        if i % 10 == 0:
            torch.save(model.state_dict(), './checkpoints/{}-{}.pt'.format(epoch, i))

        print('Epoch: {}/{}'.format(epoch, 10),
                'Step: {}/{}'.format(i, len(train_dataloader)),
                'Loss: {}'.format(loss.item()))


In [None]:
# Switch the model to evaluation mode before the test loop
# (the call returns the model, which the cell displays).
model.eval()


In [None]:
# Test the model on the held-out split and print the loss of every batch.
# No gradients are needed for evaluation, so the forward passes run under
# torch.no_grad() to avoid building autograd graphs and wasting memory.
# `epoch` is the value left over from the training loop.
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        # Same truncation as in training: keep the first 1000 tokens.
        batch = batch[:, :1000]
        output = model(batch, labels=batch)
        loss = output[0]
        print('Epoch: {}/{}, Test Loss: {}'.format(epoch, 10, loss.item()))



In [None]:
# Persist the fine-tuned weights.
# exist_ok=True creates './out' when missing without the race of a separate
# os.path.exists check.
os.makedirs('./out', exist_ok=True)
torch.save(model.state_dict(), './out/gpt2_code_generator.pt')


In [None]:
# Generate a sample continuation from a short Python prompt.
# NOTE(review): generate_code comes from the local `dataset` module; its arguments
# presumably control output length and sampling — confirm there.
prompt = "import torch\n"
dataset.generate_code(prompt, length=100, temperature=0.7, top_k=50)
