In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/working/__notebook__.ipynb


In [2]:
from torchtext.datasets import WikiText2
from transformers import GPT2Tokenizer
from transformers import Trainer
import torch
from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup
import torch.optim as optim
from spacy.lang.en import English
import logging
import pickle
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, random_split



In [3]:
class TextData(Dataset):
    def __init__(
        self,
        tokenizer: GPT2Tokenizer,
        file_path: str,
        block_size=512,
        overwrite_cache=False,
    ):
        assert os.path.isfile(file_path)

        block_size = block_size - (
            tokenizer.max_len - tokenizer.max_len_single_sentence
        )

        directory, filename = os.path.split(file_path)

        # change if args are added at later point
        cached_features_file = os.path.join(
            "/kaggle/working/", "gpt2" + "_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not overwrite_cache:
            logger.info(
                f"Loading features from your cached file {cached_features_file}"
            )
            with open(cached_features_file, "rb") as cache:
                self.examples = pickle.load(cache)
                logger.debug("Loaded examples from cache")
        else:
            logger.info(f"Creating features from file {filename} at {directory}")

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
                #text = text.replace("\n",' ')
                #text = " ".join(text.split())                
                logger.debug("Succesfully read text from file")

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(
                        tokenized_text[i : i + block_size]
                    )
                )

            logger.info(f"Saving features into cached file {cached_features_file}")
            with open(cached_features_file, "wb") as cache:
                pickle.dump(self.examples, cache, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger = logging.getLogger(__name__)

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token='<|endoftext|>', unk_token='<|endoftext|>')
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [6]:
model_max_input = tokenizer.max_model_input_sizes['gpt2']
print(model_max_input)

1024


In [7]:
FILE_NAME = "adventures_of_sherlock_holmes.txt"

In [8]:
dataset = TextData(tokenizer, file_path=FILE_NAME, overwrite_cache=False)
len_ds = len(dataset)

In [9]:
tr_len = int(len_ds*0.9)
te_len = len_ds - tr_len
tr_ds, te_ds = random_split(dataset,[tr_len, te_len] )
print("Training samples ", len(tr_ds))
print("Testing samples ", len(te_ds))

Training samples  279
Testing samples  32


In [10]:
tr_loader = DataLoader(tr_ds, batch_size=1, shuffle=True)
te_loader = DataLoader(te_ds, batch_size=1, shuffle=True)

In [11]:
optimizer = optim.AdamW(model.parameters(), lr=5e-05)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=-1)

In [12]:
NUM_EPOCHS = 4
EVAL_AFTER_EPOCHS = 1
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    model.train()
    for batch in tr_loader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch.to(device),labels=batch.to(device))
        loss, logits, past = outputs
        epoch_loss += loss.detach()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print("Epoch %d Total Loss %f, Avg Loss %f" % (epoch+1, epoch_loss, epoch_loss/len(tr_loader)))
    if ((epoch+1)%EVAL_AFTER_EPOCHS==0):
        total_eval_loss = 0
        with torch.no_grad():
            model.eval()
            for batch in te_loader:
                eval_outputs = model(input_ids=batch.to(device), labels=batch.to(device))
                eval_loss, eval_logits, eval_past = eval_outputs
                total_eval_loss += eval_loss.detach()        
            print("Eval %d Total Loss %f, Avg Loss %f" % (epoch+1, total_eval_loss, total_eval_loss/len(te_loader)))

Epoch 1 Total Loss 949.650208, Avg Loss 3.403764
Eval 1 Total Loss 95.724945, Avg Loss 2.991405
Epoch 2 Total Loss 852.889282, Avg Loss 3.056951
Eval 2 Total Loss 92.493584, Avg Loss 2.890424
Epoch 3 Total Loss 800.963989, Avg Loss 2.870839
Eval 3 Total Loss 91.619347, Avg Loss 2.863105
Epoch 4 Total Loss 757.031250, Avg Loss 2.713374
Eval 4 Total Loss 91.558655, Avg Loss 2.861208


In [13]:
model.save_pretrained("/kaggle/working/")
tokenizer.save_pretrained("/kaggle/working/")

('/kaggle/working/vocab.json',
 '/kaggle/working/merges.txt',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/added_tokens.json')