In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the /kaggle/working directory walked below

import os
# Print the full path of every file found anywhere below /kaggle/working.
for root, _subdirs, names in os.walk('/kaggle/working'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/working/__notebook__.ipynb


In [2]:
import random

import torch
import torch.optim as optim
from spacy.lang.en import English
from torchtext.data import Field, LabelField, Dataset, Pipeline, BPTTIterator, BucketIterator, TabularDataset, Example
from torchtext.datasets import WikiText2
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import Trainer



In [3]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Pretrained GPT-2 tokenizer; both the padding and the unknown token are
# set to the '<|endoftext|>' token.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    pad_token='<|endoftext|>',
    unk_token='<|endoftext|>',
)

# Pretrained GPT-2 language-model head, moved to the selected device.
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [5]:
# Look up and show the maximum input size the tokenizer records for 'gpt2'.
max_input_sizes = tokenizer.max_model_input_sizes
model_max_input = max_input_sizes['gpt2']
print(model_max_input)

1024


In [6]:
# Path of the plain-text corpus that the next cell reads in full.
# NOTE(review): "homles" looks like a misspelling of "holmes" — presumably it
# matches the uploaded file name; verify before renaming.
FILE_NAME = "/kaggle/input/adventure_of_sherlock_homles.txt"

In [7]:
# Read the entire corpus into memory as one UTF-8 string. The context manager
# closes the file handle even if reading or decoding raises.
with open(FILE_NAME, "r", encoding='utf8') as fp:
    text = fp.read()

In [8]:
# Collapse every run of whitespace (newlines included) into a single space.
# str.split() with no arguments already splits on newlines, so one pass is enough.
words = text.split()
new_text = " ".join(words)

In [9]:
# Split the normalised corpus into sentences with spaCy's rule-based
# sentencizer on a blank English pipeline.
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(new_text)
# Span.text replaces the deprecated Span.string attribute; after strip() both
# give the same sentence text.
lines = [sent.text.strip() for sent in doc.sents]

In [10]:
def attention_mask(tokens):
    """Return a list of 1s with one entry per element of ``tokens``.

    Used as the preprocessing step of the mask field, so every real token
    gets a mask value of 1.
    """
    return [1 for _ in range(len(tokens))]

In [11]:
# Keyword arguments shared by both fields: no vocabulary, no start/end
# tokens, GPT-2 tokenisation, and batch-first tensors.
shared_field_options = dict(
    use_vocab=False,
    init_token=None,
    eos_token=None,
    tokenize=tokenizer.tokenize,
    batch_first=True,
)

# Token ids: tokens are converted to ids by the tokenizer and padded with
# the tokenizer's pad token id.
text_field = Field(
    pad_token=tokenizer.pad_token_id,
    unk_token=tokenizer.unk_token_id,
    preprocessing=tokenizer.convert_tokens_to_ids,
    **shared_field_options,
)

# Attention mask: the same tokenisation, mapped to a list of 1s by
# attention_mask() and padded with 0.
attn_mask_field = Field(
    pad_token=0,
    unk_token=1,
    preprocessing=attention_mask,
    **shared_field_options,
)


In [12]:
# Build one Example per sentence; the same sentence feeds both the token-id
# field ("text") and the attention-mask field ("mask").
example_fields = [("text", text_field), ("mask", attn_mask_field)]
examples = [Example.fromlist([sentence, sentence], example_fields) for sentence in lines]

len(examples)

6899

In [13]:
# Wrap the examples in a Dataset that exposes the "text" and "mask" fields.
dataset_fields = [("text", text_field), ("mask", attn_mask_field)]
ds = Dataset(examples, dataset_fields)

In [14]:
# Seed Python's RNG and hand its state to split() so the 90/10 train/test
# partition is the same on every run; without this the held-out set changes
# each time the notebook is executed.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
tr, te = ds.split(split_ratio=0.9, random_state=random.getstate())
print("Training samples ", len(tr))
print("Testing samples ", len(te))

Training samples  6209
Testing samples  690


In [15]:
# Batch iterators for the train and test splits; the sort key is the token
# count of each example's "text" field.
batch_size = 10
tr_i, te_i = BucketIterator.splits(
    datasets=(tr, te),
    batch_sizes=(batch_size,) * 2,
    sort_key=lambda example: len(example.text),
    device=device,
    shuffle=True,
)

In [16]:
# AdamW over all model parameters with the fine-tuning learning rate.
LEARNING_RATE = 5e-06
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [17]:
NUM_EPOCHS = 4
EVAL_AFTER_EPOCHS = 1


def _batch_lm_loss(batch):
    """Return the language-modelling loss of ``model`` on one batch.

    The labels are a copy of the input ids in which every padded position
    (attention mask == 0) is set to -100, the label value the Hugging Face
    LM heads leave out of the loss. Masking by the attention mask rather
    than by token id matters because the pad id is also the eos/unk id
    ('<|endoftext|>'), so a genuine occurrence of that token must not be
    dropped from the loss.
    """
    labels = batch.text.clone()
    labels[batch.mask == 0] = -100
    outputs = model(input_ids=batch.text, attention_mask=batch.mask, labels=labels)
    # Index the loss instead of tuple-unpacking: the number (and container
    # type) of the remaining outputs depends on the model configuration and
    # library version, while the loss is always first when labels are given.
    return outputs[0]


for epoch in range(NUM_EPOCHS):
    # Accumulate plain floats so no tensors are kept alive across batches.
    epoch_loss = 0.0
    model.train()
    for batch in tr_i:
        optimizer.zero_grad()
        loss = _batch_lm_loss(batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # len(tr_i) is the number of batches, so this is the mean per-batch loss.
    print("Epoch %d Total Loss %f, Avg Loss %f" % (epoch+1, epoch_loss, epoch_loss/len(tr_i)))

    if (epoch + 1) % EVAL_AFTER_EPOCHS == 0:
        total_eval_loss = 0.0
        model.eval()
        # No gradients are needed while scoring the held-out split.
        with torch.no_grad():
            for batch in te_i:
                total_eval_loss += _batch_lm_loss(batch).item()
        print("Eval %d Total Loss %f, Avg Loss %f" % (epoch+1, total_eval_loss, total_eval_loss/len(te_i)))

Epoch 1 Total Loss 2240.918213, Avg Loss 3.608564
Eval 1 Total Loss 225.292267, Avg Loss 3.265105
Epoch 2 Total Loss 2049.608398, Avg Loss 3.300497
Eval 2 Total Loss 218.983215, Avg Loss 3.173670
Epoch 3 Total Loss 1981.357788, Avg Loss 3.190592
Eval 3 Total Loss 215.506226, Avg Loss 3.123279
Epoch 4 Total Loss 1935.522461, Avg Loss 3.116783
Eval 4 Total Loss 213.776321, Avg Loss 3.098208


In [18]:
# Persist the fine-tuned weights and the tokenizer files side by side in the
# notebook's output directory.
OUTPUT_DIR = "/kaggle/working/"
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('/kaggle/working/vocab.json',
 '/kaggle/working/merges.txt',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/added_tokens.json')