In [None]:
! pip install transformers

In [None]:
import csv
import os
import argparse
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

class MyDataset(Dataset):
  """
  Summary:
    Line-oriented text dataset: every non-blank line of the data file becomes
    one sample with the end-of-text marker appended.
  Parameters:
    data_file_name: Path of the text file to read (used as given)
    data_dir: Accepted for backward compatibility only; it is NOT used to
      build the path, so callers must pass a usable path in data_file_name
  """

  def __init__(self, data_file_name, data_dir='.data/'):
    super().__init__()

    # The path is used exactly as supplied (data_dir never took part in it).
    data_path = data_file_name

    self.end_of_text_token = " <|endoftext|> "
    self.data_list = []

    with open(data_path) as data_file:
      for line in data_file:
        text = line.rstrip('\n')
        # Skip blank lines: they would otherwise become samples that hold
        # nothing but the end-of-text marker.
        if not text.strip():
          continue
        # Append the marker explicitly so that a final line without a
        # trailing newline is terminated like every other sample.
        self.data_list.append(text + self.end_of_text_token)

  def __len__(self):
    """Return the number of samples read from the file."""
    return len(self.data_list)

  def __getitem__(self, item):
    """Return the sample (a string ending with the end-of-text marker) at index item."""
    return self.data_list[item]

def get_data_loader(data_file_name):
    """
    Summary:
        Build a shuffling DataLoader that yields one sample per batch
    Parameters:
        data_file_name: Path of the text file handed to MyDataset
    """
    return DataLoader(MyDataset(data_file_name), batch_size=1, shuffle=True)

def train(epochs, data_loader, batch_size, tokenizer, model, device,
          optimizer=None, scheduler=None):
    """
    Summary:
        Fine-tune the model with gradient accumulation: one sample is fed per
        forward/backward pass and the optimizer is stepped once every
        batch_size samples (plus once for any remainder at the end of an epoch).
    Parameters:
        epochs: Number of passes over data_loader
        data_loader: Iterable whose items hold the sample text at index 0
        batch_size: Number of samples accumulated per optimizer update (>= 1)
        tokenizer: Object with an encode(text) method returning token ids
        model: Model called as model(ids, labels=ids) whose first output is the loss
        device: torch device the token ids are moved to
        optimizer: Optimizer to step; when omitted, the module-level
            `optimizer` (and `scheduler`) variables are used, as before
        scheduler: Optional learning-rate scheduler stepped with the optimizer
    Returns:
        The same model object, after training
    Raises:
        ValueError: batch_size is below 1, or no optimizer is available
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if optimizer is None:
        # Backward compatibility: earlier callers relied on notebook globals.
        optimizer = globals().get('optimizer')
        if scheduler is None:
            scheduler = globals().get('scheduler')
    if optimizer is None:
        raise ValueError("no optimizer supplied and no global `optimizer` is defined")

    batch_counter = 0
    sum_loss = 0.0

    def _apply_update():
        """Step optimizer/scheduler, clear gradients, report every 10 updates."""
        nonlocal batch_counter, sum_loss
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()

        batch_counter += 1
        if batch_counter == 10:
            print(f"Total Loss is {sum_loss}") #printed after every 10*batch_size
            batch_counter = 0
            sum_loss = 0.0

    # from_pretrained() hands back the model in eval mode; switch to training
    # mode so dropout is active while fine-tuning.
    model.train()

    for epoch in range(epochs):
        print (f'Running {epoch+1} epoch')

        pending = 0  # samples whose gradients have not been applied yet
        for txt in data_loader:
            token_ids = tokenizer.encode(txt[0])
            if not token_ids:
                # Nothing to learn from an empty sequence.
                continue

            input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)
            loss = model(input_ids, labels=input_ids)[0]
            loss.backward()
            # .item() keeps the running total a plain float instead of a
            # tensor living on the training device.
            sum_loss += loss.item()
            pending += 1

            # Update only after a full group of batch_size samples (the old
            # `idx % batch_size == 0` test fired on the very first sample).
            if pending == batch_size:
                _apply_update()
                pending = 0

        # Flush the remainder so its gradients do not leak into the next epoch.
        if pending:
            _apply_update()

    return model

def save_model(model, name):
    """
    Summary:
        Write the model's state dict to "<name>.pt"
    Parameters:
        model: Trained model object (must provide state_dict())
        name: File name without the ".pt" suffix, which is appended here
    """
    print("Saving model to Disk")
    target_path = f"{name}.pt"
    weights = model.state_dict()
    torch.save(weights, target_path)

def load_models(model_name='gpt2-medium'):
    """
    Summary:
        Loading Pre-trained GPT-2 tokenizer and language-model head
    Parameters:
        model_name: Checkpoint identifier passed to from_pretrained for both
            the tokenizer and the model; defaults to 'gpt2-medium', the
            checkpoint this function always used before
    Returns:
        (tokenizer, model) tuple
    """
    print ('Loading/Downloading GPT-2 Model')
    # Load both from the same checkpoint so vocabulary and embeddings match.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    return tokenizer, model

In [None]:
# Fetch the pre-trained tokenizer and model (downloads on first use).
(gpt_tok, gpt_pre) = load_models()


In [None]:
# One training sample per line of the text file, served in shuffled order.
intent_data = get_data_loader(data_file_name='gpt_data.txt')

In [None]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

gpt_pre.to(device)

In [None]:
# Optimizer over every parameter of the pre-trained model.
optimizer = AdamW(
    params=gpt_pre.parameters(),
    lr=3e-5,
)


In [None]:
# The schedule needs the real number of optimizer updates. With the previous
# `num_training_steps=-1` the multiplier became 0.0 on the first step after
# warmup, which froze the learning rate at zero for the rest of training.
# train() is called with 3 epochs and updates once per 8 samples; intent_data
# yields one sample per item, so len(intent_data) is the sample count.
updates_per_epoch = (len(intent_data) + 8 - 1) // 8  # ceil division
num_training_steps = 3 * updates_per_epoch
# NOTE(review): if num_training_steps is not larger than the 300 warmup steps,
# the run never leaves warmup - lower num_warmup_steps for small datasets.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps=300,
    num_training_steps=num_training_steps,
)

In [None]:
# 3 epochs, accumulating gradients over 8 samples per optimizer update.
gpt_trained = train(
    epochs=3,
    data_loader=intent_data,
    batch_size=8,
    tokenizer=gpt_tok,
    model=gpt_pre,
    device=device,
)

Running 1 epoch
Total Loss is 468.53094482421875
Total Loss is 312.11077880859375
Total Loss is 248.7689971923828
Total Loss is 222.0544891357422
Total Loss is 232.70681762695312
Total Loss is 208.8592529296875
Total Loss is 206.02508544921875
Total Loss is 229.0137176513672
Total Loss is 221.3388671875
Total Loss is 228.28688049316406
Total Loss is 215.26014709472656
Total Loss is 199.99774169921875
Total Loss is 229.53857421875
Total Loss is 222.13967895507812
Total Loss is 213.6986541748047
Total Loss is 216.7646484375
Total Loss is 227.72793579101562
Total Loss is 298.00518798828125
Total Loss is 265.0007629394531
Total Loss is 256.3049011230469
Total Loss is 237.97482299804688
Total Loss is 222.09034729003906
Total Loss is 226.44760131835938
Total Loss is 228.945556640625
Total Loss is 213.88426208496094
Total Loss is 224.8849334716797
Total Loss is 226.03961181640625
Total Loss is 222.86502075195312
Total Loss is 221.03744506835938
Total Loss is 232.37461853027344
Total Loss is 2

In [None]:
# A PyTorch GPT2LMHeadModel has no `save_weights` method (that is the Keras
# API); use the notebook's save_model helper, which writes the state dict to
# "trained.model.pt".
save_model(gpt_trained, 'trained.model')