In [1]:
# Import Packages
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast, GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import TensorDataset, DataLoader

# specify GPU
device = torch.device("cuda")

# Import from external file
X_train = pd.read_csv("output/x_train.csv")
X_test = pd.read_csv("output/x_test.csv")

In [9]:
from tokenizers import CharBPETokenizer
tokenizer = CharBPETokenizer(bert_normalizer=False)
save_path = 'tokenized_data/train.txt'

# Save files
X_train['prep_answer'].to_csv(save_path, index=False, header=False)
#X_test[['question', 'prep_answer', 'cluster']].to_csv(save_path, index=False, header=False)

# train the tokenizer model
tokenizer.train(save_path)
# saving the tokenized data in our specified folder
tokenizer.save_model('tokenized_data')

['tokenized_data\\vocab.json', 'tokenized_data\\merges.txt']

In [None]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
import tensorflow as tf

# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id='<s>',
  eos_token_id='</s>'
)
# creating the model
model = TFGPT2LMHeadModel(config)

In [None]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [None]:
num_epoch = 1
#history = model.fit(dataset, epochs=num_epoch)

In [4]:
#test = encode_qa(X_train['answer'].iloc[0:1], X_train['answer'].iloc[0:1], tokenizer)
test = tokenizer.encode(X_train['question'][0], return_tensors='pt')

In [5]:
test2 = tokenizer.decode(model.generate(test.cuda(), do_sample=True,
    max_length=100,
    top_k=50)[0],skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [6]:
#inspired by
#https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py
#https://gist.github.com/cdpierse/3ad19852efa7324cc16f4d83b9191176#file-script_dataset-py

class Data():

  def __init__(self, tokenizer, df, block_size=512):

    block_size = block_size - (
        tokenizer.max_len - tokenizer.max_len_single_sentence
    )

    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df))

    self.examples = []
    for i in range(0, len(tokenized_text) - block_size +1, block_size):

      self.examples.append(
          tokenized_text[i : i + block_size] + [tokenizer.eos_token_id]
      )

  def __len__(self):
    return len(self.examples)

  def __getitem__(self, item):
    return torch.tensor(self.examples[item], dtype=torch.long)

In [7]:
#valid = Data(tokenizer," ".join(data[:800]))
#train = Data(tokenizer," ".join(data[800:]))

batch_size = 1

#train_loader= DataLoader(train,shuffle=False,batch_size=batch_size,)
#valid_loader =DataLoader(valid,shuffle=False,batch_size=batch_size,)

In [8]:
epochs = 10
learning_rate = 0.0002
warmup_steps = 2000


optimizer = AdamW(model.parameters(), lr=learning_rate)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1)

In [9]:
model.train()
tokenizer.pad_token = tokenizer.eos_token
encoding = tokenizer(list(X_train['prep_answer'].iloc[:50]), return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [10]:
encoding = tokenizer(list(X_train['question'].iloc[:50]), return_tensors='pt', padding=True, truncation=True)
q_input_ids = encoding['input_ids']
q_attention_mask = encoding['attention_mask']

In [11]:
import gc
gc.collect()

outputs = model(input_ids.cuda(), attention_mask=attention_mask.cuda(), labels=input_ids.cuda())
loss = outputs.loss
loss.backward()
optimizer.step()

RuntimeError: CUDA out of memory. Tried to allocate 2.34 GiB (GPU 0; 4.00 GiB total capacity; 1.98 GiB already allocated; 914.29 MiB free; 2.03 GiB reserved in total by PyTorch)

In [None]:
#test = encode_qa(X_train['answer'].iloc[0:1], X_train['answer'].iloc[0:1], tokenizer)
test3 = tokenizer.encode(X_train['question'][0], return_tensors='pt')

In [None]:
test4 = tokenizer.decode(model.generate(test3.cuda(), do_sample=True,
    max_length=100,
    top_k=50)[0],skip_special_tokens=True)

In [None]:
model.train()
for epoch in range(epochs):

  print(f"Epoch {epoch} started")

  sum_loss = 0
  val_loss = 0
  model.train()
  for data in X_train:

    output = model(data.cuda(), labels= data.cuda())


    loss, logits = output[:2]
    loss.backward()

    sum_loss += loss.item()
    optimizer.step()
    #scheduler.step()
    optimizer.zero_grad()


  # model.eval()
  # for data in valid_loader:

  #   output = model(data.cuda(), labels= data.cuda())


  #   loss, logits = output[:2]

  #   val_loss += loss.item()

  print(f"training loss { sum_loss / len(X_train)}")
