In [1]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler


import nltk
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelWithLMHead

[nltk_data] Downloading package punkt to C:\Users\Rachel
[nltk_data]     Tan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
torch.cuda.is_available()

True

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

## Trying out just the pretrained model

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = AutoModelWithLMHead.from_pretrained("gpt2")



In [5]:
prefix = "" #without any fine tuning 
prompt = "The Home Team Science and Technology Agency is"
output_length = 200


def generate_text(prefix, prompt, output_length=300):
    inputs = tokenizer.encode( prefix + prompt,
                          return_tensors='pt',
                          max_length=512,
                          truncation=True)
    generated_ids = model.generate(inputs, 
                             max_length=output_length, 
                            #  diversity_penalty = 1.2,
                             temperature = 1.0,
                             do_sample = True,
                             no_repeat_ngram_size = 3,
                             num_beams=2,
                             min_length=20 )
    generated_text = tokenizer.decode(generated_ids[0])
    print(generated_text)

generate_text(prefix, prompt, output_length)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  next_indices = next_tokens // vocab_size


The Home Team Science and Technology Agency is developing a mobile phone app to help people find their way around the world. The app, which will be available to download as early as October, will be able to track your journey using GPS and GPS-enabled devices.

The app will allow users to track their journey using a smartphone or tablet, and will allow them to locate their destination in real-time. The device will be connected to an iPhone or iPad, which can be used to access the app. The Home Team will also be developing a companion app that users can use to track the route of their journey.
, which is available to purchase from the Apple App Store. The App Store will also offer a free version of the Home Team app for iPhone and iPad. The new app will be launched in early October. The home team will also continue to develop the app, and can provide feedback on the app in the coming months.<|endoftext|>


## Trying with fine tuning 

In [5]:
dataset = pd.read_csv('qa_articles.csv')[['body_basic']] #only 50 for now as a trial, will increase later on 
dataset.columns = ['text']

In [6]:
dataset_list = list(dataset['text'])

In [7]:
def tokenize_texts(data): 
    tokenizer.add_special_tokens({'bos_token': '<|startoftext|>', 'eos_token':'<|endoftext|>', 'pad_token': '<|pad|>'})
    model.resize_token_embeddings(len(tokenizer))
    input_text = tokenizer(data,
                          max_length=512,
                          truncation=True,
                          padding = 'max_length')
    input_text["labels"] = input_text["input_ids"].copy()
    return input_text

In [8]:
data_token = [tokenize_texts(data) for data in dataset_list]

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=data_token, eval_dataset=data_token)

In [11]:
trainer.train()

***** Running training *****
  Num examples = 57
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 24


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=24, training_loss=15.764266967773438, metrics={'train_runtime': 24.3663, 'train_samples_per_second': 7.018, 'train_steps_per_second': 0.985, 'total_flos': 44680937472000.0, 'train_loss': 15.764266967773438, 'epoch': 3.0})

In [12]:
trainer.save_model("./testgpt2")

Saving model checkpoint to ./testgpt2
Configuration saved in ./testgpt2\config.json
Model weights saved in ./testgpt2\pytorch_model.bin


In [13]:
model_trained = AutoModelWithLMHead.from_pretrained("./testgpt2")

loading configuration file ./testgpt2\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 50259
}

loading weights file ./testgpt2\pytorch_model.bin
All model checkpoint weights were used when init

In [36]:
def generate_text_2(prefix, prompt, output_length=300):
    tokenizer.add_special_tokens({'bos_token': '<|startoftext|>', 'eos_token':'<|endoftext|>', 'pad_token': '<|pad|>'})
    inputs = tokenizer.encode( prefix + prompt,
                          return_tensors='pt',
                          max_length=200,
                          truncation=True)
    generated_ids = model_trained.generate(inputs, 
                             max_length=output_length, 
                            #  diversity_penalty = 1.2,
                             temperature = 1.0,
                             do_sample =True ,
                             no_repeat_ngram_size =2,
                             num_beams=2,
                             min_length=100, 
                            pad_token_id= 50258, 
                            bos_token_id = 50256,
                            eos_token_id = 50257)
    generated_text = tokenizer.decode(generated_ids[0])
    print(generated_text)

In [38]:
torch.manual_seed(88)
prefix = "" #without any fine tuning 
prompt = "<|startoftext|> A question and answering system is"
output_length = 200

generate_text_2(prefix, prompt, output_length) #it has been trained but it is crap 

Assigning <|startoftext|> to the bos_token key of the tokenizer
Assigning <|endoftext|> to the eos_token key of the tokenizer
Assigning <|pad|> to the pad_token key of the tokenizer


<|startoftext|> A question and answering system is one of the most difficult sciences. It consists of four parts: (a<|pad|> a a) The elements of The equations ofThe system consists<|pad|> of. Its contents are: A mathematical formula and the number of elements inThe formula consists Of the equationsof The system contains The numberOf elementsinThe numberof elementsInThe answer to the question, The answerto the questions, is a mathematical solution ofthe question,, isof the answerTo the, the answered to, by, and by.
<|startoftext|>


In [32]:
tokenizer

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'})

In [None]:
#still trying to figure out how to make this work 