In [None]:
# Mount Google drive
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the train and test sentence-pair CSVs from the mounted Drive folder
import pandas as pd

# Kept as a plain string ending in '/' because later cells append file names to it
path = '/content/drive/MyDrive/GPT-2/data/'
train_data, test_data = (
    pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv')
)

In [None]:
train_data.head()

Unnamed: 0,sentence1,sentence2
0,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...
1,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...
2,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...
3,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ..."
4,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...


In [None]:
test_data.head()

Unnamed: 0,sentence1,sentence2
0,From the merger of the Four Rivers Council and...,Shawnee Trails Council was formed from the mer...
1,Kathy and her husband Pete Beale ( Peter Dean ...,Kathy and her husband Peter Dean ( Pete Beale ...
2,Timora diarhoda is a species of moth of the No...,Diarhoda is a kind of moth of the Noctuidae fa...
3,Joe R. Campa Jr. is a former sailor of the Uni...,Joe R. Campa Jr. is a former U.S. Navy Matrose...
4,"The family moved to Camp Hill in 1972 , where ...","In 1972 , the family moved to Camp Hill , wher..."


In [None]:
train_data.shape

(5194, 2)

In [None]:
test_data.shape

(1770, 2)

**Feed the GPT-2**

Now, to make our dataset ready to be fed to GPT-2, we combine the ‘sentence1’ column enclosed in `<s>` and `</s>`, followed by the unique separator token `>>>>`, and then the ‘sentence2’ column enclosed in `<p>` and `</p>`. 

In [None]:
# Build one training string per row: <s>sentence1</s> + separator + <p>sentence2</p>
train_source = '<s>' + train_data['sentence1'] + '</s>'
train_target = '<p>' + train_data['sentence2'] + '</p>'
train_data['combined'] = train_source + '>>>>' + train_target

train_data.head()

Unnamed: 0,sentence1,sentence2,combined
0,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,<s>The NBA season of 1975 -- 76 was the 30th s...
1,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,<s>When comparable rates of flow can be mainta...
2,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,<s>It is the seat of Zerendi District in Akmol...
3,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",<s>William Henry Henry Harman was born on 17 F...
4,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,<s>With a discrete amount of probabilities For...


In [None]:
train_data['combined'][5]

'<s>He was a scholar in Metaphysical Literature , Theology and Classical sciences .</s>>>>><p>He was a scholar in metaphysical literature , theology , and classical science .</p>'

In [None]:
# Write one training example per line as plain text.
# Series.to_csv(sep='\n') goes through the csv writer, which wraps any row
# containing a double quote in quotes and doubles the inner quotes, so the
# text on disk would no longer match the sentences GPT-2 should learn from.
# Rows where either sentence is missing (NaN after concatenation) are skipped.
with open(path + 'train.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(line + '\n' for line in train_data['combined'].dropna())

In [None]:
# Apply the same <s>...</s> + separator + <p>...</p> layout to the test split
test_source = '<s>' + test_data['sentence1'] + '</s>'
test_target = '<p>' + test_data['sentence2'] + '</p>'
test_data['combined'] = test_source + '>>>>' + test_target

test_data.head()

Unnamed: 0,sentence1,sentence2,combined
0,From the merger of the Four Rivers Council and...,Shawnee Trails Council was formed from the mer...,<s>From the merger of the Four Rivers Council ...
1,Kathy and her husband Pete Beale ( Peter Dean ...,Kathy and her husband Peter Dean ( Pete Beale ...,<s>Kathy and her husband Pete Beale ( Peter De...
2,Timora diarhoda is a species of moth of the No...,Diarhoda is a kind of moth of the Noctuidae fa...,<s>Timora diarhoda is a species of moth of the...
3,Joe R. Campa Jr. is a former sailor of the Uni...,Joe R. Campa Jr. is a former U.S. Navy Matrose...,<s>Joe R. Campa Jr. is a former sailor of the ...
4,"The family moved to Camp Hill in 1972 , where ...","In 1972 , the family moved to Camp Hill , wher...","<s>The family moved to Camp Hill in 1972 , whe..."


In [None]:
# Write one test example per line as plain text.
# Series.to_csv(sep='\n') goes through the csv writer, which wraps any row
# containing a double quote in quotes and doubles the inner quotes, so the
# text on disk would no longer match the original sentences.
# Rows where either sentence is missing (NaN after concatenation) are skipped.
with open(path + 'test.txt', 'w', encoding='utf-8') as test_file:
    test_file.writelines(line + '\n' for line in test_data['combined'].dropna())

**Fine tuning GPT-2**

In [None]:
!pip install transformers
!pip install wandb

Collecting transformers
  Downloading transformers-4.12.0-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.1 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

train_path = path + 'train.txt'
test_path = path + 'test.txt'

def load_dataset(train_path, test_path, tokenizer, block_size=64):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the plain-text training file.
        test_path: Path of the plain-text test file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Value forwarded as ``block_size`` to both ``TextDataset``
            instances. Defaults to 64, the value previously hard-coded here.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def build_text_dataset(file_path):
        # Both splits must share the tokenizer and block size.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = build_text_dataset(train_path)
    test_dataset = build_text_dataset(test_path)

    # mlm=False: no masked-language-modeling objective in the collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer);



**Initialize Trainer with TrainingArguments and GPT-2 model**

In [None]:
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel

# Start from the pretrained GPT-2 medium checkpoint
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Drive folder that receives the checkpoints and the final model
out_dir = '/content/drive/MyDrive/GPT-2/model/'

training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir=out_dir + "gpt2_output",  # checkpoints are written here
    overwrite_output_dir=True,           # reuse the directory if it already exists
    save_strategy='epoch',               # one checkpoint per epoch
    report_to='wandb',                   # send logged metrics to Weights & Biases
    logging_steps=10,                    # log the training loss every 10 steps
    # --- schedule ---
    num_train_epochs=3,
    warmup_steps=50,                     # warmup steps for the learning rate scheduler
    # --- batching ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    prediction_loss_only=True,
)

# NOTE(review): no evaluation strategy is configured above and the recorded run
# only logs training loss - confirm whether eval_dataset is meant to be used
# during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

**Train and save the model**

In [None]:
trainer.train()

***** Running training *****
  Num examples = 10916
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4095
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,4.6989
20,4.1689
30,3.9231
40,3.6317
50,3.2734
60,3.2957
70,3.2045
80,2.9253
90,2.8791
100,2.9835


Saving model checkpoint to /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-1365
Configuration saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-1365/config.json
Model weights saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-1365/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-2730
Configuration saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-2730/config.json
Model weights saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-2730/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-4095
Configuration saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-4095/config.json
Model weights saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/checkpoint-4095/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4095, training_loss=2.1005841362316238, metrics={'train_runtime': 4083.0721, 'train_samples_per_second': 8.02, 'train_steps_per_second': 1.003, 'total_flos': 3801636293050368.0, 'train_loss': 2.1005841362316238, 'epoch': 3.0})

In [None]:
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/GPT-2/model/gpt2_output
Configuration saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/config.json
Model weights saved in /content/drive/MyDrive/GPT-2/model/gpt2_output/pytorch_model.bin


**Test the model**

In [None]:
from transformers import pipeline

gpt = pipeline('text-generation',model= out_dir + 'gpt2_output', tokenizer=tokenizer)

loading configuration file /content/drive/MyDrive/GPT-2/model/gpt2_output/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "t

We define a paraphraser function that wraps the input sentence in the same `<s>…</s>>>>><p>` format we used to build the training data, so that the model continues the prompt with a paraphrase.

In [None]:
def paraphrase(input_sequence):
  """Generate raw paraphrase output for one sentence.

  The sentence is wrapped in the training layout (``<s>`` sentence ``</s>``,
  the ``>>>>`` separator, then an opening ``<p>``) and handed to the ``gpt``
  text-generation pipeline with sampling enabled.

  Args:
    input_sequence: The sentence to paraphrase, as a plain string.

  Returns:
    Whatever the ``gpt`` pipeline returns for the prompt; in the recorded run
    this is a list of dicts carrying a ``'generated_text'`` entry.
  """
  prompt = '<s>' + input_sequence + '</s>>>>><p>'
  sampling_options = dict(max_length=100, do_sample=True, top_k=50, top_p=0.95)
  return gpt(prompt, **sampling_options)

In [None]:
paraphrase("In 1876 , he moved to San Diego , California , and in 1887 to Dallas , Texas .")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '<s>In 1876 , he moved to San Diego , California , and in 1887 to Dallas , Texas .</s>>>>><p>He moved in 1876 to San Diego, California, and in 1887 to Dallas, Texas.</p>\n<s>On 25 January 2011, it was confirmed that Ali Daei had signed a new five-year contract with SBS.</s>>>>><p>On 25 January 2011, it was confirmed that Ali Dae'}]

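Define another function that trims the output down to the paraphrase itself: the text between the `<p>` that ends the prompt and the first closing `</p>`.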

In [None]:
def clean_paraphrased(input_sequence):
  """Return only the paraphrase text generated for ``input_sequence``.

  Generation is delegated to ``paraphrase`` so the prompt layout and sampling
  settings are defined in one place instead of being duplicated here.

  Args:
    input_sequence: The sentence to paraphrase, as a plain string.

  Returns:
    The part of the first generated text that follows the ``</s>>>>><p>``
    separator, cut at the first ``</p>``.
  """
  generated_text = paraphrase(input_sequence)[0]['generated_text']
  # The text before the separator is the echoed prompt; the model's
  # continuation comes after it and may run on into further examples.
  continuation = generated_text.split('</s>>>>><p>')[1]
  return continuation.split('</p>')[0]

In [None]:
clean_paraphrased("In 1876 , he moved to San Diego , California , and in 1887 to Dallas , Texas .")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'He moved to San Diego, California in 1876, and moved in 1887 to Dallas, Texas.'

In [None]:
clean_paraphrased("In 1876 , he moved to San Diego , California , and in 1887 to Dallas , Texas .")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'He moved to San Diego, California in 1876, and moved to Dallas, Texas in 1887.'

In [None]:
clean_paraphrased("In 1876 , he moved to San Diego , California , and in 1887 to Dallas , Texas .")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'He moved to San Diego, California in 1876 and in 1887 to Dallas, Texas.'