## GPT-2 Model for text summarization
Code referenced from : https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/

### Installing the libraries

In [1]:
%%capture

!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install rouge

### Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import random
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, DataCollatorForLanguageModeling, DataCollatorWithPadding, Trainer, TrainingArguments, AutoConfig
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

In [4]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


### Importing the model

In [5]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"
device = torch.device(dev) 
base_model = GPT2LMHeadModel.from_pretrained('gpt2')
base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [6]:
# defining new tokens
bos = '<|endoftext|>'
eos = '<|EOS|>'
body = '<|body|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': '<pad>',
                       'sep_token': body} 

num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# defining the model config
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    sep_token_id=base_tokenizer.sep_token_id,
                                    output_hidden_states=False)

# loading base model
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50260, 768)

### Reading the dataset

In [7]:
path = "/content/drive/MyDrive/NN/amazon_review_dataset_processed.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,reviewText,summary
0,0,use knk zing cut machine like consumer-level c...,elctronic cutter warn chipbord 0.05 '' thick 0...
1,1,right 11.88 amazon prime great price two premi...,great price superior product use pazzles inspi...
2,2,lot art range sculpture paint draw stranger ai...,deluxe kit tank compressor offer lot accessori...
3,3,amendment 08/22/2013 paint really like brush ....,love da vinci cosmotop mix b watercolour mix b...
4,4,get specific reason pressure spinal cord deal ...,work exactly describe super-simple small sew m...


### Data pre-processing and train-test split

In [8]:
truncated_review = [' '.join(df.iloc[i]['reviewText'].split()[:100]) for i in range(df.shape[0])]
df['truncated_review'] = truncated_review
df['combined'] = bos + " " + df['truncated_review'] + body + df['summary'] + eos

df_nottest, df_test = train_test_split(df, train_size = 0.9, random_state = 42)
df_train, df_val = train_test_split(df_nottest, train_size=0.9, random_state=42)

df_train = df_train[['combined']]
df_val = df_val[['combined']]
df_test = df_test[['combined']]
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)
print(train_dataset.shape, val_dataset.shape, test_dataset.shape)

(9596, 2) (1067, 2) (1185, 2)


In [10]:
def tokenize_function(reviews):
  return base_tokenizer(reviews['combined'], padding=True)

tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### Fine-tuning the model

In [11]:
training_args = TrainingArguments(
    output_dir=model_articles_path,          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_articles_path,            # directory for storing logs
    prediction_loss_only=True,
    save_steps=1000
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

trainer = Trainer(
    model=base_model,                       
    args=training_args,                 
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: __index_level_0__, combined. If __index_level_0__, combined are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9596
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7197


Step,Training Loss
500,11.0166
1000,5.899
1500,5.7781
2000,5.6946
2500,5.6492
3000,5.4864
3500,5.4787
4000,5.4522
4500,5.4419
5000,5.3973


Saving model checkpoint to ./finetunedGPT/checkpoint-1000
Configuration saved in ./finetunedGPT/checkpoint-1000/config.json
Model weights saved in ./finetunedGPT/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./finetunedGPT/checkpoint-2000
Configuration saved in ./finetunedGPT/checkpoint-2000/config.json
Model weights saved in ./finetunedGPT/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./finetunedGPT/checkpoint-3000
Configuration saved in ./finetunedGPT/checkpoint-3000/config.json
Model weights saved in ./finetunedGPT/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ./finetunedGPT/checkpoint-4000
Configuration saved in ./finetunedGPT/checkpoint-4000/config.json
Model weights saved in ./finetunedGPT/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ./finetunedGPT/checkpoint-5000
Configuration saved in ./finetunedGPT/checkpoint-5000/config.json
Model weights saved in ./finetunedGPT/checkpoint-5000/pytorch_model.bin
Saving model checkpo

TrainOutput(global_step=7197, training_loss=5.878904960942927, metrics={'train_runtime': 2906.8828, 'train_samples_per_second': 9.903, 'train_steps_per_second': 2.476, 'total_flos': 4450503485952000.0, 'train_loss': 5.878904960942927, 'epoch': 3.0})

### Generating the summaries

In [None]:
bos = base_tokenizer.bos_token
eos = base_tokenizer.eos_token
sep = base_tokenizer.sep_token

length_of_summary = 25
test_reviews = [df_val.iloc[i]['combined'].split(sep=sep)[0] for i in range(df_val.shape[0])]
test_reviews = [r + sep for r in test_reviews]
test_summaries = [df_val.iloc[i]['combined'].split(sep=sep)[1] for i in range(df_val.shape[0])]

titles = []
for review in test_reviews[:200]:
  input_ids = base_tokenizer.encode(review, return_tensors='pt').to(device)
  titles.append(base_model.generate(input_ids,
                                    min_length=len(review) + 2,
                                    max_length=len(review) + length_of_summary, 
                                    num_return_sequences=1,
                                    num_beams=2,
                                    no_repeat_ngram_size=2))

In [None]:
final = [base_tokenizer.decode(titles[i][0]).split(sep=sep)[1] for i in range(len(titles))]
final = [r.split(sep=eos)[0] for r in final]

### Saving the results

In [None]:
df_result = pd.DataFrame({'review':test_reviews[:200],'generated_summaries':final,'target_summaries':test_summaries[:200]})
df_result.to_csv('/content/drive/MyDrive/Colab Notebooks/CSC413/GPT_outputs.csv')

### Calculating the rouge scores

In [None]:
metric = load_metric('rouge')

def calc_rouge_scores(candidates, references):
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}
    return result

calc_rouge_scores(final, test_summaries[:200])

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

{'rouge1': 9.2, 'rouge2': 0.9, 'rougeL': 8.3, 'rougeLsum': 8.3}