In [None]:
#check if highRAM is being used
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install seaborn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 15.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 11.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 33

In [None]:
#import libararies
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from sklearn.utils import shuffle
from google.colab import drive
from tqdm import tqdm
import regex as re
import string
import nltk
import pickle
import datasets
from datasets import Dataset, DatasetDict


In [None]:
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
#load the data
df=pd.read_csv("/content/drive/MyDrive/NLP_Project/final_imdb_movies.csv")

In [None]:
df.head()

Unnamed: 0,Name,link,summary,storyline
0,The Shawshank Redemption,/title/tt0111161/,Two imprisoned men bond over a number of years...,Chronicles the experiences of a formerly succe...
1,The Godfather,/title/tt0068646/,The aging patriarch of an organized crime dyna...,"The Godfather ""Don"" Vito Corleone is the head ..."
2,Pulp Fiction,/title/tt0110912/,"The lives of two mob hitmen, a boxer, a gangst...",Jules Winnfield (Samuel L. Jackson) and Vincen...
3,The Godfather Part II,/title/tt0071562/,The early life and career of Vito Corleone in ...,The continuing saga of the Corleone crime fami...
4,Inception,/title/tt1375666/,A thief who steals corporate secrets through t...,"Dom Cobb is a skilled thief, the absolute best..."


In [None]:
# print(df["link"].nunique(), df["link"].count())
# df['link'].value_counts()

786 787


In [None]:
#drop any duplicates
df.drop_duplicates(inplace=True)

In [None]:
#load punctuations
punctuations = set(string.punctuation)

In [None]:
# remove special characters, digits, html tags ans punctuations
def preprocess(text):
    text = text.lower() #converting capitals to lower case
    text = text.strip()
    text= re.sub(r'\—.*$', " ", text)
    text=re.sub(r'\([^)]*\)', '', text)
    text = re.sub("<.*?>", " ", text) #removing html tags
    text = re.sub(r'[^\w\s]',' ',text) #remove special characters
    text = re.sub(" \d+", " ",text) # removing digits
    text = re.sub("\n", "", text) #removing \n
    text = re.sub("\t", "", text) #removing \t
    text = ''.join(ch for ch in text if ch not in punctuations)
    return text

In [None]:
df["summary"] = df["summary"].apply(preprocess)

In [None]:
df["storyline"] = df["storyline"].apply(preprocess)

In [None]:
#split data
train, validate, test = np.split(df.sample(frac=1, random_state=10), [int(.6*len(df)), int(.8*len(df))])

In [None]:
# tokenizer_pre = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
# model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')


Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

In [None]:
# tokens = tokenizer_pre(train['storyline'][23], truncation = True, padding = 'longest', return_tensors = 'pt')
# summary = model.generate(**tokens)
# tokenizer_pre.decode(summary[0])

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [None]:
print(len(train), len(validate), len(test))

471 157 158


In [None]:
#convert df to Dataset type
train_data = Dataset.from_pandas(train)
validate_data = Dataset.from_pandas(validate)
test_data = Dataset.from_pandas(test)

In [None]:
#Data class
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

In [None]:
# tokenizer using pegasus
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  print(model_name)
  tokenizer = PegasusTokenizer.from_pretrained(model_name)


  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer

In [None]:
# def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
#   """
#   Prepare configurations and base model for fine-tuning
#   """
#   torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
#   model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

#   if freeze_encoder:
#     for param in model.model.encoder.parameters():
#       param.requires_grad = False

#   if val_dataset is not None:
#     training_args = TrainingArguments(
#       output_dir=output_dir,           # output directory
#       num_train_epochs=2000,           # total number of training epochs
#       per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
#       per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
#       save_steps=500,                  # number of updates steps before checkpoint saves
#       save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
#       evaluation_strategy='steps',     # evaluation strategy to adopt during training
#       eval_steps=100,                  # number of update steps before evaluation
#       warmup_steps=500,                # number of warmup steps for learning rate scheduler
#       weight_decay=0.01,               # strength of weight decay
#       logging_dir='./logs',            # directory for storing logs
#       logging_steps=10,
#     )

#     trainer = Trainer(
#       model=model,                         # the instantiated 🤗 Transformers model to be trained
#       args=training_args,                  # training arguments, defined above
#       train_dataset=train_dataset,         # training dataset
#       eval_dataset=val_dataset,            # evaluation dataset
#       tokenizer=tokenizer
#     )

#   else:
#     training_args = TrainingArguments(
#       output_dir=output_dir,           # output directory
#       num_train_epochs=2000,           # total number of training epochs
#       per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
#       save_steps=500,                  # number of updates steps before checkpoint saves
#       save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
#       warmup_steps=500,                # number of warmup steps for learning rate scheduler
#       weight_decay=0.01,               # strength of weight decay
#       logging_dir='./logs',            # directory for storing logs
#       logging_steps=10,
#     )

#     trainer = Trainer(
#       model=model,                         # the instantiated 🤗 Transformers model to be trained
#       args=training_args,                  # training arguments, defined above
#       train_dataset=train_dataset,         # training dataset
#       tokenizer=tokenizer
#     )

#   return trainer

In [None]:
# train_data = Dataset.from_pandas(train)
# validate_data = Dataset.from_pandas(validate)
# test_data = Dataset.from_pandas(test)
  
train_storyline, train_summary = train_data['storyline'], train_data['summary']
val_storyline, val_summary = validate_data['storyline'], train_data['summary']
test_storyline, test_summary = test_data['storyline'], test_data['summary']
  


In [None]:
#Frees up unused memory 
import gc
torch.cuda.empty_cache()
gc.collect()

15

In [None]:
# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-xsum'
train_dataset, val_datatset, test_datatset, tokenizer = prepare_data(model_name, train_storyline, train_summary, )
# trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
# trainer.train()

google/pegasus-xsum


Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

In [None]:
#load the pre-trained model
model = PegasusForConditionalGeneration.from_pretrained("/content/drive/MyDrive/results/checkpoint-1000/")

In [None]:
test_data[0]

{'Name': 'The Final Destination',
 'link': '/title/tt1144884/',
 'summary': 'a horrifying premonition saves a young man and his friends from death during a racetrack accident but terrible fates await them nonetheless ',
 'storyline': 'while watching a car race at mckinley speedway  nick o bannon has a premonition of a car crash that will result in many casualties  including several people that are in the audience  nick convinces his girlfriend lori  along with his friends hunt and janet to leave  a security guard named george lanter  along with a racist named carter  a mother and her two sons  and several other people follow nick out  shortly after they leave  nick s premonition comes true  when survivors start dying  nick  his friends  and george must try to find the remaining survivors and save them from death before it is too late   ',
 '__index_level_0__': 350}

In [None]:
#predict summary
from tqdm import tqdm  as tqdm
import time

def predict_summary(test_storyline):
  start_time = time.time()
  predictions = []
  for i,test_story in  tqdm(enumerate(test_storyline)):
    tokens = tokenizer(test_story, truncation=True, padding="longest", return_tensors="pt")
    summary_ = model.generate(**tokens)

    pred = tokenizer.decode(summary_[0])    
    predictions.append(pred)
  return predictions


In [None]:
#  for i, test_story in  enumerate(test_25_storyline):
#    print(test_story)
#    break

in this first sequel to the return of the living dead  a group of kids discover one of the drums containing a rotting corpse and release the    trioxen gas into the air  causing the dead to once again rise from the grave and seek out brains   


In [None]:
import random

test_size = len(test_data)
percent_25_len =  int(100 * int(25)/int(test_size))

test_25_sample = test.sample(percent_25_len)
test_25_sample_data = Dataset.from_pandas(test_25_sample)
test_25_storyline, test_25_summary = test_25_sample_data['storyline'], test_25_sample_data['summary']




In [None]:
predicted_summaries = predict_summary(test_25_storyline)



In [None]:
!pip install sumeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sumeval
  Downloading sumeval-0.2.2.tar.gz (80 kB)
[K     |████████████████████████████████| 80 kB 3.2 MB/s 
[?25hCollecting plac>=0.9.6
  Downloading plac-1.3.5-py2.py3-none-any.whl (22 kB)
Collecting sacrebleu>=1.3.2
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 26.4 MB/s 
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting portalocker
  Downloading portalocker-2.6.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: sumeval
  Building wheel for sumeval (setup.py) ... [?25l[?25hdone
  Created wheel for sumeval: filename=sumeval-0.2.2-py3-none-any.whl size=54549 sha256=04473a5fee3c98bf8431521c4945ab74a19aaaddc6368e9dbfd521fa037f57e2
  Stored in directory: /root/.cache/pip/wheels/6a/e8/4e/76111a2e023408af67380b35a6a910763432fc0afe20348f17
Successfully built sum

In [None]:
from sumeval.metrics.rouge import RougeCalculator
import numpy as np


rouge = RougeCalculator(stopwords=True, lang="en")
Rouge_1 = []
Rouge_2 = []
Rouge_l = []

for i in range(len(predicted_summaries)):

    rouge_1 = rouge.rouge_n(
    summary=predicted_summaries[i],
    references=test_summary[i],
    n=1)
    
    Rouge_1.append(rouge_1)
 
 
 ## rogue_2

    rouge_2 = rouge.rouge_n(
     summary=predicted_summaries[i],
     references=test_summary[i],
    n=2)
    Rouge_2.append(rouge_2)
 ## rogue_l

    rouge_l = rouge.rouge_l(
     summary=predicted_summaries[i],
     references=test_summary[i])
    Rouge_l.append(rouge_l)

print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}".format(np.round(sum(Rouge_1)/len(Rouge_1),3),
 np.round(sum(Rouge_2)/len(Rouge_2),3),
 np.round(sum(Rouge_l)/len(Rouge_l),3)
))


ROUGE-1: 0.007, ROUGE-2: 0.0, ROUGE-L: 0.007


In [None]:
dict_predicted = {'predicted': predicted_summaries, 'actual': test_25_summary}
df_predicted = pd.DataFrame.from_dict(dict_predicted)
df_predicted.to_csv('/content/drive/MyDrive/results/pegasus_25_predictions_checkpoint_1000.csv')  

In [None]:
df_predicted.head()

Unnamed: 0,predicted,actual
0,<pad>a group of kids discover one of the drums...,curious kids unearth the barrels that previous...
1,<pad>while watching a car race at mckinley spe...,a horrifying premonition saves a young man and...
2,<pad>four middle aged men decide to take a roa...,a group of suburban biker wannabes looking for...
3,<pad>a woman suffering from the loss of her so...,after being told that their children never exi...
4,<pad>a clown fish loses his son nemo after he ...,after his son is captured in the great barrier...


In [None]:

!pip install gensim
import gensim
from gensim.models import Word2Vec
import numpy as np
model_gn = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP_Project/GoogleNews-vectors-negative300.bin.gz', binary= True)
     

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#calculate wmd score
wmd = []
stop_words = stopwords.words('english')
for i in range(len(predicted_summaries)):

  print ("\n\nSummary of the algorrithm : \n",df_predicted['predicted'][i][0:len(df_predicted['predicted'][i])//2])
  print(df_predicted['predicted'][i][len(df_predicted['predicted'][i])//2:-1])
  print('The size of the resulting summary',len(df_predicted['predicted'][i]))


  print ("\n\nHuman Summary : \n",df_predicted['actual'][i][0:len(df_predicted['actual'][i])//2])
  print(df_predicted['actual'][i][len(df_predicted['actual'][i])//2:-1])
  print('The size of the Human Summary',len(df_predicted['actual'][i]))

  summary_algo = [w for w in df_predicted['predicted'][i].lower().split() if w not in stop_words]
  summary_human = [w for w in df_predicted['actual'][i] if w not in stop_words]

  # distance = model_gn.wmdistance(summary_algo, summary_human)
  distance = model_gn.wmdistance(df_predicted['predicted'][i], df_predicted['actual'][i])
  
  print("\nDistance between the summary and the Human Summary : ",distance)
  print("-"*200)
  wmd.append(distance)




Summary of the algorrithm : 
 <pad>a group of kids discover one of the drums containing a rotting corpse and release the trio
xen gas into the air causing the dead to once again rise from the grave and seek out brains</s
The size of the resulting summary 190


Human Summary : 
 curious kids unearth the barrels that previously helped revi
ve the dead  which proves the second time s an undead charm
The size of the Human Summary 120

Distance between the summary and the Human Summary :  0.6724638911885895
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Summary of the algorrithm : 
 <pad>while watching a car race at mckinley speedway nick o bannon has
 a premonition of a car crash that will result in many casualties</s
The size of the resulting summary 138


Human Summary : 
 a horrifying premonition saves a young man and his friends f

In [None]:
np.mean(wmd)


0.6684028955765523

In [None]:
tokens = tokenizer(test_storyline[0], truncation=True, padding="longest", return_tensors="pt")
summary_ = model.generate(**tokens)
tokenizer.decode(summary_[0])

'<pad>while watching a car race at mckinley speedway nick o bannon has a premonition of a car crash that will result in many casualties</s>'

In [None]:
test_storyline[0]

'while watching a car race at mckinley speedway  nick o bannon has a premonition of a car crash that will result in many casualties  including several people that are in the audience  nick convinces his girlfriend lori  along with his friends hunt and janet to leave  a security guard named george lanter  along with a racist named carter  a mother and her two sons  and several other people follow nick out  shortly after they leave  nick s premonition comes true  when survivors start dying  nick  his friends  and george must try to find the remaining survivors and save them from death before it is too late   '

In [None]:
test_summary[0]

'a horrifying premonition saves a young man and his friends from death during a racetrack accident but terrible fates await them nonetheless '