In [1]:
#Install required library
!pip install numpy pandas faiss-gpu torch transformers sentence_transformers rouge --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#Import required libraries
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer

np.random.seed(0)
pd.set_option("max_colwidth", 100)

In [5]:
!pip install openpyxl

# Load the title and plot-description columns, report the row count,
# and preview two randomly sampled rows.
movies = pd.read_excel(
    'sample_data/demo1.xlsx',
    usecols=['TITLE', 'Description'],
)

print(f"Plots of {len(movies.index)} movies!")
movies.sample(2)

Plots of 1287 movies!


Unnamed: 0,TITLE,Description
655,Kung Fu Panda,"In the Valley of Peace, a land in Ancient China inhabited by anthropomorphic animals, a giant pa..."
330,Don't Look Now,Some time after the drowning of their young daughter Christine in an accident at their English c...


In [6]:
#Preprocessing/cleaning the dataset
def clean_text(text, max_words=1024):
    """Truncate ``text`` to ``max_words`` words and normalise its whitespace.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas
            represents an empty cell) are treated as an empty string.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The kept words joined by single spaces, with no leading or trailing
        whitespace and no control characters such as ``\\n``, ``\\r`` or ``\\t``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # str.split() with no separator splits on every run of whitespace
    # (spaces, tabs, newlines, carriage returns) and discards empty pieces,
    # so re-joining with single spaces already yields fully normalised text;
    # no further regex substitution or strip() is required.
    return ' '.join(text.split()[:max_words])

In [7]:
clean_text("This  is a trial \r\n for   preprocessing or    cleaning.")

'This is a trial for preprocessing or cleaning.'

In [8]:
# Replace missing descriptions with an empty string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the chained in-place form acts on an
# intermediate object and does not update `movies` under pandas Copy-on-Write.
movies['Description'] = movies['Description'].fillna('')

# Minimal cleaning helper
def clean(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Placeholder for further cleaning steps (e.g. lowercasing, removing
    punctuation), which can be added here as needed.

    NOTE(review): not called in the visible cells; the 'Description' column
    is cleaned with ``clean_text`` instead.
    """
    return text.strip()

# Run every description through clean_text and store the result back in place
movies['Description'] = movies['Description'].map(clean_text)

In [9]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [10]:
# Define the summarization model, i.e. DistilBART, and its tokenizer.
# BartTokenizer and BartForConditionalGeneration are already imported in the
# notebook's import cell, so they are not re-imported here.
# The checkpoint id is named once so the model and tokenizer always match.
SUMMARIZER_CHECKPOINT = 'sshleifer/distilbart-cnn-12-3'

model = BartForConditionalGeneration.from_pretrained(SUMMARIZER_CHECKPOINT)
model.to(device)
model.eval()  # switch to evaluation mode; the notebook only runs inference

tokenizer = BartTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

In [11]:
# Sample plot text used to demo the summarizer on a single input
SAMPLE_PLOT = """Albus Dumbledore, Minerva McGonagall, and Rubeus Hagrid, professors of Hogwarts School of Witchcraft and Wizardry, deliver an orphaned infant named Harry Potter to his only remaining relatives, the Dursleys. Ten years later, Harry has been battling a disjointed life with the Dursleys, inadvertently causing an accident during a family outing, and begins receiving unsolicited letters by owls. Finally, Hagrid re-appears, and informs Harry that he is actually a wizard, and has been accepted into Hogwarts, against the Dursleys' wishes. He also tells Harry of the latter's past; Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse, with Harry being the only survivor in the chaos thus, leading to his fame in the wizarding world as "The Boy Who Lived". Hagrid takes Harry to Diagon Alley to purchase school supplies, then takes him to King's Cross station to board a train to the school. While on the train, Harry meets Ron Wea"""

In [12]:
# Tokenize the sample plot (inputs longer than 1024 tokens are truncated)
inputs = tokenizer([SAMPLE_PLOT],
                   max_length=1024,
                   padding=True,
                   truncation=True,
                   return_tensors='pt')

# Generate Summary (max 128 tokens).
# The attention mask produced by the tokenizer is passed explicitly so that
# any padded positions are ignored instead of relying on generate() to infer
# the mask from the pad token id.
summary_ids = model.generate(inputs['input_ids'].to(device),
                             attention_mask=inputs['attention_mask'].to(device),
                             max_length=128,
                             early_stopping=True)

# Turn the generated token ids back into text
summaries = tokenizer.batch_decode(summary_ids,
                                   skip_special_tokens=True,
                                   clean_up_tokenization_spaces=True)

In [13]:
#Display the generated summary
summaries[0]

' Albus Dumbledore, Minerva McGonagall and Rubeus Hagrid deliver an orphaned infant named Harry Potter to his only remaining relatives, the Dursleys. Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, with Harry being the only survivor in the chaos. Hagrid re-appears and tells Harry that he is actually a wizard, and has been accepted into Hogwarts.'

In [14]:
# Pull the cleaned descriptions out of the frame as a plain Python list
plot_list = movies['Description'].tolist()

def split_list(lst, n):
    """Split ``lst`` into consecutive chunks of at most ``n`` items.

    Args:
        lst: Any sliceable sequence (list, tuple, str, ...).
        n: Chunk size; must be a positive integer.

    Returns:
        A generator yielding ``lst[0:n]``, ``lst[n:2n]``, ...; the final chunk
        may be shorter. An empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``n`` is not positive. This is checked when the
            function is called, so a bad chunk size fails immediately rather
            than silently producing no chunks (negative ``n``) or failing
            only once iteration starts (``n == 0``).
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return (lst[i:i + n] for i in range(0, len(lst), n))

# Number of plots grouped into each batch
n = 32
batches = [*split_list(plot_list, n)]

In [15]:
#Generate summaries for all movies in the dataset
all_summaries = []
with torch.no_grad():  # inference only; entered once for the whole loop
    for batch in batches:
        # tokenize; padding=True pads each plot to the longest one in the batch
        inputs = tokenizer(batch,
                           max_length=1024,
                           padding=True,
                           truncation=True,
                           return_tensors='pt')

        # generate summary (max 128 tokens).
        # The attention mask is passed explicitly so the padding added above
        # is ignored, instead of relying on generate() to infer the mask.
        summary_ids = model.generate(inputs['input_ids'].to(device),
                                     attention_mask=inputs['attention_mask'].to(device),
                                     max_length=128,
                                     early_stopping=True).to('cpu')

        decoded = tokenizer.batch_decode(summary_ids,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=True)
        all_summaries.extend(txt.strip() for txt in decoded)

        # release this batch's tensors before the next iteration
        del inputs, summary_ids
        torch.cuda.empty_cache()

movies['PlotSummary'] = all_summaries

In [16]:
#Save the generated summaries into an excel file
movies.to_excel('summarized_dataset.xlsx', index=False)

In [17]:
#Display the starting part of the file
movies.head()

Unnamed: 0,TITLE,Description,PlotSummary
0,21,"Ben Campbell, a mathematics major at the Massachusetts Institute of Technology, is accepted into...","Ben Campbell, a mathematics major at the Massachusetts Institute of Technology, is accepted into..."
1,10 Things I Hate About You,"Cameron James, a new student at Padua High School in the Seattle area, immediately becomes smitt...",Michael Eckman warns sophomore Bianca Stratford that her overprotective father Walter does not a...
2,101 Dalmatians(1996),"American video game designer Roger Dearly lives with his pet Dalmatian Pongo in London. One day,...",American video game designer Roger Dearly lives with his pet Dalmatian Pongo in London. Her boss...
3,12 Angry Men,"On a hot summer day, a jury in the New York County Courthouse prepares to deliberate the case of...",The 18-year-old boy is accused of killing his abusive father. A neighbor testified to witnessing...
4,12 Years a Slave,"Solomon Northup is a free African-American man in 1841, working as a violinist and living with h...","Slave trader Theophilus Freeman gives Northup the identity of ""Platt"", a runaway slave from Geor..."
