In [1]:
# Installing required libraries
!pip install numpy pandas faiss-gpu torch transformers sentence_transformers rouge --quiet
!pip install rouge-score
!pip install simplet5
!pip install wikipedia

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
simplet5 0.1.4 requires transformers==4.16.2, but you have transformers 4.42.3 which is incompatible.[0m[31m
Collecting transformers==4.16.2 (from simplet5)
  Using cached transformers-4.16.2-py3-none-any.whl (3.5 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.3
    Uninstalling transformers-4.42.3:
      Successfully uninstalled transformers-4.42.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.0.1 requires transformers<5.0.0,>=4.34.0, but yo

In [2]:
#Importing all the libraries required
import re
import numpy as np
import pandas as pd
import faiss
import torch
from torch.utils.data import DataLoader
from simplet5 import SimpleT5
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, T5Tokenizer, T5ForConditionalGeneration
from rouge import Rouge
from rouge_score import rouge_scorer
import wikipedia
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [3]:
#Reading the dataset(Initial-MovieDataset1)
movies = pd.read_excel('sample_data/MovieDataset1.xlsx')
movies.head()

Unnamed: 0,TITLE,Description
0,21,"Ben Campbell, a mathematics major at the Massa..."
1,10 Things I Hate About You,"Cameron James, a new student at Padua High Sch..."
2,101 Dalmatians(1996),American video game designer Roger Dearly live...
3,12 Angry Men,"On a hot summer day, a jury in the New York Co..."
4,12 Years a Slave,Solomon Northup is a free African-American man...


In [4]:
#Preprocess the text with first splitting the sentence into seperate words and set a maximum number of 1024 words
def preprocess_text(text, max_words=1024):
    if isinstance(text, float) and pd.isnull(text):
        return ''

    # Split text into words and limit to max_words
    words = text.split()[:max_words]

    # Join words back into a single string
    text = ' '.join(words)

    # Remove newline, carriage return, and tab characters
    regex = re.compile(r'[\n\r\t]')
    text = regex.sub(" ", text)

    # Remove extra spaces
    text = re.sub(' +', ' ', text).strip()

    return text

In [5]:
#Apply Preprocessing to Description column
movies['Description'] = movies['Description'].apply(preprocess_text)

In [6]:
#Check if GPU is available or else use CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [7]:
#Training Data
plots = []
for i in movies.Description:
    try:
        k = i.split(" ")
        if 1110 > len(k) > 591:
            plots.append(i)
    except:
        continue

In [8]:
plot_list = movies.Description.tolist()

def split_list(lst, batch):
    for i in range(0, len(lst), batch):
        yield lst[i:i + batch]

batch = 32
batches = list(split_list(plot_list, batch))

In [9]:
#BART model specifications
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-3')
model.to(device)
model.eval()
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-3')

In [10]:
#Generate summaries using BART
all_summaries = []
for batch in batches:
    with torch.no_grad():
        inputs = tokenizer(batch,
                           max_length=1024,
                           padding=True,
                           truncation=True,
                           return_tensors='pt')

        summary_ids = model.generate(inputs['input_ids'].to(device), max_length=500, early_stopping=True).to('cpu')

        all_summaries+= [txt.strip() for txt in tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)]
        del inputs, summary_ids
        torch.cuda.empty_cache()
movies['Plot Summary(bart)'] = all_summaries

In [11]:
#Save generated summaries into an excel file
movies=movies.to_excel('MovieDataset2.xlsx', index=False)

In [16]:
#Display generated bart summaries
op=pd.read_excel("MovieDataset2.xlsx")
op.head()

Unnamed: 0,TITLE,Description,Plot Summary(bart)
0,21,"Ben Campbell, a mathematics major at the Massa...","Ben Campbell, a mathematics major at the Massa..."
1,10 Things I Hate About You,"Cameron James, a new student at Padua High Sch...",Michael Eckman warns sophomore Bianca Stratfor...
2,101 Dalmatians(1996),American video game designer Roger Dearly live...,American video game designer Roger Dearly live...
3,12 Angry Men,"On a hot summer day, a jury in the New York Co...",The 18-year-old boy is accused of killing his ...
4,12 Years a Slave,Solomon Northup is a free African-American man...,Slave trader Theophilus Freeman gives Northup ...


In [20]:
#Generate summaries for movies not in the dataset by referring to wikipedia and using t5 model

import nltk
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge import Rouge
import wikipedia

# Initialize NLTK tokenizer and download punkt if necessary
nltk.download('punkt')

# Initialize T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Initialize Rouge
rouge = Rouge()

def fetch_movie_plot(movie_name):
    try:
        # Directly fetch the page for the movie
        page = wikipedia.page(movie_name)
        content = page.content

        # Find the plot section and extract the content
        plot_section = ""
        start_idx = content.find("== Plot ==")

        if start_idx == -1:
            start_idx = content.find("== Plot==")  # Check for variations in section header

        if start_idx != -1:
            end_idx = content.find("\n==", start_idx + len("== Plot =="))
            if end_idx == -1:
                plot_section = content[start_idx:].strip()  # If end of section not found, take until end of content
            else:
                plot_section = content[start_idx:end_idx].strip()
        else:
            plot_section = content

        return plot_section

    except wikipedia.exceptions.PageError:
        return None  # Return None when page is not found
    except wikipedia.exceptions.DisambiguationError as e:
        # If there are multiple options, you can handle it here (e.g., select the first one)
        try:
            page = wikipedia.page(e.options[0])
            content = page.content

            plot_section = ""
            start_idx = content.find("== Plot ==")

            if start_idx == -1:
                start_idx = content.find("== Plot==")  # Check for variations in section header

            if start_idx != -1:
                end_idx = content.find("\n==", start_idx + len("== Plot =="))
                if end_idx == -1:
                    plot_section = content[start_idx:].strip()  # If end of section not found, take until end of content
                else:
                    plot_section = content[start_idx:end_idx].strip()
            else:
                plot_section = content

            return plot_section

        except wikipedia.exceptions.PageError:
            return None  # Return None when page is not found
        except wikipedia.exceptions.DisambiguationError:
            return None  # Return None for disambiguation error


def generate_summary(content, max_length=1024):
    if content:
        # Tokenize the content into sentences using NLTK
        sentences = sent_tokenize(content)

        # Generate summary using T5
        inputs = tokenizer.encode("summarize: " + content, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(inputs, max_length=900, min_length=100, length_penalty=5.0, num_beams=10)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Extract unique sentences from the generated summary
        unique_summary = []
        for sentence in sent_tokenize(summary):
            if sentence not in unique_summary:
                unique_summary.append(sentence)

        # Join unique sentences back into a summary
        final_summary = '. '.join(unique_summary)

        return final_summary
    else:
        return None  # Return None if content is None


def get_plot_summary(movie_title, op):
    summary = op.loc[op['TITLE'] == movie_title, 'Plot Summary(bart)']
    if not summary.empty:
        content = op.loc[op['TITLE'] == movie_title, 'Description'].values[0]
        return summary.values[0], content  # Return both summary and content
    else:
        wikipedia_summary = fetch_movie_plot(movie_title)
        if wikipedia_summary:
            t5_summary = generate_summary(wikipedia_summary)
            return t5_summary, wikipedia_summary
        else:
            return f"Plot summary for '{movie_title}' not found in dataset or Wikipedia.", None

# Testing and evaluation
# ROUGE-Recall-Oriented Understudy for Gisting Evaluation

# Main loop(user input)
while True:
    movie_title = input("\nEnter title of the movie to summarize (type 'exit' to quit):\n")
    if movie_title.lower() == 'exit':
        print("\nExited")
        break
    else:
        summary, content = get_plot_summary(movie_title, op)
        if isinstance(summary, str):
            print(summary)
        else:
            print(summary)

        if content and summary and isinstance(summary, str):
            # Calculate ROUGE-L scores
            rouge_scores = rouge.get_scores(summary, content, avg=True)

            # Print ROUGE-L scores
            print("\nROUGE-L Scores:")
            print(f"Precision:", rouge_scores['rouge-l']['p'])
            print(f"Recall:", rouge_scores['rouge-l']['r'])
            print(f"F1 Score:", rouge_scores['rouge-l']['f'])
        else:
            print("\nUnable to calculate ROUGE-L scores.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Enter title of the movie to summarize (type 'exit' to quit):
fhtg
Plot summary for 'fhtg' not found in dataset or Wikipedia.

Unable to calculate ROUGE-L scores.

Enter title of the movie to summarize (type 'exit' to quit):
12 Angry Men
The 18-year-old boy is accused of killing his abusive father. A neighbor testified to witnessing the defendant stab his father, from her window, through the windows of a passing train. In a preliminary vote, all jurors vote "guilty" except Juror 8, who believes there should be some discussion before the verdict is made. Juror 3, infuriated, argues with and tries to attack Juror.

ROUGE-L Scores:
Precision: 1.0
Recall: 0.1569767441860465
F1 Score: 0.2713567815742027

Enter title of the movie to summarize (type 'exit' to quit):
chhichhore
anirudh "anni" pathak is a divorced middle-aged man living with his teenage son.. he is awaiting the results of his entrance examination in the hope of enrolling in the iit-jEE.. he deliberately slips off the balcony to