# Google Drive and GitHub

In [35]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
%cd /content/drive/MyDrive/Project/Text_Summarization_using_NLP-Project

/content/drive/MyDrive/Project/Text_Summarization_using_NLP-Project


In [37]:
!ls

 lit.docx  'Project Data Management Plan.docx'	 Reference   textSum.ipynb
 logs	    Project_NLP.ipynb			 results     wandb


In [None]:
# Check the status of the repository
!git status

Refresh index: 100% (40/40), done.
On branch main


In [None]:
# Stage the changes
# !git add textrank.ipynb  # or use !git add . to stage all changes
!git add .

!git config --global user.email "sandrabinu99@gmail.com"
!git config --global user.name "sandrabinu3"

# # Commit the changes with a message
!git commit -m "evaluation"

# # Push the changes to your GitHub repository
# !git push origin main
!git push https://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@github.com/sandrabinu3/Text_Summarization_using_NLP-Project.git


# Libraries



In [None]:
!pip install rouge
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec # Import the Word2Vec class from the gensim library
from tqdm import tqdm
from gensim.models import KeyedVectors
from datasets import Dataset,load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [None]:
# Fetch the NLTK resources needed by the tokenizers, stop-word list and lemmatizer used below.
nltk.download('punkt')  # tokenizer models behind sent_tokenize / word_tokenize
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('punkt_tab')  # presumably the punkt data format newer NLTK releases look for — confirm against the installed version
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer

In [None]:
# Load the test, train and validation splits from CSV.
# The paths are absolute Google Drive locations, so Drive must already be mounted at /content/drive.
test_df = pd.read_csv('/content/drive/MyDrive/Project/test_data.csv')
train_df = pd.read_csv('/content/drive/MyDrive/Project/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Project/validation_data.csv')

In [None]:
# Work on fixed-size random subsets of each split; random_state pins every draw.
# Note that the validation draw uses seed 20 while train/test use seed 33.
# NOTE(review): sample() without replacement fails if a frame has fewer rows than n — confirm the CSV sizes.
train_subset = train_df.sample(n=10000,random_state=33)
test_subset = test_df.sample(n=2000,random_state=33)
val_subset = validation_df.sample(n=100,random_state=20)

# Exploratory Data Analysis

In [None]:
# Check the shape of each dataset
print(f"Train set: {train_df.shape}")
print(f"Test set: {test_df.shape}")
print(f"Validation set: {validation_df.shape}")

In [None]:
#check for the basic informations and null values
print(train_subset.info())

In [None]:
print(test_subset.info())

In [None]:
print(val_subset.info())

In [None]:

print(train_subset.head())

In [None]:
# Word counts (whitespace-separated tokens) of each article and of its reference summary.
# These are stored as extra columns on train_subset and feed the histograms below.
train_subset['article_length'] = train_subset['article'].apply(lambda x: len(x.split()))
train_subset['summary_length'] = train_subset['highlights'].apply(lambda x: len(x.split()))

In [None]:
# Distribution of article lengths, measured in whitespace-separated words.
ax = sns.histplot(train_subset['article_length'], bins=50, kde=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Article length distribution', xlabel='Words per article', ylabel='Number of articles')
plt.show()

In [None]:
# Distribution of reference-summary lengths, measured in whitespace-separated words.
ax = sns.histplot(train_subset['summary_length'], bins=50, kde=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Summary length distribution', xlabel='Words per summary', ylabel='Number of summaries')
plt.show()

In [None]:
# Summary statistics
summary_stats = train_subset['summary_length'].describe()
print(summary_stats)

In [None]:
### If the standard deviation is large (e.g., comparable to the mean), or
### the histogram shows a wide spread of lengths, the summary lengths are inconsistent.

In [None]:
# Shared preprocessing resources, read as globals by preprocess_text_tr and preprocess_text_wv.
stop_words = set(stopwords.words('english'))  # English stop words as a set for fast membership tests
lemmatizer = WordNetLemmatizer()

In [None]:
# Word cloud built from all sampled training articles joined into one string;
# the English stop-word set is passed to WordCloud so those words are left out.
wordcloud = WordCloud(stopwords=stop_words, background_color="white").generate(" ".join(train_subset['article']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes; they carry no meaning for an image
plt.show()

In [None]:
val_subset.head()

# TextRank

Data Preprocessing for TextRank

In [None]:
val_summaries = val_subset['highlights'].tolist()

In [None]:
##Function to preprocess the text
def preprocess_text_tr(text):
    """Normalise a piece of text before TF-IDF vectorisation.

    The text is lower-cased, every non-word character (anything other than
    letters, digits and underscore) is replaced by a space, the result is
    word-tokenized, English stop words are dropped and the remaining tokens
    are lemmatized.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    lowered = text.lower()

    # \W only removes punctuation and symbols; digits and underscores are kept.
    word_chars_only = re.sub(r'\W', ' ', lowered)

    kept_tokens = []
    for token in word_tokenize(word_chars_only):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
# Function to split text into sentences
def split_into_sentences(text):
    """Return the sentences of ``text`` as produced by NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)

TextRank Model Implementation

In [None]:

# TextRank Summarization Function
def textrank_summarizer(text, top_n=3):
    """Build an extractive summary of ``text`` with TextRank.

    Sentences are cleaned, vectorised with TF-IDF, connected in a graph whose
    edge weights are pairwise cosine similarities, and scored with PageRank.

    Args:
        text: Document to summarise.
        top_n: Maximum number of sentences to keep.

    Returns:
        The ``top_n`` highest-scoring sentences joined by single spaces, in
        descending score order. ``text`` is returned unchanged when it holds
        at most one sentence. When no sentence has a usable token left after
        cleaning, the first ``top_n`` sentences are returned instead.
    """
    sentences = split_into_sentences(text)

    # Nothing to rank with zero or one sentence.
    if len(sentences) <= 1:
        return text

    cleaned_sentences = [preprocess_text_tr(sentence) for sentence in sentences]

    # TfidfVectorizer raises ValueError (empty vocabulary) when every cleaned
    # sentence is empty or holds only tokens it ignores. Fall back to the
    # leading sentences rather than aborting the whole evaluation run.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(cleaned_sentences)
    except ValueError:
        return " ".join(sentences[:top_n])

    # Pairwise sentence similarity becomes the weighted adjacency matrix of the graph.
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)

    # PageRank gives one score per node, keyed by sentence index.
    scores = nx.pagerank(nx_graph)

    # Highest score first; ties fall back to comparing the sentence strings.
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    summary = " ".join(sentence for _, sentence in ranked_sentences[:top_n])

    return summary


TextRank Evaluation



In [None]:
# Run summarization on validation set for initial evaluation
textrank_val_summaries = [textrank_summarizer(article) for article
                          in val_subset['article']]


In [None]:
rouge = Rouge()
scores = rouge.get_scores(textrank_val_summaries,val_summaries,avg=True)
print("TextRank ROUGE Scores:")
scores

# Word2Vec

Data Preprocessing for Word2Vec

In [None]:
def preprocess_text_wv(text):
    """Split ``text`` into sentences and tokenize each one for Word2Vec.

    Sentence boundaries are detected on the raw text *before* punctuation is
    stripped. Stripping first removes the full stops that ``sent_tokenize``
    relies on, which collapses the whole document into a single sentence.

    Args:
        text: Raw document or sentence.

    Returns:
        A ``(sentences, processed_sentences)`` tuple of equal length.
        ``sentences`` holds each sentence with non-word characters replaced by
        spaces; ``processed_sentences`` holds the matching list of lower-cased,
        stop-word-free, lemmatized tokens.
    """
    sentences = []
    processed_sentences = []

    for raw_sentence in sent_tokenize(text):
        # Basic cleaning, applied per sentence so the boundaries survive.
        cleaned = re.sub(r'\W', ' ', raw_sentence)

        # Word tokenization, stop-word removal and lemmatization.
        words = word_tokenize(cleaned.lower())
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        sentences.append(cleaned)
        processed_sentences.append(words)

    return sentences, processed_sentences

In [None]:
# Preprocess every sampled training article; each result is a (sentences, token_lists) tuple.
processed_w2v = [preprocess_text_wv(article) for article in tqdm(train_subset['article'])]
# Flatten to a single list of token lists, which is what Word2Vec is trained on below.
tokenized_sentences = [tokens for _, tokenized in processed_w2v for tokens in tokenized]

Word2Vec Training and Implementation

In [None]:
# Train skip-gram (sg=1) embeddings: 50-dimensional vectors, context window of 5,
# dropping tokens that occur fewer than 5 times.
# NOTE(review): no seed is passed and 5 worker threads are used, so results are presumably not reproducible run to run — confirm.
w_model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=5,min_count=5,workers=5, sg=1)

In [None]:
def sentence_to_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary Word2Vec vectors.

    Args:
        sentence: Raw sentence text.
        model: Trained gensim ``Word2Vec`` model (``.wv`` and ``.vector_size`` are used).

    Returns:
        A 1-D array of length ``model.vector_size``; all zeros when none of the
        sentence's tokens is in the model vocabulary (or the sentence is empty).
    """
    # preprocess_text_wv returns (sentence_strings, token_lists). Only the token
    # lists hold individual words. Flattening the tuple itself, as was done
    # before, looks up whole sentences in the vocabulary and almost always
    # falls through to the zero vector.
    _, token_lists = preprocess_text_wv(sentence)
    tokens = [token for sentence_tokens in token_lists for token in sentence_tokens]

    # Keep only tokens the model has an embedding for.
    word_vectors = [model.wv[token] for token in tokens if token in model.wv.key_to_index]

    if word_vectors:
        return np.mean(word_vectors, axis=0)

    # No known word: fall back to a zero vector of the right size.
    return np.zeros(model.vector_size)

In [None]:
def calculate_sentence_similarity(sentences, model):
    """Return the pairwise cosine-similarity matrix of ``sentences``.

    Each sentence is embedded with ``sentence_to_vector`` and the embeddings
    are compared with one another, so entry ``[i][j]`` relates sentence ``i``
    to sentence ``j``.
    """
    embeddings = []
    for sentence in sentences:
        embeddings.append(sentence_to_vector(sentence, model))
    stacked = np.array(embeddings)

    return cosine_similarity(stacked)

In [None]:
def extractive_summary(text, model, top_n=3):
    """Select the ``top_n`` sentences most similar to the first sentence of ``text``.

    Args:
        text: Document to summarise.
        model: Trained Word2Vec model used to embed the sentences.
        top_n: Maximum number of sentences to return.

    Returns:
        The selected sentences joined by single spaces, most similar first
        (the first sentence ranks against itself). An empty string when
        ``text`` contains no sentence.
    """
    # Use the NLTK sentence tokenizer instead of a bare split on '.': the split
    # drops the terminal punctuation, cuts at every period (decimals,
    # abbreviations) and yields empty fragments that can end up in the summary.
    sentences = sent_tokenize(text)
    if not sentences:
        return ''

    similarity_matrix = calculate_sentence_similarity(sentences, model)

    # Row 0 holds the similarity of every sentence to the first one, which is
    # used as the reference for the whole document.
    similarity_scores = similarity_matrix[0]

    # Rank sentence indices by that similarity, highest first.
    sorted_similarities = sorted(enumerate(similarity_scores), key=lambda x: x[1], reverse=True)

    top_sentences = [sentences[idx] for idx, _ in sorted_similarities[:top_n]]
    return ' '.join(top_sentences)


Word2Vec Evaluation

In [None]:
# Summarise every validation article with the Word2Vec extractor (top 3 sentences each)
validation_summaries_wv = [
    extractive_summary(article_text, w_model, top_n=3)
    for article_text in val_subset['article']
]

In [None]:
# Add the summary to the validation DataFrame
val_subset['summary_wv'] = validation_summaries_wv

In [None]:
val_summaries = val_subset['highlights'].tolist()
w2v_val_summaries = val_subset['summary_wv'].tolist()

In [None]:
rouge = Rouge()
w2v_scores = rouge.get_scores(w2v_val_summaries, val_summaries,avg=True)
w2v_scores

# T5

Data Preprocessing for T5

In [None]:
# Convert the pandas subsets to Hugging Face Datasets for tokenization and training.
# Note: train_subset also carries the article_length / summary_length columns added during EDA.
train_dataset = Dataset.from_pandas(train_subset)
test_dataset = Dataset.from_pandas(test_subset)

In [None]:
def preprocess_data_t5(data, tokenizer, max_input_length=512, max_target_length=128, prefix="summarize: "):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Args:
        data: Batch mapping with an ``"article"`` and a ``"highlights"`` entry,
            each a sequence of strings.
        tokenizer: Callable tokenizer; invoked with the texts plus
            ``max_length`` and ``truncation=True``, and expected to return a
            mapping that contains ``"input_ids"``.
        max_input_length: Truncation length for the prefixed articles.
        max_target_length: Truncation length for the target summaries.
        prefix: Task prefix prepended to every article. The default keeps the
            previous hard-coded behaviour; inference must use the same prefix.

    Returns:
        The tokenizer output for the articles, with the target token ids
        stored under ``"labels"``.
    """
    # The prefix tells the model which task to perform.
    inputs = [prefix + doc for doc in data["article"]]
    targets = data['highlights']

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    # The target token ids are what the model is trained to produce.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Load tokenizer and model
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Tokenize both splits in batches; the tokenizer reaches preprocess_data_t5 as a keyword argument
map_options = {"batched": True, "fn_kwargs": {"tokenizer": tokenizer}}
train_dataset = train_dataset.map(preprocess_data_t5, **map_options)
test_dataset = test_dataset.map(preprocess_data_t5, **map_options)


In [None]:
# Columns the model consumes; everything else stays out of the formatted output
columns = ['input_ids', 'attention_mask', 'labels']

# Expose those columns as torch tensors on both the train and the test dataset
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type="torch", columns=columns)

T5 Initialization and Fine-Tuning

In [None]:
# Define data collator
# Pass the model object, not its name string: the collator uses the model to
# derive decoder_input_ids from the labels, and a plain string silently disables that.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model)

In [None]:
# Define the training arguments for fine-tuning the model
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/Project/t5-small-summarizer",  # Directory to save model checkpoints and results
    eval_strategy="epoch",  # Evaluate at the end of each training epoch
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=18,  # Batch size for training on each device (GPU/CPU)
    per_device_eval_batch_size=18,  # Batch size for evaluation on each device
    num_train_epochs=3,  # Number of passes over the training dataset
    save_steps=500,  # Number of steps between model checkpoint saves
    save_total_limit=2,  # Keep at most this many checkpoints to limit storage usage
    predict_with_generate=True,  # Use the model's text generation for predictions (for Seq2Seq tasks like summarization)
    fp16=True,  # Mixed precision (half-precision floats) to reduce memory usage and speed up training
    weight_decay=0.01,  # Weight decay (L2 regularization) to limit overfitting
    metric_for_best_model="loss")  # Metric for comparing checkpoints. NOTE(review): per the Trainer docs this is used together with load_best_model_at_end, which is not set here — confirm the intent


In [None]:
# Initialize Trainer
# NOTE(review): eval_dataset is the *test* split, so the per-epoch evaluation runs on test data,
# while the validation subset is only used for the ROUGE comparison later in the notebook — confirm this is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator)

In [None]:
trainer.train()

T5 Evaluation

In [None]:
# Reload tokenizer and weights from a saved fine-tuning checkpoint on Drive.
# model_name is rebound here from the hub id ("t5-small") to a local checkpoint path.
# NOTE(review): with the settings in this notebook (10,000 training rows, batch size 18, 3 epochs) a
# single-device run would finish well before step 5000, so this checkpoint presumably comes from a
# different run — confirm the path matches the model being evaluated.
model_name='/content/drive/MyDrive/Project/t5-small-summarizer/checkpoint-5000'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Generate one summary per validation article with the fine-tuned model
generated_summaries = []

for i in range(len(val_subset)):
    article = val_subset.iloc[i]['article']

    # Prepend the same task prefix the model was fine-tuned with (see preprocess_data_t5);
    # without it the inference input does not match the training input format.
    inputs = tokenizer("summarize: " + article, max_length=512, truncation=True, return_tensors='pt')

    # NOTE: do_sample=True makes generation stochastic, so ROUGE varies between runs unless a seed is fixed.
    summary_ids = model.generate(
        **inputs,
        max_length=150,
        num_beams=5,
        do_sample=True,
        temperature=1.2,  # Above 1: flattens the distribution, i.e. more randomness
        top_k=100,         # Sample only from the 100 most probable tokens
        top_p=0.95,        # Restrict sampling to the top 95% of probability mass
        repetition_penalty=1.1,  # Penalizes token repetition
        early_stopping=True)

    # Decode the first returned sequence and collect it
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(summary)


In [None]:
val_summaries = val_subset['highlights'].tolist()
gen_summaries = generated_summaries

In [None]:
rouge = Rouge()
t5_scores=rouge.get_scores(gen_summaries,val_summaries,avg=True)
t5_scores

In [None]:
gen_summaries[9]

In [None]:
val_summaries[9]

In [None]:
val_subset.iloc[9]['article']

# **Comparison and Analysis**




In [None]:
# Combine all scores into a single dictionary
rouge_scores = {
    'TextRank': scores,
    'Word2Vec': w2v_scores,
    'T5': t5_scores
}

In [None]:
# Print the combined dictionary
print(rouge_scores)

In [None]:
# Convert to DataFrame: one row per (model, metric), one column per score type
df = pd.DataFrame.from_dict({(model_label, metric): values
                             for model_label, metric_scores in rouge_scores.items()
                             for metric, values in metric_scores.items()})

df = df.transpose()
df = df.reset_index()

# Rename by key instead of by position: the order of the 'r'/'p'/'f' entries
# depends on the dictionaries returned by the rouge package, so a positional
# rename could silently swap Recall and F1-Score.
df = df.rename(columns={'level_0': 'Model', 'level_1': 'Metric',
                        'r': 'Recall', 'p': 'Precision', 'f': 'F1-Score'})
df = df[['Model', 'Metric', 'Recall', 'Precision', 'F1-Score']]
df

In [None]:
# Flatten the nested scores into one row per (model, metric, score type).
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: the previous loop variable `scores` overwrote the global TextRank
# `scores`, which corrupted rouge_scores if the comparison cells were re-run.
rows = [
    {
        'Model': model_label,
        'Metric': metric_name,
        'Score Type': score_type,
        'Value': value
    }
    for model_label, model_metrics in rouge_scores.items()
    for metric_name, metric_scores in model_metrics.items()
    for score_type, value in metric_scores.items()
]

# Creating the DataFrame
df = pd.DataFrame(rows)


In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(data=df, x='Metric', y='Value', hue='Model', errorbar=None, palette='viridis')
plt.title('ROUGE Scores by Metric and Model', fontsize=14)
plt.ylabel('Score', fontsize=12)
plt.xlabel('ROUGE Metric', fontsize=12)
plt.legend(title='Model')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Separate the data by Metric and plot each as a bar plot
metrics = ['rouge-1', 'rouge-2', 'rouge-l']
for metric in metrics:
    plt.figure(figsize=(8, 6))
    sns.barplot(
        data=df[df['Metric'] == metric],
        x='Score Type', y='Value', hue='Model', palette='Set2'
    )
    plt.title(f'Comparison of {metric.upper()} Scores by Model', fontsize=16)
    plt.ylabel('Score', fontsize=12)
    plt.xlabel('Score Type (Recall, Precision, F1)', fontsize=12)
    plt.legend(title='Model', loc='upper center')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()