In [2]:
import re
import string
import nltk
from rouge import Rouge
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the data
# Only the first N_ARTICLES rows are used. Reading them with `nrows` (instead of
# loading everything and slicing with df[0:100]) gives an independent DataFrame,
# so adding columns to it later does not raise pandas' SettingWithCopyWarning,
# and the rest of the file is never parsed.
N_ARTICLES = 100
df = pd.read_csv('news_summary.csv', encoding='ISO-8859-1', nrows=N_ARTICLES)

In [4]:
# Load the pretrained tokenizer and the seq2seq model.
# A single constant keeps both objects pointed at the same checkpoint name.
MODEL_NAME = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [5]:
# spaCy pipeline used for lemmatization in preprocess_text.
nlp = spacy.load('en_core_web_sm')

# NLTK corpora are not bundled with the package: on a machine without them
# stopwords.words() raises LookupError. Download on demand so the notebook
# survives a fresh environment, without touching the network when the data
# is already installed.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Same on-demand check for the tokenizer models behind nltk.word_tokenize,
# which preprocess_text calls. The resource is named 'punkt' in older NLTK
# releases and 'punkt_tab' in newer ones, so request both when the probe fails.
try:
    nltk.word_tokenize('resource check')
except LookupError:
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

def preprocess_text(text):
    """Normalize raw article text into a space-separated string of lemmas.

    Steps, in order: strip HTML tags and URLs, lowercase and tokenize with
    NLTK, drop English stopwords and punctuation tokens, lemmatize with
    spaCy, and keep only purely alphabetic lemmas.

    Relies on the module-level ``stop_words`` set and the spaCy pipeline
    ``nlp``.

    Args:
        text: Raw article text. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as one string; ``""`` when ``text`` is not a string
        or when no token survives the filters.
    """
    # A missing value would otherwise make re.sub raise TypeError and abort
    # the whole DataFrame.apply call.
    if not isinstance(text, str):
        return ""

    # Remove HTML tags and URLs
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+', '', text)

    # Tokenize the lowercased text using NLTK
    tokens = nltk.word_tokenize(text.lower())

    # Remove stopwords and punctuation
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Lemmatize the tokens using spaCy (spaCy re-tokenizes the joined string)
    lemmas = [token.lemma_ for token in nlp(" ".join(tokens))]

    # Remove any remaining non-alphabetic tokens (numbers, leftover symbols)
    lemmas = [lemma for lemma in lemmas if lemma.isalpha()]

    # Join the lemmas back into a string
    return " ".join(lemmas)

# Store a cleaned version of every article body in a new column
df['preprocessed_text'] = df['text'].map(preprocess_text)

In [6]:
# Extract features using TF-IDF
# A TfidfVectorizer with default settings is fitted on the cleaned article
# texts; X holds the result of fit_transform over df['preprocessed_text'].
# NOTE(review): X is not referenced by any other cell visible here — confirm
# it is still needed.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['preprocessed_text'])

In [7]:
def _summarize(text, min_length, max_length):
    """Run the T5 model on ``"summarize: " + text`` and return the decoded text.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    to 512 tokens; decoding uses beam search with 4 beams and forbids
    repeated bigrams.

    Args:
        text: The string to condense.
        min_length: Minimum length passed to ``model.generate``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    input_ids = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True)
    output_ids = model.generate(input_ids,
                                num_beams=4,
                                no_repeat_ngram_size=2,
                                min_length=min_length,
                                max_length=max_length,
                                early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Generate summaries and titles for each article
summaries = []
titles = []
# Iterate over the column values directly: looking rows up with a positional
# counter (df['text'][i]) is a label lookup and only works while the index is
# exactly 0..n-1.
for text in df['text']:
    # Generate the summary
    summary = _summarize(text, min_length=30, max_length=100)
    summaries.append(summary)

    # Generate the title by summarizing the summary again with tighter bounds.
    # max_length caps the output, so a title may end mid-sentence.
    title = _summarize(summary, min_length=10, max_length=40)
    titles.append(title)

In [8]:
# Add the summaries and titles to the DataFrame
# Both lists were filled by the generation loop with one entry per article, in
# row order, so each list must have exactly len(df) items.
df['generated_summary'] = summaries
df['generated_title'] = titles

In [9]:
# Initialize the ROUGE metric
# Rouge() is constructed with its default settings; this one instance is
# reused for every get_scores call in the scoring cell.
rouge = Rouge()

In [10]:
# Calculate ROUGE scores for summaries and titles
# Each generated text is the hypothesis and the article's headline is the
# reference; get_scores returns a one-element list for a single pair, hence [0].
# NOTE(review): the generated summary is also scored against `headlines` —
# confirm a headline is the intended reference for the longer summary.
rouge_scores_summary = []
rouge_scores_title = []
# Walk the three columns in lockstep instead of indexing with a positional
# counter (df[col][i] is a label lookup and breaks unless the index is 0..n-1).
for reference, summary, title in zip(df['headlines'], df['generated_summary'], df['generated_title']):
    rouge_scores_summary.append(rouge.get_scores(summary, reference)[0])
    rouge_scores_title.append(rouge.get_scores(title, reference)[0])

df['rouge_scores_summary'] = rouge_scores_summary
df['rouge_scores_title'] = rouge_scores_title

In [14]:
# Drop the columns that are not needed for inspecting the results.
# errors='ignore' keeps the cell idempotent: because it rebinds `df`, running
# it a second time would otherwise raise KeyError on the already-removed columns.
df = df.drop(columns=['read_more', 'ctext', 'date'], errors='ignore')
df.head()

Unnamed: 0,author,headlines,text,preprocessed_text,generated_summary,generated_title,rouge_scores_summary,rouge_scores_title
0,Chhavi Tyagi,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...,administration union territory daman diu revok...,the order made it compulsory for women to tie ...,order made it compulsory for women to tie rakh...,"{'rouge-1': {'r': 0.2222222222222222, 'p': 0.1...","{'rouge-1': {'r': 0.2222222222222222, 'p': 0.1..."
1,Daisy Mowke,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...,malaika arora slammed instagram user troll div...,malaika Arora slams an Instagram user who trol...,"malaika Arora trolled her for ""divorcing a ric...","{'rouge-1': {'r': 0.7, 'p': 0.1944444444444444...","{'rouge-1': {'r': 0.4, 'p': 0.1818181818181818..."
2,Arshiya Chopra,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...,indira gandhi institute medical sciences igim ...,the indiana Gandhi institute of medical scienc...,indiana Gandhi institute of medical sciences c...,"{'rouge-1': {'r': 0.25, 'p': 0.090909090909090...","{'rouge-1': {'r': 0.25, 'p': 0.1, 'f': 0.14285..."
3,Sumedha Sehra,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,lashkar e taiba kashmir commander abu dujana k...,kabhi hum aage was killed by security forces. ...,kabhi hum aage was killed by security forces. ...,"{'rouge-1': {'r': 0.1, 'p': 0.0322580645161290...","{'rouge-1': {'r': 0.1, 'p': 0.0434782608695652..."
4,Aarushi Maheshwari,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...,hotels maharashtra train staff spot sign sex t...,hotels in Maharashtra will train staff to spot...,a mobile phone app called Rescue Me will alert...,"{'rouge-1': {'r': 0.7, 'p': 0.1707317073170731...","{'rouge-1': {'r': 0.2, 'p': 0.0645161290322580..."


In [32]:
# Source text of the article with index label 1
df.loc[1, 'text']

'Malaika Arora slammed an Instagram user who trolled her for "divorcing a rich man" and "having fun with the alimony". "Her life now is all about wearing short clothes, going to gym or salon, enjoying vacation[s]," the user commented. Malaika responded, "You certainly got to get your damn facts right before spewing sh*t on me...when you know nothing about me."'

In [33]:
# Model-generated summary for the article with index label 1
df.loc[1, 'generated_summary']

'malaika Arora slams an Instagram user who trolled her for "divorcing a rich man" and "having fun with the alimony" "her life now is all about wearing short clothes, going to gym or salon, enjoying vacation[s]"'

In [34]:
# Model-generated title for the article with index label 1
df.loc[1, 'generated_title']

'malaika Arora trolled her for "divorcing a rich man" and "having fun with the alimony" "her life now is all about wearing'

In [44]:
# ROUGE scores of the generated summary for the article with index label 1
df.loc[1, 'rouge_scores_summary']

{'rouge-1': {'r': 0.7, 'p': 0.19444444444444445, 'f': 0.30434782268431004},
 'rouge-2': {'r': 0.4444444444444444,
  'p': 0.11428571428571428,
  'f': 0.1818181785640496},
 'rouge-l': {'r': 0.7, 'p': 0.19444444444444445, 'f': 0.30434782268431004}}

In [43]:
# ROUGE scores of the generated title for the article with index label 1
df.loc[1, 'rouge_scores_title']

{'rouge-1': {'r': 0.4, 'p': 0.18181818181818182, 'f': 0.2499999957031251},
 'rouge-2': {'r': 0.2222222222222222,
  'p': 0.09523809523809523,
  'f': 0.13333332913333346},
 'rouge-l': {'r': 0.4, 'p': 0.18181818181818182, 'f': 0.2499999957031251}}

In [42]:
# Source text of the article with index label 57
df.loc[57, 'text']

'Producer of \'Babumoshai Bandookbaaz\' Kiran Shroff, while talking about sexist comments she faced, has said that a Central Board of Film Certification (CBFC) member asked her how she could make a film like this being a woman. A male member added, "But she is not a woman. Look at what she is wearing." Kiran said that such thoughts were regressive.'

In [40]:
# Model-generated summary for the article with index label 57
df.loc[57, 'generated_summary']

"producer of 'Babumoshai Bandookbaaz' Kiran Shroff says she faced sexist comments. a member of the Central Board of Film Certification asked her how she could make film such as this being the woman."

In [41]:
# Model-generated title for the article with index label 57
df.loc[57, 'generated_title']

"producer of 'Babumoshai Bandookbaaz' says she faced sexist comments."

In [45]:
# ROUGE scores of the generated summary for the article with index label 57
df.loc[57, 'rouge_scores_summary']

{'rouge-1': {'r': 0.25, 'p': 0.06896551724137931, 'f': 0.10810810471877293},
 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-l': {'r': 0.25, 'p': 0.06896551724137931, 'f': 0.10810810471877293}}

In [46]:
# ROUGE scores of the generated title for the article with index label 57
df.loc[57, 'rouge_scores_title']

{'rouge-1': {'r': 0.125, 'p': 0.1111111111111111, 'f': 0.11764705384083066},
 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-l': {'r': 0.125, 'p': 0.1111111111111111, 'f': 0.11764705384083066}}