In [137]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [138]:
df = pd.read_csv('/kaggle/input/hahaha/test.csv')
df = df[['Article','Summary']]
df

Unnamed: 0,Article,Summary
0,Fear shakes Mexico border city after violence ...,The attacks took place in several neighborhood...
1,Indian-origin boy finds millions of years old ...,"Siddak Singh Jhamat, known as Sid, was using a..."
2,Representative ImageA 38-year-old Indian man h...,A 38-year-old Indian man has been charged with...
3,Residents get tested during their stay at a te...,"China on Apr 21 reported 2,119 locally transmi..."
4,Israeli PM Benjamin Netanyahu flew to Saudi Ar...,Israeli media reported Monday that Prime Minis...
...,...,...
4482,FILE - Tanks storing treated radioactive water...,A massive earthquake and tsunami in 2011 destr...
4483,Villagers clear debris caused by an earthquake...,"Over 680 tourists from France, Thailand, the N..."
4484,"Seif al-Islam, the son and one-time heir appar...","Seif al-Islam, the son and one-time heir appar..."
4485,Thai BoysMost members of the Thai youth footba...,Most members of the Thai youth football team r...


# 1. Frequency Based Method

In [139]:
!pip install -q nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from collections import Counter 
from nltk.corpus import stopwords

from nltk.tokenize import sent_tokenize, word_tokenize

def generate_summary(text, n):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(text) if word.lower() not in stop_words and word.isalnum()]
    word_freq = Counter(words)

    sentence_scores = {}
    for sentence in sentences:
        sentence_words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()]
        sentence_score = sum([word_freq[word] for word in sentence_words])
        if len(sentence_words) < 20:
            sentence_scores[sentence] = sentence_score
    
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:n]
    summary = ' '.join(summary_sentences)

    return summary

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [140]:
text = '''
Weather is the day-to-day or hour-to-hour change in the atmosphere. 
Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more. 
Energy from the Sun affects the weather too. 
Climate tells us what kinds of weather usually happen in an area at different times of the year. 
Changes in weather can affect our mood and life. We wear different clothes and do different things in different weather conditions. 
We choose different foods in different seasons.
Weather stations around the world measure different parts of weather. 
Ways to measure weather are wind speed, wind direction, temperature and humidity. 
People try to use these measurements to make weather forecasts for the future. 
These people are scientists that are called meteorologists. 
They use computers to build large mathematical models to follow weather trends.'''

summary = generate_summary(text, 5)
print(summary)

We wear different clothes and do different things in different weather conditions. Weather stations around the world measure different parts of weather. Climate tells us what kinds of weather usually happen in an area at different times of the year. Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more. Ways to measure weather are wind speed, wind direction, temperature and humidity.


In [141]:
ref = '''
Weather is a gradual slow change through days and hours in the atmosphere and can vary from wind to snow. 
Climate tells a lot about the weather in an area.
The livelihood of people changes according to the change in weather.
Weather stations measure different parts of weather.
People who use measurements to make weather forecasts for the future are called meteorologists, and are scientists.'''

In [142]:
!pip install rouge -q
!pip install evaluate -q
!pip install rouge-score -q
!pip install -q textstat

In [143]:
import rouge
from rouge import Rouge

def evaluate_rouge(reference_text, summary_text):
    rouge = Rouge()
    scores = rouge.get_scores(reference_text, summary_text)
    return scores[0]['rouge-1']['f']

rg = evaluate_rouge(ref, summary)
rg

0.3366336583785904

In [144]:
import textstat

lengths =[]
for i in df['Summary']:
    lt = textstat.sentence_count(i)
    lengths.append(lt)
    
len(lengths)

4487

# Testing on Data

In [150]:
gen_sum = []

k = 0
for i in df['Article']:
    sums = generate_summary(i,lengths[k])
    k = k+1
    gen_sum.append(sums)

In [154]:
df1 = pd.DataFrame()
df1['Gen'] = gen_sum
df1['Act'] = df['Summary']
df1 = df1.apply(lambda x: x.str.strip()).replace('', 'hello')
df1

In [199]:
rouge = Rouge()
score = rouge.get_scores(df1['Gen'], df1['Act'], avg = True)
RougeScore = pd.DataFrame(score).set_index([['recall','precision','f-measure']])
RougeScore

Unnamed: 0,rouge-1,rouge-2,rouge-l
recall,0.222703,0.107627,0.196091
precision,0.223371,0.097511,0.19345
f-measure,0.214927,0.098449,0.187662


In [168]:
import evaluate

rouge = evaluate.load('rouge')
results = rouge.compute(predictions= df1['Gen'], references= df1['Act'])
print(results)

{'rouge1': 0.23904744414291018, 'rouge2': 0.10623750529521572, 'rougeL': 0.1855253702076442, 'rougeLsum': 0.18552977527892361}


# 2. TF-IDF Method

In [187]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
import math

ps = PorterStemmer()


def sent_preprocessing(sentences: list) -> list:
    cleaned_sentencs = [sent for sent in sentences if sent]
    for sent in sentences:
        if sent == '' or sent == ' ':
            print(1)
    return cleaned_sentencs


def text_preprocessing(sentences: list):
    stop_words = set(stopwords.words('english'))

    clean_words = None
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word.lower()) for word in words if word.isalnum()]
        clean_words = [word for word in words if word not in stop_words]

    return clean_words


def create_tf_matrix(sentences: list) -> dict:
    tf_matrix = {}

    for sentence in sentences:
        tf_table = {}
        clean_words = text_preprocessing([sentence])
        words_count = len(word_tokenize(sentence))
        word_freq = {}
        for word in clean_words:
            word_freq[word] = (word_freq[word] + 1) if word in word_freq else 1

        for word, count in word_freq.items():
            tf_table[word] = count / words_count

        tf_matrix[sentence[:15]] = tf_table

    return tf_matrix


def create_idf_matrix(sentences: list) -> dict:
    idf_matrix = {}
    documents_count = len(sentences)
    sentence_word_table = {}

    for sentence in sentences:
        clean_words = text_preprocessing([sentence])
        sentence_word_table[sentence[:15]] = clean_words

    word_in_docs = {}
    for sent, words in sentence_word_table.items():
        for word in words:
            word_in_docs[word] = (word_in_docs[word] + 1) if word in word_in_docs else 1

    for sent, words in sentence_word_table.items():
        idf_table = {}
        for word in words:
            idf_table[word] = math.log10(documents_count / float(word_in_docs[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def create_tf_idf_matrix(tf_matrix, idf_matrix) -> dict:
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix


def create_sentence_score_table(tf_idf_matrix) -> dict:
    sentence_value = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        smoothing = 1
        sentence_value[sent] = (total_score_per_sentence + smoothing) / (count_words_in_sentence + smoothing)

    return sentence_value


def find_average_score(sentence_value):
    sum = 0
    for val in sentence_value:
        sum += sentence_value[val]

    average = sum / len(sentence_value)

    return average


def generate_summary(sentences, sentence_value, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentence_value and sentence_value[sentence[:15]] >= threshold:
            summary += sentence + " "
            sentence_count += 1

    return summary

In [192]:
text = df['Article'][7]
sentences = sent_tokenize(text)
tf_matrix = create_tf_matrix(sentences)
idf_matrix = create_idf_matrix(sentences)
tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
sentence_value = create_sentence_score_table(tf_idf_matrix)
threshold = find_average_score(sentence_value)
summary = generate_summary(sentences, sentence_value, threshold)
print('Summary\n', summary)
print(f'Original {len(sent_tokenize(text))} sentences, Summarized {len(sent_tokenize(summary))} sentences')

Summary
 Belgium Prime Minister Alexander De CrooAfter a wait of 493 days since the last federal election, Belgium got its new Prime Minister Alexander De Croo on Wednesday. He is the son of Herman De Croo, a former Federal Minister, Deputy and current President of the House of Representatives and the longest-serving MP. 
Original 3 sentences, Summarized 2 sentences


# Testing on data

In [194]:
gen_sum = []
for i in df['Article']:
    text = i
    sentences = sent_tokenize(text)
    tf_matrix = create_tf_matrix(sentences)
    idf_matrix = create_idf_matrix(sentences)
    tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
    sentence_value = create_sentence_score_table(tf_idf_matrix)
    threshold = find_average_score(sentence_value)
    summary = generate_summary(sentences, sentence_value, threshold)
    gen_sum.append(summary)

In [195]:
df2 = pd.DataFrame()
df2['Gen'] = gen_sum
df2['Act'] = df['Summary']

In [196]:
df2

Unnamed: 0,Gen,Act
0,Images posted on social media showed bodies in...,The attacks took place in several neighborhood...
1,"""England at the time was part of Pangea, a lan...","Siddak Singh Jhamat, known as Sid, was using a..."
2,The conspiracy count carries a maximum 10-year...,A 38-year-old Indian man has been charged with...
3,Residents get tested during their stay at a te...,"China on Apr 21 reported 2,119 locally transmi..."
4,Israeli PM Benjamin Netanyahu flew to Saudi Ar...,Israeli media reported Monday that Prime Minis...
...,...,...
4482,"The task force, in a report issued late April,...",A massive earthquake and tsunami in 2011 destr...
4483,"According to the officials, the stranded touri...","Over 680 tourists from France, Thailand, the N..."
4484,"A statement by his captors, the Abu Bakr al-Si...","Seif al-Islam, the son and one-time heir appar..."
4485,"""They should spend time in a monastery. ""It's ...",Most members of the Thai youth football team r...


In [197]:
import evaluate

rouge = evaluate.load('rouge')
results = rouge.compute(predictions= df2['Gen'], references= df2['Act'])
print(results)

{'rouge1': 0.22960111179765497, 'rouge2': 0.10679917796905466, 'rougeL': 0.17731718082025022, 'rougeLsum': 0.1773483675898815}


In [198]:
rouge = Rouge()
score = rouge.get_scores(df2['Gen'], df2['Act'], avg = True)
RougeScore = pd.DataFrame(score).set_index([['recall','precision','f-measure']])
RougeScore

Unnamed: 0,rouge-1,rouge-2,rouge-l
recall,0.340914,0.18097,0.31354
precision,0.188269,0.086239,0.17179
f-measure,0.218576,0.10157,0.199725


# 3. TextRank Algorithm

In [214]:
import networkx as nx
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

wl = PorterStemmer()

def extract_word_vectors() -> dict:
    word_embeddings = {}
    f = open('/kaggle/input/glovembed/glove.6B.100d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

    f.close()
    return word_embeddings


def text_preprocessing(sentences: list) -> list:
    stop_words = set(stopwords.words('english'))
    clean_words = None
    for sent in sentences:
        words = word_tokenize(sent)
        words = [wl.stem(word.lower()) for word in words if word.isalnum()]
        clean_words = [word for word in words if word not in stop_words]

    return clean_words


def sentence_vector_representation(sentences: list, word_embeddings: dict) -> list:
    sentence_vectors = []
    for sent in sentences:
        clean_words = text_preprocessing([sent])
        v = sum([word_embeddings.get(word, np.zeros(100, )) for word in clean_words]) / (len(clean_words) + 0.001)
        sentence_vectors.append(v)

    return sentence_vectors


def create_similarity_matrix(sentences: list, sentence_vectors: list) -> np.ndarray:
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0]

    return sim_mat


def determine_sentence_rank(sentences: list, sim_mat: np.ndarray):
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted([(scores[i], s[:15]) for i, s in enumerate(sentences)], reverse=True)
    return ranked_sentences


def generate_summary(sentences: list, ranked_sentences: list):
    top_ranked_sentences = ranked_sentences[:int(len(sentences) / 3)] if len(sentences) >= 3 else ranked_sentences
    sentence_count = 0
    summary = ''
    for i in sentences:
        for j in top_ranked_sentences:
            if i[:15] == j[1]:
                summary += i + ' '
                sentence_count += 1
                break

    return summary

In [228]:
text = df['Article'][0]
sentences = sent_tokenize(text.strip())
word_embeddings = extract_word_vectors()
sentence_vectors = sentence_vector_representation(sentences, word_embeddings)
sim_mat = create_similarity_matrix(sentences, sentence_vectors)
ranked_sentences = determine_sentence_rank(sentences, sim_mat)
summary = generate_summary(sentences, ranked_sentences)
print('Summary\n',summary)

AttributeError: module 'scipy.sparse' has no attribute 'coo_array'