Text Rank algorithm is - **extractive summarization technique**. The assigned problem can be classified as **a single-domain-multiple-document summarization task** which means multiple articles generate a single bullet-point summary on Covid19 dataset

In [1]:
import math
import networkx as nx
import nltk
nltk.download('punkt') # one time execution
import numpy as np
import os
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag # for proper nouns
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Divyank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Custom functions

1. Cue phrases

In [2]:
def cue_phrases():
    QPhrases=["incidentally", "example", "anyway", "furthermore","according"
            "first", "second", "then", "now", "thus", "moreover", "therefore", "hence", "lastly", "finally", "summary"]
    cue_phrases={}
    
    for sentence in sent_tokens:
        cue_phrases[sentence] = 0
        word_tokens = nltk.word_tokenize(sentence)
        for word in word_tokens:
            if word.lower() in QPhrases:
                cue_phrases[sentence] += 1
    
    maximum_frequency = max(cue_phrases.values())
    
    for k in cue_phrases.keys():
        try:
            cue_phrases[k] = cue_phrases[k] / maximum_frequency
            cue_phrases[k] = round(cue_phrases[k], 3)
        except ZeroDivisionError:
            x = 0
    
    print(cue_phrases.values())
    
    return cue_phrases

2. Numerical data

In [3]:
def numeric_data():
    numeric_data = {}
    
    for sentence in sent_tokens:
        numeric_data[sentence] = 0
        word_tokens = nltk.word_tokenize(sentence)
        
        for k in word_tokens:
            if k.isdigit():
                numeric_data[sentence] += 1
    
    maximum_frequency = max(numeric_data.values())
    
    for k in numeric_data.keys():
        try:
            numeric_data[k] = (numeric_data[k]/maximum_frequency)
            numeric_data[k] = round(numeric_data[k], 3)
        except ZeroDivisionError:
            x = 0
    
    print(numeric_data.values())
    
    return numeric_data

3. Sentence length

In [4]:
def sent_len_score():
    sent_len_score={}
    
    for sentence in sent_tokens:
        sent_len_score[sentence] = 0
        word_tokens = nltk.word_tokenize(sentence)
        
        if len(word_tokens) in range(0,10):
            sent_len_score[sentence] = 1 - 0.05 * (10 - len(word_tokens))
        
        elif len(word_tokens) in range(7,20):
            sent_len_score[sentence] = 1
        
        else:
            sent_len_score[sentence] = 1 - (0.05) * (len(word_tokens) - 20)
    
    for k in sent_len_score.keys():
        sent_len_score[k] = round(sent_len_score[k], 4)
    
    print(sent_len_score.values())
    
    return sent_len_score

4. Sentence position

In [5]:
def sentence_position():
    sentence_position={}
    
    d = 1
    no_of_sent = len(sent_tokens)
    
    for i in range(no_of_sent):
        a = 1/d
        b = 1/(no_of_sent-d+1)
        sentence_position[sent_tokens[d-1]] = max(a,b)
        d += 1
    
    for k in sentence_position.keys():
        sentence_position[k] = round(sentence_position[k], 3)
    
    print(sentence_position.values())
    
    return sentence_position

5. Frequency table

In [6]:
def word_frequency():
    freqTable = {}
    
    for word in word_tokens_refined:    
        if word in freqTable:         
            freqTable[word] += 1    
        else:         
            freqTable[word] = 1
    
    for k in freqTable.keys():
        freqTable[k] = math.log10(1 + freqTable[k])

    #Compute word frequnecy score of each sentence
    word_frequency = {}
    
    for sentence in sent_tokens:
        word_frequency[sentence] = 0
        e = nltk.word_tokenize(sentence)
        f = []
        
        for word in e:
            f.append(PorterStemmer().stem(word))
        
        for word,freq in freqTable.items():
            if word in f:
                word_frequency[sentence] += freq
    
    maximum = max(word_frequency.values())
    
    for key in word_frequency.keys():
        try:
            word_frequency[key] = word_frequency[key]/maximum
            word_frequency[key] = round(word_frequency[key],3)
        except ZeroDivisionError:
            x = 0
    
    print(word_frequency.values())
    
    return word_frequency

6. Upper case

In [7]:
def upper_case():
    upper_case={}
    
    for sentence in sent_tokens:
        upper_case[sentence] = 0
        word_tokens = nltk.word_tokenize(sentence)
        
        for k in word_tokens:
            if k.isupper():
                upper_case[sentence] += 1
    
    maximum_frequency = max(upper_case.values())
    
    for k in upper_case.keys():
        try:
            upper_case[k] = (upper_case[k]/maximum_frequency)
            upper_case[k] = round(upper_case[k], 3)
        except ZeroDivisionError:
            x = 0
    
    print(upper_case.values())
    
    return upper_case

7. Proper nouns

In [8]:
def proper_noun():
    proper_noun={}
    
    for sentence in sent_tokens:
        tagged_sent = pos_tag(sentence.split())
        propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
        proper_noun[sentence]=len(propernouns)
    
    maximum_frequency = max(proper_noun.values())
    
    for k in proper_noun.keys():
        try:
            proper_noun[k] = (proper_noun[k]/maximum_frequency)
            proper_noun[k] = round(proper_noun[k], 3)
        except ZeroDivisionError:
            x = 0
    print(proper_noun.values())
    
    return proper_noun

8. Word matches with heading

In [9]:
def head_match():
    head_match={}
    heading=sent_tokens[0]
    
    for sentence in sent_tokens:
        head_match[sentence]=0
        word_tokens = nltk.word_tokenize(sentence)
        
        for k in word_tokens:
            if k not in my_stopwords:
                k = PorterStemmer().stem(k)
                
                if k in PorterStemmer().stem(heading):
                    head_match[sentence] += 1
    
    maximum_frequency = max(head_match.values())
    
    for k in head_match.keys():
        try:
            head_match[k] = (head_match[k]/maximum_frequency)
            head_match[k] = round(head_match[k], 3)
        except ZeroDivisionError:
            x = 0
    
    print(head_match.values())
    
    return head_match

# Creating data frame

In [10]:
df = pd.DataFrame(columns=['a','b','c', 'd', 'upper', 'f','g','h','key','label'])
print(df)

Empty DataFrame
Columns: [a, b, c, d, upper, f, g, h, key, label]
Index: []


In [11]:
path = 'D:/COVID_19_dataset/documents/'

filelist = os.listdir(path)

for file in filelist:
    f = open(path + file, "r")
    text = f.read()
    
    sent_tokens = nltk.sent_tokenize(text)
    word_tokens = nltk.word_tokenize(text)
    word_tokens_lower = [word.lower() for word in word_tokens]
    
    my_stopwords = list(set(stopwords.words('english')))
    
    word_tokens_refined = [x for x in word_tokens_lower if x not in my_stopwords]
    
    Cue_phrases = list(cue_phrases().values())
    Key = list(cue_phrases().keys())
    Numeric_data = list(numeric_data().values())
    Sent_length_score = list(sent_len_score().values())
    Sentence_position = list(sentence_position().values())
    Upper_case = list(upper_case().values())
    Header_match = list(head_match().values())
    Word_frequency = list(word_frequency().values())
    Proper_noun = list(proper_noun().values())
    
    label = {}
    for sentence in sent_tokens:
        label[sentence] = 0
                
    o = list(label.values())
    df = df.append(pd.DataFrame({'a': Cue_phrases,'b': Numeric_data,'c': Sent_length_score,'d': Sentence_position,
                                 'upper': Upper_case, 'f': Header_match, 'g': Word_frequency,'h': Proper_noun,
                                 'key': Key,'label': o}), ignore_index = True)
    
    f.close()

dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
dict_values([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
dict_values([0.8, -2.25, 1, -0.1, 1.0, 1, 0.6, 0.35, 1, 0.85])
dict_values([1.0, 0.5, 0.333, 0.25, 0.2, 0.2, 0.25, 0.333, 0.5, 1.0])
dict_values([1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
dict_values([0.87, 1.0, 0.304, 0.478, 0.13, 0.174, 0.13, 0.13, 0.261, 0.217])
dict_values([0.484, 1.0, 0.189, 0.755, 0.238, 0.158, 0.305, 0.191, 0.212, 0.263])
dict_values([0.364, 1.0, 0.0, 0.909, 0.091, 0.0, 0.0, 0.0, 0.0, 0.0])
dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0])
dict_values([1, 0.6, 1, 0.8, 0.8, -0.05, 1, 1, 1.0, 0.45, -0.35, -0.8, 1, 0.4, 0.15, 0.5])
dict_val

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 5740: character maps to <undefined>

In [12]:
df = df.to_csv('D:/COVID_19_dataset/documents/output.csv', index = False)()

In [13]:
data = pd.read_csv('D:/COVID_19_dataset/documents/output.csv')
data.head()

Unnamed: 0,a,b,c,d,upper,f,g,h,key,label
0,0.0,0.0,0.8,1.0,1.0,0.87,0.484,0.364,Success from two leading coronavirus vaccine p...,0
1,0.0,0.0,-2.25,0.5,1.0,1.0,1.0,1.0,The fact that two coronavirus vaccines recentl...,0
2,0.0,0.0,1.0,0.333,0.0,0.304,0.189,0.0,The studies showed both vaccines provided stro...,0
3,1.0,0.0,-0.1,0.25,0.0,0.478,0.755,0.909,"""With the very good news from Pfizer and Moder...",0
4,0.0,0.0,1.0,0.2,0.0,0.13,0.238,0.091,While Gates didn't delve into the scientific r...,0


In [14]:
data.tail()

Unnamed: 0,a,b,c,d,upper,f,g,h,key,label
181,0.0,0.667,0.6,0.2,0.0,0.0,0.675,0.0,"""We talk about the 90/10 divide in global heal...",0
182,0.0,0.0,1.0,0.25,0.0,0.0,0.419,0.154,"This is part of that story,"" Ms Wenham said.",0
183,1.0,0.0,-0.8,0.333,0.0,0.25,0.663,0.077,"""But there's a difference between the fact tha...",0
184,0.0,0.0,1.0,0.5,0.333,0.375,0.28,0.077,A landmark global vaccine plan known as Covax ...,0
185,0.0,0.333,-0.05,1.0,0.667,0.375,0.592,0.538,The joint initiative - between the Gavi vaccin...,0


## Analysis

Here there are 10 columns the key column is useful because it has text entries of all the files. Printing some values to see what they look like

In [15]:
data['key'][0]

'Success from two leading coronavirus vaccine programs likely means other frontrunners will also show strong protection against COVID-19, Bill Gates said Tuesday.'

In [16]:
data['key'][100]

'The development comes nearly 10 months after news of the coronavirus began to emerge from Wuhan, China.'

In [17]:
data['label'].value_counts()

0    186
Name: label, dtype: int64

# Methodology

**Objective**: To generate a single summary for all articles

## Step 1: Split text into sentences

In [18]:
sentences = [] # empty list

for s in data['key']:
    sentences.append(sent_tokenize(s))
    
sentences = [y for x in sentences for y in x] # flatten list
sentences[:5]

['Success from two leading coronavirus vaccine programs likely means other frontrunners will also show strong protection against COVID-19, Bill Gates said Tuesday.',
 'The fact that two coronavirus vaccines recently showed strong protection against COVID-19 bodes well for other leading programs led by AstraZeneca, Novavax, and Johnson & Johnson, Bill Gates said Tuesday.The billionaire Microsoft founder and philanthropist said it will be easier to boost manufacturing and distribute these other shots to the entire world, particularly developing nations.The vaccine space has seen a flurry of good news in recent days, marked by overwhelming success in late-stage trials by both Pfizer and Moderna.',
 'The studies showed both vaccines provided strong protection against the virus compared to a placebo.',
 '"With the very good news from Pfizer and Moderna, we think it\'s now likely that AstraZeneca, Novavax, and Johnson & Johnson will also likely show very strong efficacy," Gates told journali

## Step 2: Extract word embeddings

In [19]:
word_embeddings = {}

f = open('E:/Jupyterfiles/ML_practice/Stanford/glove.6B/glove.6B.100d.txt', encoding = 'utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs

f.close()

In [20]:
len(word_embeddings)

400000

Word vectors for 400K different terms are stored in the dictionary

## Step 3: Text pre-processing

In [21]:
filter = data["key"] != ""
data_clean = data[filter]
data_clean = data_clean.dropna()

In [22]:
def preprocess_text(sen):
    
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [23]:
X = []
sentences = list(data["key"])
for sen in sentences:
    X.append(preprocess_text(sen))

X[:5]

['Success from two leading coronavirus vaccine programs likely means other frontrunners will also show strong protection against COVID Bill Gates said Tuesday ',
 'The fact that two coronavirus vaccines recently showed strong protection against COVID bodes well for other leading programs led by AstraZeneca Novavax and Johnson Johnson Bill Gates said Tuesday The billionaire Microsoft founder and philanthropist said it will be easier to boost manufacturing and distribute these other shots to the entire world particularly developing nations The vaccine space has seen flurry of good news in recent days marked by overwhelming success in late stage trials by both Pfizer and Moderna ',
 'The studies showed both vaccines provided strong protection against the virus compared to placebo ',
 ' With the very good news from Pfizer and Moderna we think it now likely that AstraZeneca Novavax and Johnson Johnson will also likely show very strong efficacy Gates told journalist Andrew Ross Sorkin ',
 'W

In [24]:
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in my_stopwords])
    
    return sen_new

In [25]:
clean_sentences = [remove_stopwords(r.split()) for r in X]
clean_sentences = [s.lower() for s in clean_sentences]

clean_sentences[:5]

['success two leading coronavirus vaccine programs likely means frontrunners also show strong protection covid bill gates said tuesday',
 'the fact two coronavirus vaccines recently showed strong protection covid bodes well leading programs led astrazeneca novavax johnson johnson bill gates said tuesday the billionaire microsoft founder philanthropist said easier boost manufacturing distribute shots entire world particularly developing nations the vaccine space seen flurry good news recent days marked overwhelming success late stage trials pfizer moderna',
 'the studies showed vaccines provided strong protection virus compared placebo',
 'with good news pfizer moderna think likely astrazeneca novavax johnson johnson also likely show strong efficacy gates told journalist andrew ross sorkin',
 'while gates delve scientific rationale behind prediction many scientists hold hope']

Use 'clean_sentences' to create vectors for sentences in data with the help of GloVe word vectors

## Step 4: Vector representation of sentences

In [26]:
# Extract word vectors
word_embeddings = {}

f = open('E:/Jupyterfiles/ML_practice/Stanford/glove.6B/glove.6B.100d.txt', encoding = 'utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1: ], dtype = 'float32')
    word_embeddings[word] = coefs

f.close()

In [27]:
sentence_vectors = []

for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split()) + 0.001)
    else:
        v = np.zeros((100,))
    
    sentence_vectors.append(v)

## Step 5: Similarity between sentences

**Algorithm used**: Cosine similarity

**Approach**: Create an empty similarity matrix and populate it with cosine similarities of sentences

In [28]:
sim_matrix = np.zeros([len(sentences), len(sentences)]) # zero matrix of size nXn

# initialise matrix with cosine similarity scores
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i!=j:
            sim_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0]

## Step 6: Apply PageRank algorithm

In [29]:
nx_graph = nx.from_numpy_array(sim_matrix)
scores = nx.pagerank(nx_graph)

## Step 7: Summary extraction

Extract top-N sentences based on rankings for summary generation

In [30]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse = True)

for i in range(10):
    print(ranked_sentences[i][1])

Meanwhile, efforts to develop an effective vaccine are continuing - although the World Health Organization (WHO) has warned that the death toll could hit two million before one is widely available.
As ministers struggle to get test-and-trace on track, BBC News spoke to key government figures, scientists and health officials who were involved from the very start to establish what went wrong - and, crucially, whether the system can be fixed to hold the virus in check until vaccines come to the rescue.
From Monday, under new government restrictions designed to tackle the fresh outbreak, residents will only be allowed to see one other person from outside their household and should work from home if possible.
Delivering a limited supply to the world
Andrea Taylor, who has been leading the Duke analysis, said the combination of advance purchase agreements and limits on the number of doses that can be manufactured in the next couple of years meant "we're heading into a scenario where the rich