In [2]:
import pandas as pd, numpy as np, re
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from string import punctuation
from nltk.corpus import stopwords
from statistics import mean
from heapq import nlargest

# Rebind the imported corpus reader to a plain set of English stopwords so
# later membership tests (`w not in stopwords`) are set lookups.
stopwords = set(stopwords.words('english'))
# Extend string.punctuation with newline, em dash, typographic quotes and
# (already present) comma/hyphen; used as a character blacklist when cleaning.
punctuation += '\n—“,”‘-’'

In [3]:
# Location of the news-article CSV inside the Kaggle input directory.
ARTICLES_CSV = '/kaggle/input/covid19-singapore-policies-dataset/News Article/articles1.csv'
df = pd.read_csv(ARTICLES_CSV)
# Show one randomly chosen row as a quick look at the columns.
df.sample()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
1419,1419,18882,SpaceX Plans to Send 2 Tourists Around Moon in...,New York Times,Kenneth Chang,2017-02-28,2017.0,2.0,,"SpaceX, the ambitious rocket company headed by..."


In [4]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0         0
id                 0
title              0
publication        0
author          6306
date               0
year               0
month              0
url            50000
content            0
dtype: int64

In [5]:
# Drop the leftover 'Unnamed: 0' column plus 'author' and 'url', which the
# null counts show are partly (author) or entirely (url) missing.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'author', 'url'], errors='ignore')

In [6]:
# Show one randomly chosen row to confirm which columns remain.
df.sample()

Unnamed: 0,id,title,publication,date,year,month,content
45139,67341,"By releasing her music on Spotify, Taylor Swif...",Business Insider,2017-06-11,2017.0,6.0,’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ...


In [7]:
# Re-check missing values per column after the drop (isna is the same
# method as isnull).
df.isna().sum()

id             0
title          0
publication    0
date           0
year           0
month          0
content        0
dtype: int64

In [8]:
# Number of articles per publication year, most frequent first.
df['year'].value_counts()

2016.0    28451
2017.0    17908
2015.0     3326
2013.0      212
2014.0       76
2012.0       26
2011.0        1
Name: year, dtype: int64

In [9]:
# Lowercase English contractions mapped to their expanded form.
# Only the straight-apostrophe (') spelling is written out by hand; the
# typographic-apostrophe (’) spelling of every key is generated below so the
# two sets can never drift apart or contain duplicate keys.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Add the typographic-apostrophe spelling of every key (e.g. "don’t"),
# mapping to the same expansion. The comprehension is fully built before
# update() runs, so the dict is not mutated while being iterated.
contractions_dict.update({
    key.replace("'", "’"): expansion
    for key, expansion in contractions_dict.items()
})

In [10]:
# Spot-check a single lookup in the contraction table.
contractions_dict["you'd"]

'you would'

In [11]:
def clean_html(raw):
    """Return ``raw`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each tag is deleted individually and the
    text between tags is kept as-is (including surrounding whitespace).
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', raw)

In [12]:
# One regex that matches any key of contractions_dict.
# * Keys are sorted longest-first: regex alternation takes the first
#   alternative that matches, so with dict order "can't" would win over
#   "can't've" and leave a dangling "'ve" in the text.
# * re.escape keeps every key literal.
# * The lookarounds stop a key from matching inside a longer word
#   (e.g. "how's" inside the possessive "show's").
# The single capturing group of the original pattern is kept; match.group(0)
# is still exactly the matched key because lookarounds consume nothing.
con_re = re.compile(
    r'(?<!\w)(%s)(?!\w)' % '|'.join(
        re.escape(key)
        for key in sorted(contractions_dict, key=len, reverse=True)
    )
)

In [13]:
def expand_contractions(raw):
    """Replace every contraction matched by ``con_re`` with its expansion.

    Each match is looked up verbatim in ``contractions_dict``; text that does
    not match is returned unchanged.
    """
    return con_re.sub(lambda hit: contractions_dict[hit.group(0)], raw)

In [14]:
def preprocessing(article):
    """Clean a raw article into a lowercase, space-separated string of words.

    Steps, in order: lowercase, strip HTML tags, remove e-mail addresses and
    URLs, turn non-breaking spaces/newlines into plain spaces, expand
    contractions, strip possessive ``'s``, drop punctuation characters, drop
    stopwords.

    Side effect: the text as it stands after possessive stripping (still
    punctuated) is stored in the module-level global ``art`` so that
    ``summarize`` can sentence-tokenize it.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        Cleaned words joined by single spaces (empty string if nothing is left).
    """
    global art
    # make all strings lower
    a = article.lower()
    # remove html tags
    a = clean_html(a)
    # remove email-ids (any alphabetic top-level domain, not only .com)
    a = re.sub(r'\S+@\S+\.[a-z]{2,}', '', a)
    # remove urls; raw string avoids invalid-escape warnings, 'www\.' matches a
    # literal dot, and '-' sits last in the path class so it is a literal
    # hyphen rather than forming an accidental '/-_' range
    a = re.sub(
        r"((http://|https://|ftp://)|(www\.))+"
        r"(([a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))"
        r"(/[a-zA-Z0-9%:/_?.'~=&;@#+-]*)?",
        '',
        a,
    )
    # turn non-breaking spaces and newlines into plain spaces; deleting them
    # outright (newline is part of `punctuation`) would glue neighbouring words
    a = a.replace('\xa0', ' ').replace('\n', ' ')
    # expand all contractions
    a = expand_contractions(a)
    # stripping the possessives
    a = re.sub("'s|’s", '', a)
    # copy article for sentence tokenization
    art = a
    # remove punctuation (character-wise blacklist)
    a = ''.join(ch for ch in a if ch not in punctuation)
    # remove stopwords; split()/join also removes leading, trailing and
    # repeated whitespace, so no further stripping is needed
    a = ' '.join(w for w in a.split() if w not in stopwords)
    return a

In [15]:
def normalize(word_freq):
    """Scale word counts so the most frequent word has weight 1.

    The dictionary is updated in place and the same object is returned.

    Parameters
    ----------
    word_freq : dict
        Mapping of word -> count.

    Returns
    -------
    dict
        ``word_freq`` with every value divided by the maximum value. An empty
        mapping is returned unchanged (``max()`` would raise on it).
    """
    if not word_freq:
        return word_freq
    max_freq = max(word_freq.values())
    # Only values are reassigned, never keys, so iterating while updating is safe.
    for word in word_freq:
        word_freq[word] = word_freq[word] / max_freq
    return word_freq

In [16]:
def word_frequency(article):
    """Count how often each token produced by ``word_tokenize`` occurs.

    Returns a dict mapping token -> occurrence count, with keys in order of
    first appearance.
    """
    counts = {}
    for token in word_tokenize(article):
        counts[token] = counts.get(token, 0) + 1
    return counts

In [17]:
def sent_token(article):
    """Split ``article`` into sentences and clean each one.

    Every sentence from ``sent_tokenize`` has the characters listed in
    ``punctuation`` removed, runs of spaces collapsed to one, and
    leading/trailing whitespace stripped. Returns the cleaned sentences as a
    list, in document order.
    """
    cleaned = []
    for sentence in sent_tokenize(article):
        kept = ''.join(ch for ch in sentence if ch not in punctuation)
        cleaned.append(re.sub(' +', ' ', kept).strip())
    return cleaned

In [18]:
def sent_score(sent_list, norm_word_freq):
    """Score each sentence as the sum of the weights of its known words.

    A sentence gets an entry only if at least one of its tokens is a key of
    ``norm_word_freq``; identical sentences share one entry and accumulate.
    Returns a dict mapping sentence -> score.
    """
    scores = {}
    for sentence in sent_list:
        for token in word_tokenize(sentence):
            if token not in norm_word_freq:
                continue
            scores[sentence] = scores.get(sentence, 0) + norm_word_freq[token]
    return scores

In [19]:
def summarize(article, ratio=.25):
    """Build an extractive summary from the highest-scoring sentences.

    Words are weighted by normalized frequency in the cleaned article, each
    sentence is scored by the sum of its word weights, and the top ``ratio``
    share of scored sentences is returned, highest score first, joined by '.'.

    Note: sentences are taken from the global ``art``, which ``preprocessing``
    sets as a side effect, so ``preprocessing`` must run first (it does, on
    the first line below).

    Parameters
    ----------
    article : str
        Raw article text.
    ratio : float, optional
        Fraction of scored sentences to keep (default 0.25, the previously
        hard-coded value).

    Returns
    -------
    str
        The selected sentences joined by '.'; an empty string when the article
        has no scorable content. At least one sentence is returned whenever
        any sentence was scored, so short articles no longer yield ''.
    """
    cleaned = preprocessing(article)
    word_freq = word_frequency(cleaned)
    if not word_freq:
        # Nothing left after cleaning; normalizing an empty table would fail.
        return ''
    norm_word_freq = normalize(word_freq)
    sent_list = sent_token(art)
    score = sent_score(sent_list, norm_word_freq)
    if not score:
        return ''
    # int() truncates, so fewer than 1/ratio sentences would give 0; keep one.
    length = max(1, int(len(score) * ratio))
    summ = nlargest(length, score, key=score.get)
    return '.'.join(summ)

In [20]:
# Sample markup string (HTML tag, URL, e-mail, contraction); defined here but
# not used by the statements below.
ht = "<h1> Hello! how  r’s u?. visit site- www.gmail.com You've that? mail - abc@cba.com </h1>"
# Summarize the article with index label 5 and compare lengths (characters).
original = df.content[5]
simplified = summarize(original)
print(len(original), '/', len(simplified))
print(simplified)
print('----------------\n', original)

911 / 182
london queen elizabeth ii who has been battling a cold for more than a week missed a new year day church service at her country estate in sandringham buckingham palace said on sunday
----------------
 LONDON  —   Queen Elizabeth II, who has been battling a cold for more than a week, missed a New Year’s Day church service at her country estate in Sandringham, Buckingham Palace said on Sunday. A week earlier, the queen, who is 90, missed a Christmas Day church service, for the first time since 1988, because of the illness. “The Queen does not yet feel ready to attend church as she is still recuperating from a heavy cold,” the palace said in a statement. The queen’s husband, Prince Philip, who had also been ill, was well enough to attend both services, in the church at Sandringham, which is in Norfolk, on the east coast of England. The queen, who ascended to the throne in 1952, became the world’s   monarch following the death of King Bhumibol Adulyadej of Thailand in October. Sh