In [1]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
# Mount Google Drive at /content/drive; a later cell changes into a project
# folder under this mount point to read the article file.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#importing all necessary libraries
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import reuters
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk.data
import math
import re
import nltk
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge 
import seaborn as sns
import matplotlib.pyplot as plt
import string
#nltk.download('all')

In [4]:
import nltk
# Fetch the NLTK stop-word lists; the configuration cell reads them through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Module-level settings read by the Summarizer class.
summary_threshold = 5  # number of sentences kept in the final summary
stop_words = stopwords.words('english')  # English stop words from NLTK, skipped when comparing headline and sentence
stemmer = PorterStemmer()  # Porter stemmer shared by the Summarizer methods

In [6]:
class Summarizer():
    """Extractive summarizer for plain-text news articles.

    Each article file is expected to hold the headline on the first line, a URL
    on the second line and the body in the remaining lines. Sentences are scored
    on headline overlap, length, position and TF-IDF word weight (fitted on the
    NLTK Reuters corpus); the top `summary_threshold` sentences form the summary.

    Relies on the module-level names `summary_threshold`, `stop_words` and
    `stemmer`, and on the NLTK 'punkt' and 'reuters' resources being available.
    """

    # Number of tokens treated as the ideal sentence length by length_score().
    _IDEAL_SENTENCE_LENGTH = 20

    # (upper bound of relative position, weight) pairs used by position_score().
    _POSITION_WEIGHTS = (
        (0.1, 0.17), (0.2, 0.23), (0.3, 0.14), (0.4, 0.08), (0.5, 0.05),
        (0.6, 0.04), (0.7, 0.06), (0.8, 0.04), (0.9, 0.04), (1.0, 0.15),
    )

    def __init__(self, article):
        """Read every article file and store its (headline, body) pair.

        Args:
            article: iterable of file paths; a single path given as a string
                is also accepted.

        Files whose headline or body is blank are stored as (None, None) so the
        list stays aligned with the input; they are skipped when summarizing.
        """
        if isinstance(article, str):
            # Iterating a bare string would try to open each character as a file.
            article = [article]
        self._articles = []
        for doc in article:
            with open(doc) as f:
                headline = f.readline()  # first line of the article is the headline
                f.readline()  # second line is the URL, which is not used
                body = f.read().replace('\n', ' ')  # rest of the file is the body, flattened to one line
            if self.valid_input(headline, body):
                self._articles.append((headline, body))
            else:
                self._articles.append((None, None))

    def valid_input(self, headline, article_text):
        """Return True when both headline and body contain non-whitespace text."""
        # readline() keeps the trailing newline, so a blank headline line is
        # "\n" rather than ""; strip before testing for emptiness.
        return bool(headline and headline.strip()) and bool(article_text and article_text.strip())

    def tokenize_and_stem(self, text):
        """Tokenize `text` into words and return their Porter stems.

        Tokens without any ASCII letter (numbers, raw punctuation, ...) are
        dropped. Also used as the tokenizer of the TF-IDF vectorizer.
        """
        words = (word
                 for sent in nltk.sent_tokenize(text)
                 for word in nltk.word_tokenize(sent))
        return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]

    def build_TFIDF_model(self):
        """Fit the TF-IDF vectorizer on every document of the Reuters corpus.

        The fitted vectorizer is stored in `self._tfidf`; it supplies the
        vocabulary and IDF weights used by frequency_scores().
        """
        corpus = [reuters.raw(fileid) for fileid in reuters.fileids()]
        self._tfidf = TfidfVectorizer(tokenizer=self.tokenize_and_stem, stop_words='english', decode_error='ignore')
        # Only the fitted model is needed; the term-document matrix is not kept.
        self._tfidf.fit(corpus)

    def split_into_sentences(self, text):
        """Split `text` into sentences, keeping those longer than 10 characters.

        Returns an empty list for empty or missing text.
        """
        if not text:
            return []
        # Same sentence tokenizer as tokenize_and_stem(), so both agree on
        # sentence boundaries.
        sentences = nltk.sent_tokenize(self.remove_smart_quotes(text))
        return [sent.replace('\n', '') for sent in sentences if len(sent) > 10]

    def generate_summaries(self):
        """Summarize the loaded articles.

        Returns:
            If the articles together have at most `summary_threshold` usable
            sentences, the list of article bodies unchanged. Otherwise a single
            string made of the `summary_threshold` highest-scoring sentences,
            which is also written (with the headlines) to "Summary.txt" in the
            current working directory.
        """
        # Articles that failed validation are stored as (None, None); skip them.
        valid_articles = [entry for entry in self._articles if entry[1] is not None]

        total_num_sentences = sum(
            len(self.split_into_sentences(body)) for _, body in valid_articles
        )

        # Too short to summarize: hand back the original article bodies.
        if total_num_sentences <= summary_threshold:
            return [body for _, body in valid_articles]

        # Fitting on the whole Reuters corpus is slow; do it once per instance.
        if getattr(self, '_tfidf', None) is None:
            self.build_TFIDF_model()

        self._scores = Counter()
        for entry in valid_articles:
            self.score(entry)
        highest_scoring = self._scores.most_common(summary_threshold)
        summary = ' '.join(sent for sent, _ in highest_scoring)

        with open("Summary.txt", "w") as fwrite:
            fwrite.write("Headline: \n")
            for headline, _ in valid_articles:
                fwrite.write(headline)
            fwrite.write("\nSummary: \n")
            fwrite.write(summary)
        return summary

    def remove_smart_quotes(self, text):
        """Return `text` without left/right double smart quotes (U+201C, U+201D)."""
        return text.replace(u"\u201c", "").replace(u"\u201d", "")

    def score(self, article):
        """Score every sentence of one (headline, body) pair into `self._scores`.

        The final score is the mean of four weighted features: headline overlap
        (x1.5), TF-IDF frequency (x4), length (x1) and position (x1).
        """
        headline, body = article[0], article[1]
        sentences = self.split_into_sentences(body)
        # frequency_scores() splits the body the same way, so indices line up.
        frequency_scores = self.frequency_scores(body)
        for i, s in enumerate(sentences):
            headline_score = self.headline_score(headline, s) * 1.5
            length_score = self.length_score(self.get_tokens(s)) * 1.0
            position_score = self.position_score(float(i + 1), len(sentences)) * 1.0
            frequency_score = frequency_scores[i] * 4
            self._scores[s] = (headline_score + frequency_score + length_score + position_score) / 4.0

    def frequency_scores(self, article_text):
        """Return one TF-IDF based score per sentence of `article_text`.

        Each score is the sum of the TF-IDF weights of the sentence's stems,
        divided by the number of stems in the sentence. Requires
        build_TFIDF_model() to have been called.
        """
        response = self._tfidf.transform([article_text])
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        if hasattr(self._tfidf, 'get_feature_names_out'):
            feature_names = self._tfidf.get_feature_names_out()
        else:
            feature_names = self._tfidf.get_feature_names()

        # TF-IDF weight of every stem that occurs in this article.
        word_prob = {feature_names[col]: response[0, col] for col in response.nonzero()[1]}

        sent_scores = []
        for sentence in self.split_into_sentences(article_text):
            sent_tokens = self.tokenize_and_stem(sentence)
            if not sent_tokens:
                # A sentence without alphabetic tokens carries no weight and
                # would otherwise divide by zero.
                sent_scores.append(0.0)
                continue
            total = sum(word_prob[token] for token in sent_tokens if token in word_prob)
            # Normalize by sentence length, since length is a separate feature.
            sent_scores.append(total / len(sent_tokens))
        return sent_scores

    def _content_stems(self, text):
        """Return the set of stems of the non-stop-word tokens in `text`."""
        return {stemmer.stem(token) for token in self.get_tokens(text) if token not in stop_words}

    def headline_score(self, headline, sentence):
        """Return the fraction (0 to 1) of headline stems that occur in `sentence`.

        Stop words are ignored. Returns 0.0 when the headline has no content
        words.
        """
        # Compare words, not characters: iterating a str directly yields
        # single characters.
        title_stems = self._content_stems(headline)
        if not title_stems:
            return 0.0
        sentence_stems = self._content_stems(sentence)
        return len(title_stems & sentence_stems) / len(title_stems)

    def length_score(self, sentence):
        """Return a score in [0, 1] that is highest at the ideal sentence length.

        Args:
            sentence: sequence of tokens (only its length is used).

        The score is 1.0 at exactly 20 tokens and falls off linearly, reaching
        0.0 at 0 tokens and at 40 or more tokens.
        """
        ideal = self._IDEAL_SENTENCE_LENGTH
        len_diff = math.fabs(ideal - len(sentence))
        return max(0.0, 1.0 - len_diff / ideal)

    def get_tokens(self, text):
        """Lower-case `text`, delete ASCII punctuation and return its word tokens."""
        lowers = text.lower()
        # str.translate needs a translation table; passing string.punctuation
        # directly does not remove punctuation in Python 3.
        no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
        return nltk.word_tokenize(no_punctuation)

    def position_score(self, i, size):
        """Return a weight for the sentence at 1-based position `i` of `size`.

        The relative position i / size is mapped to a fixed weight per tenth of
        the article; sentences near the beginning and the very end weigh most.
        Positions outside (0, 1] get 0.
        """
        sent_position = i / size
        if not sent_position > 0:
            return 0
        for upper_bound, weight in self._POSITION_WEIGHTS:
            if sent_position <= upper_bound:
                return weight
        return 0

In [8]:
import nltk
# Fetch the Punkt sentence-tokenizer data that the Summarizer class needs to
# split article text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
import nltk
# Fetch the Reuters corpus; Summarizer.build_TFIDF_model fits its TF-IDF model
# on these documents.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...


True

In [11]:
import os
# Work from the project folder on the mounted Drive: the article path below is
# relative, and generate_summaries() writes Summary.txt to the current directory.
os.chdir('/content/drive/MyDrive/nlp project/')
article = ["article1.txt"]  # list of article file paths to summarize
summarized_article = Summarizer(article)  # reads the headline and body of each listed file
s = summarized_article.generate_summaries()  # build the summary (also saved to Summary.txt)
print(s)  # print the summary

LookupError: ignored