## Text Summary with tf-ifd

### This exercise serves as an example to one of the usecases of tf-idf, which is summarizing a text document

In [1]:
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

#### import our text and tokenize the sentences
#### Note that in this exercise, we are tokenizing by sentence, and not words. This is to ensure a meaningful output.

In [2]:
text = open('news.txt','r').read()
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)
print(sentences)

11
['British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.', 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.', 'Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April.', 'Tourists and business people are cutting back on foreign travel.', 'There could be a "very significant expansion" of the number of cases of coronavirus in the UK, Prime Minister Boris Johnson has warned.', 'Ryanair boss Michael O\'Leary said: "Our focus at this time is on minimising any risk to our people and our passengers.', '"While we are heavily booked over the next two weeks, there has been a notable drop in forward bookings towards the end of March, into early April.', '"It makes sense to selectively prune our schedule to and from those airports where travel has been most affected by the Covid-

### Create the Frequency matrix of the words in each sentence

In [3]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        # counts frequencies of words in every sentence
        freq_table = {}
        words = word_tokenize(sent)

        for word in words:
            #stopwords removal
            if word in stopWords or len(word) < 2 or not word.isalpha():
                continue
            # stemming and lowercasing
            word = ps.stem(word.lower())
            if word not in freq_table:
                freq_table[word] = 0
            freq_table[word] += 1
        
        #save frequencies as sent -> word_frequencies
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [4]:
frequency_matrix = _create_frequency_matrix(sentences)
frequency_matrix

{'British Airways': {'british': 1,
  'airway': 1,
  'said': 1,
  'would': 1,
  'contact': 1,
  'custom': 1,
  'cancel': 1,
  'flight': 2,
  'offer': 1,
  'rebook': 1,
  'carrier': 1,
  'refund': 1,
  'anoth': 1,
  'ba': 1,
  'later': 1,
  'date': 1,
  'travel': 1},
 'BA is cancellin': {'ba': 1,
  'cancel': 1,
  'flight': 1,
  'march': 1,
  'london': 1,
  'destin': 1,
  'includ': 1,
  'new': 1,
  'york': 1,
  'itali': 1,
  'franc': 1,
  'austria': 1,
  'belgium': 1,
  'germani': 1,
  'ireland': 1},
 'Ryanair will cu': {'ryanair': 1,
  'cut': 1,
  'flight': 1,
  'itali': 1,
  'march': 1,
  'april': 1},
 'Tourists and bu': {'tourist': 1,
  'busi': 1,
  'peopl': 1,
  'cut': 1,
  'back': 1,
  'foreign': 1,
  'travel': 1},
 'There could be ': {'there': 1,
  'could': 1,
  'signific': 1,
  'expans': 1,
  'number': 1,
  'case': 1,
  'coronaviru': 1,
  'uk': 1,
  'prime': 1,
  'minist': 1,
  'bori': 1,
  'johnson': 1,
  'warn': 1},
 'Ryanair boss Mi': {'ryanair': 1,
  'boss': 1,
  'michael': 1,


### Calculate TermFrequency for each word

In [5]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [6]:
tf_matrix = _create_tf_matrix(frequency_matrix)
tf_matrix

{'British Airways': {'british': 0.058823529411764705,
  'airway': 0.058823529411764705,
  'said': 0.058823529411764705,
  'would': 0.058823529411764705,
  'contact': 0.058823529411764705,
  'custom': 0.058823529411764705,
  'cancel': 0.058823529411764705,
  'flight': 0.11764705882352941,
  'offer': 0.058823529411764705,
  'rebook': 0.058823529411764705,
  'carrier': 0.058823529411764705,
  'refund': 0.058823529411764705,
  'anoth': 0.058823529411764705,
  'ba': 0.058823529411764705,
  'later': 0.058823529411764705,
  'date': 0.058823529411764705,
  'travel': 0.058823529411764705},
 'BA is cancellin': {'ba': 0.06666666666666667,
  'cancel': 0.06666666666666667,
  'flight': 0.06666666666666667,
  'march': 0.06666666666666667,
  'london': 0.06666666666666667,
  'destin': 0.06666666666666667,
  'includ': 0.06666666666666667,
  'new': 0.06666666666666667,
  'york': 0.06666666666666667,
  'itali': 0.06666666666666667,
  'franc': 0.06666666666666667,
  'austria': 0.06666666666666667,
  'belgi

### Counting how many sentences (documents) contain a specific word

In [7]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [8]:
documents_per_words = _create_documents_per_words(frequency_matrix)
documents_per_words

{'british': 1,
 'airway': 1,
 'said': 3,
 'would': 3,
 'contact': 1,
 'custom': 1,
 'cancel': 2,
 'flight': 4,
 'offer': 1,
 'rebook': 1,
 'carrier': 1,
 'refund': 1,
 'anoth': 1,
 'ba': 2,
 'later': 1,
 'date': 1,
 'travel': 3,
 'march': 4,
 'london': 1,
 'destin': 1,
 'includ': 1,
 'new': 1,
 'york': 1,
 'itali': 2,
 'franc': 1,
 'austria': 1,
 'belgium': 1,
 'germani': 1,
 'ireland': 1,
 'ryanair': 2,
 'cut': 2,
 'april': 2,
 'tourist': 1,
 'busi': 1,
 'peopl': 2,
 'back': 1,
 'foreign': 1,
 'there': 1,
 'could': 1,
 'signific': 1,
 'expans': 1,
 'number': 1,
 'case': 1,
 'coronaviru': 1,
 'uk': 1,
 'prime': 1,
 'minist': 1,
 'bori': 1,
 'johnson': 1,
 'warn': 1,
 'boss': 1,
 'michael': 1,
 'our': 1,
 'focu': 1,
 'time': 1,
 'minimis': 1,
 'risk': 1,
 'passeng': 2,
 'while': 1,
 'heavili': 1,
 'book': 1,
 'next': 1,
 'two': 1,
 'week': 1,
 'notabl': 1,
 'drop': 1,
 'forward': 1,
 'toward': 1,
 'end': 2,
 'earli': 1,
 'it': 1,
 'make': 1,
 'sens': 1,
 'select': 1,
 'prune': 1,
 'sche

### Calculate IDF

In [9]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [10]:
idf_matrix = _create_idf_matrix(frequency_matrix, documents_per_words, total_documents)
idf_matrix

{'British Airways': {'british': 1.0413926851582251,
  'airway': 1.0413926851582251,
  'said': 0.5642714304385625,
  'would': 0.5642714304385625,
  'contact': 1.0413926851582251,
  'custom': 1.0413926851582251,
  'cancel': 0.7403626894942439,
  'flight': 0.43933269383026263,
  'offer': 1.0413926851582251,
  'rebook': 1.0413926851582251,
  'carrier': 1.0413926851582251,
  'refund': 1.0413926851582251,
  'anoth': 1.0413926851582251,
  'ba': 0.7403626894942439,
  'later': 1.0413926851582251,
  'date': 1.0413926851582251,
  'travel': 0.5642714304385625},
 'BA is cancellin': {'ba': 0.7403626894942439,
  'cancel': 0.7403626894942439,
  'flight': 0.43933269383026263,
  'march': 0.43933269383026263,
  'london': 1.0413926851582251,
  'destin': 1.0413926851582251,
  'includ': 1.0413926851582251,
  'new': 1.0413926851582251,
  'york': 1.0413926851582251,
  'itali': 0.7403626894942439,
  'franc': 1.0413926851582251,
  'austria': 1.0413926851582251,
  'belgium': 1.0413926851582251,
  'germani': 1.04

### Calculate TF-IDF

In [11]:

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [12]:
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
tf_idf_matrix

{'British Airways': {'british': 0.06125839324460148,
  'airway': 0.06125839324460148,
  'said': 0.03319243708462132,
  'would': 0.03319243708462132,
  'contact': 0.06125839324460148,
  'custom': 0.06125839324460148,
  'cancel': 0.043550746440837874,
  'flight': 0.051686199274148546,
  'offer': 0.06125839324460148,
  'rebook': 0.06125839324460148,
  'carrier': 0.06125839324460148,
  'refund': 0.06125839324460148,
  'anoth': 0.06125839324460148,
  'ba': 0.043550746440837874,
  'later': 0.06125839324460148,
  'date': 0.06125839324460148,
  'travel': 0.03319243708462132},
 'BA is cancellin': {'ba': 0.049357512632949595,
  'cancel': 0.049357512632949595,
  'flight': 0.02928884625535084,
  'march': 0.02928884625535084,
  'london': 0.06942617901054834,
  'destin': 0.06942617901054834,
  'includ': 0.06942617901054834,
  'new': 0.06942617901054834,
  'york': 0.06942617901054834,
  'itali': 0.049357512632949595,
  'franc': 0.06942617901054834,
  'austria': 0.06942617901054834,
  'belgium': 0.069

### Score the sentences

In [13]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word
    in a sentence divided by total no of words in a sentence.
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [14]:
sentences_score = _score_sentences(tf_idf_matrix)
sentences_score

{'British Airways': 0.053659254652959086,
 'BA is cancellin': 0.06006080136766894,
 'Ryanair will cu': 0.10666989293437501,
 'Tourists and bu': 0.12674627653183573,
 'There could be ': 0.08010712962755576,
 'Ryanair boss Mi': 0.08326535781015594,
 '"While we are h': 0.07355495048325231,
 '"It makes sense': 0.09459684342142927,
 'The firm declin': 0.09278026753077477,
 'However, it sai': 0.0807947310043132}

### Find the threshold 

In [15]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [16]:
average_score = _find_average_score(sentences_score)
average_score

0.085223550536432

#### Select a sentence for a summarization if the sentence score is more than the average score

In [17]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [19]:
_generate_summary(sentences, sentences_score, 1*average_score)

' Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April. Tourists and business people are cutting back on foreign travel. "It makes sense to selectively prune our schedule to and from those airports where travel has been most affected by the Covid-19 outbreak." The firm declined to say how many flights or passengers would be affected.'

### Calling Code

In [None]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords    
    
'''
We already have a sentence tokenizer, so we just need 
to run the sent_tokenize() method to create the array of sentences.
'''
# 1 Sentence Tokenize
sentences = sent_tokenize(text)
total_documents = len(sentences)
#print(sentences)

# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(sentences)
#print(freq_matrix)

'''
Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
'''
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
#print(tf_matrix)

# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
#print(count_doc_per_words)

'''
Inverse document frequency (IDF) is how unique or rare a word is.
'''
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
#print(idf_matrix)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
#print(tf_idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)
#print(sentence_scores)

# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
#print(threshold)

# 9 Important Algorithm: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)


In [69]:
!pip install sklearn



You should consider upgrading via the 'python -m pip install --upgrade pip' command.
