In [47]:
# import libraries
import math
import nltk
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [48]:
# read files
#f = open('Article.txt')
#text = f.read()
f = open('Reviews.txt', encoding = "ISO-8859-1")
text = f.read()
text



### Tokenize the sentences

In [49]:
sentences = nltk.sent_tokenize(text) # NLTK function
total_documents = len(sentences)
sentences
#total_documents

['Wow...',
 'Loved this place.',
 'Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.',
 'Service was very prompt.',
 'Would not go back.',
 'The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.',
 'I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!',
 'I was disgusted because I was pretty sure that was human hair.',
 'I was shocked because no signs indicate cash only.',
 'Highly recommended.',
 'Waitress was a little slow in service.',
 'This place is not worth your time, let alone Vegas.',
 'did not like at 

### Create the Frequency matrix of the words in each sentence

In [50]:
# calculate the frequency of words in each sentence
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [51]:
freq_matrix = _create_frequency_matrix(sentences)
freq_matrix

{'Wow...': {'wow': 1, '...': 1},
 'Loved this plac': {'love': 1, 'thi': 1, 'place': 1, '.': 1},
 'Crust is not go': {'crust': 1, 'good': 1, '.': 1},
 'Not tasty and t': {'tasti': 1, 'textur': 1, 'wa': 1, 'nasti': 1, '.': 1},
 'Stopped by duri': {'stop': 1,
  'dure': 1,
  'late': 1,
  'may': 1,
  'bank': 1,
  'holiday': 1,
  'rick': 1,
  'steve': 1,
  'recommend': 1,
  'love': 1,
  '.': 1},
 'The selection o': {'select': 1, 'food': 1, 'wa': 1, 'best': 1, '.': 1},
 'Now I am gettin': {'get': 1,
  'angri': 1,
  'want': 1,
  'damn': 1,
  'pho': 1,
  '.': 1},
 'Honeslty it did': {'honeslti': 1,
  "n't": 1,
  'tast': 1,
  'fresh': 1,
  '.': 1,
  ')': 1},
 'The potatoes we': {'potato': 1,
  'like': 1,
  'rubber': 1,
  'could': 1,
  'tell': 1,
  'made': 1,
  'ahead': 1,
  'time': 1,
  'kept': 1,
  'warmer': 1,
  '.': 1},
 'The fries were ': {'fri': 1,
  'hot': 1,
  ',': 1,
  'neither': 1,
  'wa': 1,
  'burger': 1,
  '.': 1},
 'A great touch.': {'great': 1, 'touch': 1, '.': 1},
 'Service was ve

### Calculate TermFrequency and generate a matrix
#### TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

In [52]:
# the document is a paragraph, the term is a word in a paragraph
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [53]:
tf_matrix = _create_tf_matrix(freq_matrix)
tf_matrix

{'Wow...': {'wow': 0.5, '...': 0.5},
 'Loved this plac': {'love': 0.25, 'thi': 0.25, 'place': 0.25, '.': 0.25},
 'Crust is not go': {'crust': 0.3333333333333333,
  'good': 0.3333333333333333,
  '.': 0.3333333333333333},
 'Not tasty and t': {'tasti': 0.2,
  'textur': 0.2,
  'wa': 0.2,
  'nasti': 0.2,
  '.': 0.2},
 'Stopped by duri': {'stop': 0.09090909090909091,
  'dure': 0.09090909090909091,
  'late': 0.09090909090909091,
  'may': 0.09090909090909091,
  'bank': 0.09090909090909091,
  'holiday': 0.09090909090909091,
  'rick': 0.09090909090909091,
  'steve': 0.09090909090909091,
  'recommend': 0.09090909090909091,
  'love': 0.09090909090909091,
  '.': 0.09090909090909091},
 'The selection o': {'select': 0.2,
  'food': 0.2,
  'wa': 0.2,
  'best': 0.2,
  '.': 0.2},
 'Now I am gettin': {'get': 0.16666666666666666,
  'angri': 0.16666666666666666,
  'want': 0.16666666666666666,
  'damn': 0.16666666666666666,
  'pho': 0.16666666666666666,
  '.': 0.16666666666666666},
 'Honeslty it did': {'hone

### Creating a table for documents per words (IDF)

In [54]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [55]:
count_doc_per_words = _create_documents_per_words(freq_matrix)
count_doc_per_words

{'wow': 3,
 '...': 27,
 'love': 30,
 'thi': 122,
 'place': 96,
 '.': 768,
 'crust': 2,
 'good': 85,
 'tasti': 10,
 'textur': 3,
 'wa': 242,
 'nasti': 3,
 'stop': 4,
 'dure': 5,
 'late': 2,
 'may': 5,
 'bank': 1,
 'holiday': 1,
 'rick': 1,
 'steve': 1,
 'recommend': 16,
 'select': 10,
 'food': 111,
 'best': 26,
 'get': 26,
 'angri': 1,
 'want': 16,
 'damn': 4,
 'pho': 4,
 'honeslti': 1,
 "n't": 88,
 'tast': 21,
 'fresh': 12,
 ')': 18,
 'potato': 8,
 'like': 47,
 'rubber': 1,
 'could': 18,
 'tell': 6,
 'made': 17,
 'ahead': 1,
 'time': 50,
 'kept': 5,
 'warmer': 1,
 'fri': 13,
 'hot': 8,
 ',': 271,
 'neither': 1,
 'burger': 15,
 'great': 56,
 'touch': 2,
 'servic': 67,
 'veri': 66,
 'prompt': 1,
 'would': 31,
 'go': 58,
 'back': 55,
 'cashier': 2,
 'care': 4,
 'ever': 28,
 'say': 17,
 'still': 11,
 'end': 3,
 'wayyy': 1,
 'overpr': 6,
 'tri': 17,
 'cape': 1,
 'cod': 1,
 'ravoli': 1,
 'chicken': 15,
 'cranberri': 1,
 'mmmm': 2,
 '!': 159,
 'disgust': 4,
 'becaus': 12,
 'pretti': 19,
 'sur

### Calculate IDF and generate a matrix
#### IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [56]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [57]:
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
idf_matrix

{'Wow...': {'wow': 2.5237464668115646, '...': 1.5695039573722396},
 'Loved this plac': {'love': 1.5237464668115646,
  'thi': 0.9145078908564787,
  'place': 1.0185964884916585,
  '.': 0.11550650149971492},
 'Crust is not go': {'crust': 2.699837725867246,
  'good': 1.0714487958169343,
  '.': 0.11550650149971492},
 'Not tasty and t': {'tasti': 2.0008677215312267,
  'textur': 2.5237464668115646,
  'wa': 0.6170523555507956,
  'nasti': 2.5237464668115646,
  '.': 0.11550650149971492},
 'Stopped by duri': {'stop': 2.3988077302032647,
  'dure': 2.301897717195208,
  'late': 2.699837725867246,
  'may': 2.301897717195208,
  'bank': 3.0008677215312267,
  'holiday': 3.0008677215312267,
  'rick': 3.0008677215312267,
  'steve': 3.0008677215312267,
  'recommend': 1.7967477388753021,
  'love': 1.5237464668115646,
  '.': 0.11550650149971492},
 'The selection o': {'select': 2.0008677215312267,
  'food': 0.9555447427445695,
  'wa': 0.6170523555507956,
  'best': 1.585894373560409,
  '.': 0.11550650149971492

### Calculate TF-IDF and generate a matrix
#### TF-IDF algorithm is made of 2 algorithms multiplied together

In [58]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [59]:
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
tf_idf_matrix

{'Wow...': {'wow': 1.2618732334057823, '...': 0.7847519786861198},
 'Loved this plac': {'love': 0.38093661670289114,
  'thi': 0.22862697271411966,
  'place': 0.25464912212291463,
  '.': 0.02887662537492873},
 'Crust is not go': {'crust': 0.8999459086224153,
  'good': 0.35714959860564477,
  '.': 0.038502167166571635},
 'Not tasty and t': {'tasti': 0.40017354430624535,
  'textur': 0.5047492933623129,
  'wa': 0.12341047111015913,
  'nasti': 0.5047492933623129,
  '.': 0.023101300299942985},
 'Stopped by duri': {'stop': 0.21807343001847862,
  'dure': 0.209263428835928,
  'late': 0.24543979326065873,
  'may': 0.209263428835928,
  'bank': 0.2728061565028388,
  'holiday': 0.2728061565028388,
  'rick': 0.2728061565028388,
  'steve': 0.2728061565028388,
  'recommend': 0.1633407035341184,
  'love': 0.1385224060737786,
  '.': 0.01050059104542863},
 'The selection o': {'select': 0.40017354430624535,
  'food': 0.19110894854891392,
  'wa': 0.12341047111015913,
  'best': 0.31717887471208184,
  '.': 0.

### Score the sentences

In [76]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [77]:
sentence_scores = _score_sentences(tf_idf_matrix)
sentence_scores

{'Wow...': 1.023312606045951,
 'Loved this plac': 0.22327233422871354,
 'Crust is not go': 0.43186589146487725,
 'Not tasty and t': 0.31123678048819464,
 'Stopped by duri': 0.207784400692334,
 'The selection o': 0.21099462779546863,
 'Now I am gettin': 0.31379532766314394,
 'Honeslty it did': 0.26440803864224527,
 'The potatoes we': 0.1808858114574137,
 'The fries were ': 0.20634293011179136,
 'A great touch.': 0.4520026579879986,
 'Service was ver': 0.24358173133605984,
 'Would not go ba': 0.25768483057512137,
 'The cashier had': 0.22524797708364894,
 'I tried the Cap': 0.2180327529051731,
 'I was disgusted': 0.2422925793857991,
 'I was shocked b': 0.24993478294544802,
 'Highly recommen': 0.4790068856198091,
 'Waitress was a ': 0.22656588559949128,
 'This place is n': 0.2042505740211322,
 'did not like at': 0.3610690912738061,
 'The Burrittos B': 0.7556895600303588,
 'The food, amazi': 0.20488059752592122,
 'Service is also': 0.3474775689731,
 'I could care le': 0.5446734140959157,
 '

### Find the threshold

In [78]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [82]:
threshold = _find_average_score(sentence_scores)
threshold

0.27217012541481617

### Generate the summary
#### Algorithm: Select a sentence for a summarization if the sentence score is more than the average score.


In [65]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [102]:
summary = _generate_summary(sentences, sentence_scores, 5 * threshold)
summary

' The chips and salsa were really good, the salsa was very fresh. The chips and sals a here is amazing!!!!!!!!!!!!!!!!!!!'