### I am going to use TF-IDF to find the most important sentences

### TF is an abbreviation for Term Frequency: the number of times a word appears in a document divided by the total number of words in the document

<img src="tf-nlp.png">

### Let's say we are reading an article about Machine Learning and/or Deep Learning; these 4 words will then be among the most commonly found in the document

### Quite often, we would want to build a dictionary (hashmap) of term frequencies alongside the term. Like {word: term frequency of that word} and then iterate through this dictionary to find out which word appears the most times.

### Preprocessing
#### -> Removal of stopwords - Stopwords are words such as "and", "the", etc. which are present in the document but have very little impact on the task. Hence, we remove them.
#### -> Case of the words - "Machine" and "machine" are the same word, but because of the case difference they would be treated as two different words. Hence, we convert everything to a single case, either lower or upper.
#### -> Removing commas, full stops, and any other punctuation

In [2]:
import nltk
import string
from nltk import word_tokenize
import re
from nltk.corpus import stopwords
import math
from PIL import Image
import sys
import pyocr
import pyocr.builders
import cv2

In [61]:
def preprocess(doc):
    """Normalise raw text and split it into cleaned sentences.

    The text is lower-cased, English stop words are dropped, newlines are
    replaced by spaces, hyphens and double quotes are deleted, and finally
    every remaining punctuation character is stripped.

    Args:
        doc: The raw document text.

    Returns:
        A ``(clean_doc, sentences)`` tuple. ``clean_doc`` is the whole
        document as one punctuation-free string. ``sentences`` is the list
        of punctuation-free, whitespace-trimmed pieces obtained by splitting
        the document on ``"."`` (so a trailing full stop yields a final
        empty-string sentence).
    """
    doc = doc.lower()

    # Build the stop-word set once. The original evaluated
    # stopwords.words('english') for every single token, repeating the same
    # work per word and doing a linear list scan each time.
    stop_words = set(stopwords.words('english'))

    # NOTE: filtering happens before newlines are replaced, so a stop word
    # that is glued to a neighbour by "\n" or punctuation is not removed.
    doc = " ".join(word for word in doc.split(" ") if word not in stop_words)
    doc = doc.replace("\n\n", " ").replace("\n", " ").replace("-", "").replace('"', "")

    # One translation table deletes all punctuation in a single pass.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    sentences = [
        sentence.translate(strip_punctuation).strip()
        for sentence in doc.strip().split(".")
    ]
    clean_doc = doc.translate(strip_punctuation)
    return clean_doc, sentences

In [62]:
def getTextFromImage(filename):
    """Run OCR on an image and return its text together with word boxes.

    Relies on two names defined outside this function: ``tool`` (the OCR
    tool object whose ``image_to_string`` is called here) and ``lang`` (the
    OCR language passed to it).

    Args:
        filename: Path of the image file to read.

    Returns:
        A ``(doc, word_boxes)`` tuple. ``doc`` is the result of the
        ``TextBuilder`` pass (a Python string); ``word_boxes`` is the result
        of the ``WordBoxBuilder`` pass over the same image.
    """
    # Open the image once and close it deterministically; the original
    # opened the file twice and never closed either handle.
    with Image.open(filename) as image:
        doc = tool.image_to_string(
            image,
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )

        # Use the same language as the text pass. Callers line up the words
        # of ``doc`` with ``word_boxes`` by index, so both passes should
        # recognise the page identically (the original hard-coded "eng"
        # here while using ``lang`` above).
        word_boxes = tool.image_to_string(
            image,
            lang=lang,
            builder=pyocr.builders.WordBoxBuilder()
        )
    return doc, word_boxes

In [63]:
def countWords(doc):
    """Tally how often each space-separated token occurs in ``doc``.

    Tokens are produced by splitting on a single space, so consecutive
    spaces (or an empty input string) yield an empty-string token that is
    counted like any other.

    Args:
        doc: The text to tokenise.

    Returns:
        A dict mapping each token to its number of occurrences.
    """
    tally = {}
    for token in doc.split(" "):
        # Missing tokens start from zero, then get incremented.
        tally[token] = tally.get(token, 0) + 1
    return tally

In [64]:
def termFrequency(doc):
    """Score every sentence of ``doc`` by the summed term frequency of its words.

    The term frequency of a word is its number of occurrences in the
    cleaned document divided by the total number of words in that document.

    Args:
        doc: The raw document text.

    Returns:
        A dict mapping each cleaned sentence to ``(tf_sum, index)``, where
        ``tf_sum`` is the sum of the term frequencies of the sentence's
        words and ``index`` is the sentence's position in the list returned
        by ``preprocess``. Identical sentences share one key, so the last
        occurrence wins.
    """
    # Get the data in the required format and the sentences
    clean_doc, sentences = preprocess(doc)

    # Get the frequency dictionary
    frequency = countWords(clean_doc)

    # TF divides by the TOTAL number of words. The original divided by the
    # number of unique words, which does not match the TF formula. The sum
    # is always >= 1 because splitting a string yields at least one token.
    total_words = sum(frequency.values())

    # Calculate TF according to the formula
    tf_dict = {word: count / total_words for word, count in frequency.items()}

    # Get the TF value for the whole sentence
    sentence_tf = {}
    for index, sentence in enumerate(sentences):
        # A sentence can contain a token the document-level vocabulary lacks
        # (e.g. the empty sentence after a trailing full stop yields "");
        # count such tokens as 0 instead of raising KeyError.
        tf_sum = sum(tf_dict.get(word, 0) for word in sentence.split(" "))
        sentence_tf[sentence] = (tf_sum, index)
    return sentence_tf

### We want sentences that are rare or unique, yet still contain keywords that are common in the article. This is where inverse document frequency comes in.

<img src="idf-nlp.png">

### IDF is computed over many documents, whereas TF is built for one document. We can decide what a document is. In this piece of text, each sentence is its own document.

In [65]:
def inverseDocumentFrequency(doc):
    """Score every sentence of ``doc`` by the mean IDF of its words.

    Each sentence is treated as a separate document. The IDF of a word is
    ``log10(number_of_sentences / number_of_sentences_containing_word)``.

    Args:
        doc: The raw document text.

    Returns:
        A dict mapping each cleaned sentence to ``(mean_idf, index)``, where
        ``mean_idf`` is the sum of the IDF values of the sentence's words
        divided by the number of words in the sentence, and ``index`` is the
        sentence's position in the list returned by ``preprocess``.
        Identical sentences share one key, so the last occurrence wins.
    """
    # Get the data in the required format and the sentences
    clean_doc, sentences = preprocess(doc)

    # Since we are treating each sentence as a separate document, the number
    # of documents is the length of the sentences list
    num_docs = len(sentences)

    # Count, for every word, how many sentences contain it. One pass over
    # the sentences replaces the original word-by-sentence nested scan.
    sentence_count = {}
    for sentence in sentences:
        for word in set(sentence.split(" ")):
            sentence_count[word] = sentence_count.get(word, 0) + 1

    # Getting the inverse document frequency for the document vocabulary
    idf = {}
    for word in set(clean_doc.split(" ")):
        containing = sentence_count.get(word, 0)
        # A vocabulary word may appear in no sentence at all: "a.b" becomes
        # the word "ab" in the document but "a" and "b" in the sentences.
        # Skip it rather than dividing by zero.
        if containing:
            idf[word] = math.log10(num_docs / containing)

    # Getting IDF values for sentences
    idf_sentences = {}
    for index, sentence in enumerate(sentences):
        words = sentence.split(" ")
        idf_sum = sum(idf.get(word, 0.0) for word in words)
        # Normalise by the sentence's word count. The original divided by
        # len(preprocess(sentence)), which is the length of the returned
        # 2-tuple and therefore always 2. ``words`` is never empty because
        # str.split(" ") returns at least one element.
        idf_sentences[sentence] = (idf_sum / len(words), index)

    return idf_sentences

In [66]:
def tfidf(doc):
    """Rank the sentences of ``doc`` by their TF-IDF score.

    A sentence's score is the product of its term-frequency score and its
    inverse-document-frequency score.

    Args:
        doc: The raw document text.

    Returns:
        A list of ``(sentence, (score, index))`` pairs sorted in descending
        order of ``(score, index)``, where ``index`` is the sentence's
        position as reported by ``termFrequency``.
    """
    tf_scores = termFrequency(doc)
    idf_scores = inverseDocumentFrequency(doc)

    # Combine the two scores for every sentence present in both tables,
    # keeping the sentence index from the TF table.
    scores = {
        sentence: (tf_value * idf_scores[sentence][0], index)
        for sentence, (tf_value, index) in tf_scores.items()
        if sentence in idf_scores
    }

    # Sorting on the (score, index) tuple ranks by score first and breaks
    # ties by the later sentence index.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [89]:
def highlight_words(positions, img, output_path="highlighted-harrypotter-p1.jpg"):
    """Draw translucent boxes over the given words, then show and save the result.

    Args:
        positions: Sequence of objects whose ``position`` attribute is a
            ``((x1, y1), (x2, y2))`` pair of opposite rectangle corners.
        img: The image to annotate, as loaded by ``cv2.imread``.
        output_path: File the resized, highlighted image is written to.

    Raises:
        ValueError: If ``img`` is ``None`` (``cv2.imread`` returns ``None``
            when it cannot read the file).
    """
    if img is None:
        raise ValueError("img is None; the source image could not be loaded")

    # Paint on a copy so the untouched image can be blended back in.
    # The original re-read a hard-coded file here, silently ignoring the
    # ``img`` argument.
    overlay = img.copy()

    for box in positions:
        (x1, y1), (x2, y2) = box.position
        # thickness=-1 fills the rectangle; (255, 0, 0) is blue in OpenCV's
        # BGR channel order.
        cv2.rectangle(overlay, (x1, y1), (x2, y2), (255, 0, 0), -1)

    alpha = 0.4  # Transparency factor.
    blended = cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0)

    # Resize to a width of 600 px without losing the aspect ratio.
    ratio = 600.0 / blended.shape[1]
    dim = (600, int(blended.shape[0] * ratio))
    resized = cv2.resize(blended, dim, interpolation=cv2.INTER_AREA)

    cv2.imshow('img', resized)
    cv2.imwrite(output_path, resized)
    cv2.waitKey(0)  # block until a key is pressed in the preview window
    cv2.destroyAllWindows()

In [90]:
def main(filename='harrypotter-p1.jpg'):
    """OCR an image, rank its sentences by TF-IDF and highlight the top third.

    Args:
        filename: Path of the page image to process. Defaults to the sample
            page used throughout this notebook.
    """
    # Get data from image
    doc, word_boxes = getTextFromImage(filename)
    # Flatten newlines and expand "Mr."/"Mrs." so their full stops are not
    # mistaken for sentence boundaries.
    doc = doc.replace("\n\n", " ").replace("\n", " ").replace("Mr.", "Mister").replace("Mrs.", "Missus")

    # Get tfidf ranking: a list of (sentence, (score, sentence_index))
    TFIDF = tfidf(doc)

    # Get top n sentences, where n is len(all_sentences)//3, and put their
    # indices back into reading order
    n = len(TFIDF) // 3
    top_n = sorted(entry[1][-1] for entry in TFIDF[:n])

    all_sentences = doc.strip().split(". ")

    # Get starting and ending word indices of every sentence to make it
    # easy to get the coordinates
    indices = []
    start = 0
    for sentence in all_sentences:
        end = start + len(sentence.split(" "))
        indices.append((start, end))
        start = end

    # Get the positions for all the words in the important sentences list
    positions = []
    for sentence_index in top_n:
        # The ranking splits sentences on "." while this function splits on
        # ". ", so the ranking can report more sentences (e.g. the empty one
        # after a trailing full stop). Skip indices with no counterpart
        # instead of raising IndexError.
        if sentence_index >= len(indices):
            continue
        first_word, last_word = indices[sentence_index]
        positions += word_boxes[first_word:last_word]

    img = cv2.imread(filename)
    highlight_words(positions, img)

In [91]:
# Run the whole pipeline: OCR the page, rank its sentences, highlight the result.
main()