In [1]:
# Implementation of tf-idf
import nltk

In [2]:
# Peek at the first address in the State of the Union corpus.
state_union_fileids = nltk.corpus.state_union.fileids()
file_id = state_union_fileids[0]
text = nltk.corpus.state_union.raw(file_id)

# Only the first 200 characters are shown to keep the output short.
preview = text[:200]
print(preview)

PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS
 
April 16, 1945

Mr. Speaker, Mr. President, Members of the Congress:
It is with a heavy heart that I stand before you, my f


In [3]:
from idf import computeWordIdf
from util import preprocess_sotu_text

# Load and preprocess every address in the corpus.
# In Python 3 `map` returns a one-shot iterator; lists are used instead so that
# `documents` and `processed_documents` can be iterated more than once and are
# not left exhausted in the notebook namespace after `computeWordIdf` runs.
documents = [nltk.corpus.state_union.raw(fileid) for fileid in nltk.corpus.state_union.fileids()]
processed_documents = [preprocess_sotu_text(document) for document in documents]

# Per-word idf table computed over the whole corpus; indexed by lowercase word below.
word_idf = computeWordIdf(processed_documents)
print(word_idf['harry'])

1.466337068793427


In [4]:
from centroid import CentroidScorer

# Load and preprocess every address in the corpus.
# Lists (not Python 3 one-shot `map` iterators) are used so the corpus can be
# iterated more than once and the names stay usable after `fit` has consumed them.
documents = [nltk.corpus.state_union.raw(fileid) for fileid in nltk.corpus.state_union.fileids()]
processed_documents = [preprocess_sotu_text(document) for document in documents]

# NOTE(review): 1.5 is passed positionally; its meaning is defined by
# CentroidScorer, which is not visible in this notebook -- confirm.
scorer = CentroidScorer(1.5)
scorer.fit(processed_documents)

# Score the sentences of the first address against the fitted scorer.
test_fileid = nltk.corpus.state_union.fileids()[0]
test_text = nltk.corpus.state_union.raw(test_fileid)
processed_text = preprocess_sotu_text(test_text)

# Each result is indexed by 'score' and 'text' below.
sentence_with_scores = scorer.score(processed_text)

from operator import itemgetter
# Print the 20 highest-scoring sentences, best first.
sorted_sentence_with_scores = sorted(sentence_with_scores, key=itemgetter('score'), reverse=True)
for sentence_with_score in sorted_sentence_with_scores[:20]:
    print(sentence_with_score['text'])

As I have assumed my heavy duties, I humbly pray Almighty God, in the words of King Solomon:
"Give therefore thy servant an understanding heart to judge thy people, that I may discern between good and bad; for who is able to judge this thy so great a people?"
We are now carrying out our part of that strategy under the able direction of Admiral Leahy, General Marshall, A dmiral King, General Arnold, General Eisenhower, Admiral Nimitz and General MacArthur.
We shall need also an abiding faith in the people, the kind of faith and courage which Franklin Delano Roosevelt always had!
So much blood has already been shed for the ideals which we cherish, and for which Franklin Delano Roosevelt lived and died, that we dare not permit even a momentary pause in the hard fight for victory.
In the memory of those who have made the supreme sacrifice-in the memory of our fallen President-we shall not fail!
Having to pay such a heavy price to make complete victory certain, America will never become a p

In [39]:
# Computing the idf-modified cosine

import nltk.data
from nltk.tokenize.nist import NISTTokenizer
from idf import tfIdf

# Tokenizers: Punkt splits the text into sentences, NIST splits a sentence into words.
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nist = NISTTokenizer()

# Reload and preprocess the first address, then split it into sentences.
test_fileid = nltk.corpus.state_union.fileids()[0]
test_text = nltk.corpus.state_union.raw(test_fileid)
processed_text = preprocess_sotu_text(test_text)
sentences = punkt_tokenizer.tokenize(processed_text)

# tf-idf vectors of the first two sentences, built from lowercased NIST tokens
# and the corpus-wide `word_idf` table computed in an earlier cell.
first_sentence_tokens = nist.tokenize(sentences[0], lowercase=True)
sentence1_tfidf = tfIdf(word_idf, first_sentence_tokens)
second_sentence_tokens = nist.tokenize(sentences[1], lowercase=True)
sentence2_tfidf = tfIdf(word_idf, second_sentence_tokens)


In [40]:
# Show the tf-idf vectors of the first two sentences, one per line.
print(sentence1_tfidf, sentence2_tfidf, sep='\n')

from math import sqrt

def norm(vals):
    """Return the Euclidean (L2) norm of the numbers in ``vals``.

    ``vals`` is any iterable of numbers (e.g. ``dict.values()``); an empty
    iterable yields ``0.0``.
    """
    squared_total = 0
    for component in vals:
        squared_total += component*component
    return sqrt(squared_total)

def idfCosine(sent1_tfidf, sent2_tfidf):
    """Cosine similarity between two sentences given as word -> tf-idf weight mappings.

    Returns ``0.`` when either vector has zero length; otherwise the dot
    product over the words of ``sent1_tfidf`` (words missing from
    ``sent2_tfidf`` contribute 0) divided by the product of the two norms.
    """
    first_length = norm(sent1_tfidf.values())
    second_length = norm(sent2_tfidf.values())
    # A zero-length vector has no direction, so similarity is defined as 0 here.
    if 0. in (first_length, second_length):
        return 0.

    dot_product = 0
    for word, weight in sent1_tfidf.items():
        # .get() avoids inserting missing words into a defaultdict.
        dot_product += weight*sent2_tfidf.get(word, 0.)

    return dot_product/(first_length*second_length)

# idf-modified cosine similarity between the first two sentences.
print(idfCosine(sent1_tfidf=sentence1_tfidf, sent2_tfidf=sentence2_tfidf))

defaultdict(<class 'float'>, {'president': 0.0, 'harry': 1.466337068793427, 's': 1.0833448165373212, '.': 0.0, "truman's": 2.094945728215801, 'address': 0.39019763597737595, 'before': 0.06250508700820906, 'a': 0.0, 'joint': 0.30318625898774626, 'session': 0.18540322333136275, 'of': 0.0, 'the': 0.0, 'congress': 0.0, 'april': 1.6094379124341003, '16': 1.341173925839421, ',': 0.0, '1945': 2.228477120840324, 'mr': 0.3708064466627255, 'speaker': 0.20409535634351522, 'members': 0.03125254350410453, ':': 0.0, 'it': 0.0, 'is': 0.0, 'with': 0.0, 'heavy': 1.1786549963416462, 'heart': 0.6480267452794758, 'that': 0.0, 'i': 0.0, 'stand': 0.28256697178501045, 'you': 0.06351340572232593, 'my': 0.015504186535965254, 'friends': 0.22314355131420976, 'and': 0.0, 'colleagues': 1.9771626925594177, 'in': 0.0, 'united': 0.03125254350410453, 'states': 0.0})
defaultdict(<class 'float'>, {'only': 0.015504186535965254, 'yesterday': 1.8718021769015913, ',': 0.0, 'we': 0.0, 'laid': 1.1298648321722142, 'to': 0.0, '

In [41]:
# One tf-idf vector per sentence, in the same order as `sentences`.
sentence_tfidfs = []
for sentence_text in sentences:
    sentence_tokens = nist.tokenize(sentence_text, lowercase=True)
    sentence_tfidfs.append(tfIdf(word_idf, sentence_tokens))


In [42]:
import numpy as np

# Pairwise idf-modified cosine similarity: entry (i, j) compares sentence i with sentence j.
similarity_rows = []
for row_tfidf in sentence_tfidfs:
    similarity_rows.append([idfCosine(row_tfidf, column_tfidf) for column_tfidf in sentence_tfidfs])

# Kept as np.matrix: later cells rely on matrix semantics of this object.
similarity_matrix = np.matrix(similarity_rows)
print(similarity_matrix)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 4.62121200e-02
  2.98009651e-02 2.17263792e-05]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 8.86043468e-06]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  1.90661970e-02 0.00000000e+00]
 ...
 [4.62121200e-02 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  1.77975527e-02 5.58752022e-05]
 [2.98009651e-02 0.00000000e+00 1.90661970e-02 ... 1.77975527e-02
  1.00000000e+00 7.63977721e-02]
 [2.17263792e-05 8.86043468e-06 0.00000000e+00 ... 5.58752022e-05
  7.63977721e-02 1.00000000e+00]]


In [75]:
cosine_threshold = 0.2

# Degree centrality: a sentence's score is the number of OTHER sentences whose
# similarity to it exceeds the threshold.
# The diagonal is cleared explicitly rather than by subtracting the identity:
# idfCosine returns 0 for a sentence whose tf-idf vector has zero norm, so its
# diagonal entry never passes the threshold and subtracting 1 would yield -1.
cross_connectivity_matrix = np.array(similarity_matrix > cosine_threshold, dtype=float)
np.fill_diagonal(cross_connectivity_matrix, 0.)

# Column sums give one plain float per sentence (a flat list, not nested lists).
sentence_degrees = cross_connectivity_matrix.sum(axis=0)
sentence_degrees_list = sentence_degrees.tolist()
sentences_with_score = list(zip(sentences, sentence_degrees_list))
# `itemgetter` is imported in an earlier cell of this notebook.
sorted_sentences_with_score = sorted(sentences_with_score, key=itemgetter(1), reverse=True)

# Print the 20 best-connected sentences, highest degree first.
for sentence_with_score in sorted_sentences_with_score[:20]:
    print(sentence_with_score[0])

At home, Americans will not be less resolute!
Hope has become the secret weapon of the forces of liberation!
We shall need also an abiding faith in the people, the kind of faith and courage which Franklin Delano Roosevelt always had!
Having to pay such a heavy price to make complete victory certain, America will never become a party to any plan for partial victory!
We well know today that such rights can be preserved only by constant vigilance , the eternal price of liberty!
Hope was not enough to beat back the aggressors as long as the peace-loving nations were unwilling to come to each other's defense.
The aggressors were beaten back only when the peace-loving nations united to defend themselves.
Only yesterday, we laid to rest the mortal remains of our beloved President, Franklin Delano Roosevelt.
We must carry on.Our departed leader never looked backward.
So much blood has already been shed for the ideals which we cherish, and for which Franklin Delano Roosevelt lived and died, tha

In [138]:
# Lex Rank logic
# Thresholded adjacency matrix (self links included) and the degree of each sentence.
cosine_thresholded_similarities = (similarity_matrix > cosine_threshold)
cosine_degrees = np.sum(cosine_thresholded_similarities, axis=0)

# Divide each row by the degree of its sentence.
# A sentence with a zero-norm tf-idf vector has no links at all (idfCosine
# returns 0 even against itself), so its degree is 0; clamping the denominator
# to 1 leaves that row all zeros instead of filling it with NaN, which would
# otherwise keep the power iteration from ever converging.
safe_cosine_degrees = np.maximum(cosine_degrees, 1)
degree_averaged_cosine_matrix = cosine_thresholded_similarities/safe_cosine_degrees.T

def powerMethod(matrix, tolerance, max_iterations=1000):
    """Run damped power iteration on ``matrix`` and return the score vector.

    The iterated matrix is ``M = d * U + (1 - d) * matrix`` with ``d = 0.15``
    and ``U`` the uniform matrix whose entries are all ``1 / N``. Starting
    from a column of ones, ``p <- M.T @ p`` is repeated until the Euclidean
    norm of the change drops below ``tolerance``.

    Parameters:
        matrix: square (N, N) ``np.ndarray`` or ``np.matrix``.
        tolerance: convergence threshold on ``||p_new - p||``.
        max_iterations: upper bound on the number of iterations, so that
            a non-converging input (e.g. one containing NaN) cannot loop forever.

    Returns:
        An (N, 1) column holding the latest iterate. If the iteration limit is
        reached first, a ``RuntimeWarning`` is issued and the last iterate
        is returned.
    """
    import warnings

    damping_factor = 0.15
    N = matrix.shape[0]
    p = np.ones((N, 1))
    if N == 0:
        # Nothing to rank; also avoids dividing by zero when building U below.
        return p

    U_kernel = np.ones(matrix.shape)/N
    B_kernel = matrix
    M = damping_factor * U_kernel + (1 - damping_factor)*B_kernel
    M_transposed = M.T

    for _ in range(max_iterations):
        # `@` is a matrix product for both ndarray and np.matrix, whereas `*`
        # would broadcast element-wise on plain ndarrays.
        p_new = M_transposed @ p
        error = np.linalg.norm(p_new - p)
        p = p_new
        if error < tolerance:
            break
    else:
        warnings.warn(
            "powerMethod did not converge within %d iterations" % max_iterations,
            RuntimeWarning,
        )

    return p

# LexRank scores: one row per sentence, iterated until the update norm falls below 0.001.
lexrank_array = powerMethod(matrix=degree_averaged_cosine_matrix, tolerance=0.001)

In [139]:
# Flatten the score column returned by powerMethod into plain Python floats so
# that sentences are ranked on scalars rather than on 1x1 matrix slices.
scores_array = np.asarray(lexrank_array).ravel().tolist()

sentences_with_score = list(zip(sentences, scores_array))
# `itemgetter` is imported in an earlier cell of this notebook.
sorted_sentences_with_score = sorted(sentences_with_score, key=itemgetter(1), reverse=True)

# Print the 20 sentences with the highest LexRank score, best first.
for sentence_with_score in sorted_sentences_with_score[:20]:
    print(sentence_with_score[0])

At home, Americans will not be less resolute!
Hope has become the secret weapon of the forces of liberation!
We must carry on.Our departed leader never looked backward.
We shall need also an abiding faith in the people, the kind of faith and courage which Franklin Delano Roosevelt always had!
Hope was not enough to beat back the aggressors as long as the peace-loving nations were unwilling to come to each other's defense.
The aggressors were beaten back only when the peace-loving nations united to defend themselves.
As I have assumed my heavy duties, I humbly pray Almighty God, in the words of King Solomon:
"Give therefore thy servant an understanding heart to judge thy people, that I may discern between good and bad; for who is able to judge this thy so great a people?"
At this moment, I have in my heart a prayer.
In that way, America may well lead the world to peace and prosperity.
May we Americans all live up to our glorious heritage.
We must learn to trade more with other nations s