In [30]:
links = {
    'webpage-1': set(['webpage-2', 'webpage-4', 'webpage-5', 'webpage-6', 'webpage-8', 'webpage-9', 'webpage-10']),
    'webpage-2': set(['webpage-5', 'webpage-6']),
    'webpage-3': set(['webpage-10']),
    'webpage-4': set(['webpage-9']),
    'webpage-5': set(['webpage-2', 'webpage-4']),
    'webpage-6': set([]), # dangling page
    'webpage-7': set(['webpage-1', 'webpage-3', 'webpage-4']),
    'webpage-8': set(['webpage-1']),
    'webpage-9': set(['webpage-1', 'webpage-2', 'webpage-3', 'webpage-8', 'webpage-10']),
    'webpage-10': set(['webpage-2', 'webpage-3', 'webpage-8', 'webpage-9']),
}
 

In [31]:
def build_index(links):
    website_list = links.keys()
    return {website: index for index, website in enumerate(website_list)}
 
website_index = build_index(links)
print(website_index)
# {'webpage-10': 3, 'webpage-9': 0, 'webpage-8': 1, 'webpage-1': 2, 'webpage-3': 4, 'webpage-2': 5, 'webpage-5': 6, 'webpage-4': 7, 'webpage-7': 8, 'webpage-6': 9}
 

{'webpage-1': 0, 'webpage-2': 1, 'webpage-3': 2, 'webpage-4': 3, 'webpage-5': 4, 'webpage-6': 5, 'webpage-7': 6, 'webpage-8': 7, 'webpage-9': 8, 'webpage-10': 9}


In [32]:
import numpy as np
 
def build_transition_matrix(links, index):
    total_links = 0
    A = np.zeros((len(index), len(index)))
    for webpage in links:
        # dangling page
        if not links[webpage]:
            # Assign equal probabilities to transition to all the other pages
            A[index[webpage]] = np.ones(len(index)) / len(index)
        else:
            for dest_webpage in links[webpage]:
                total_links += 1
                A[index[webpage]][index[dest_webpage]] = 1.0 / len(links[webpage])
 
    return A
 
A = build_transition_matrix(links, website_index)
print(A)
 

[[0.         0.14285714 0.         0.14285714 0.14285714 0.14285714
  0.         0.14285714 0.14285714 0.14285714]
 [0.         0.         0.         0.         0.5        0.5
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]
 [0.         0.5        0.         0.5        0.         0.
  0.         0.         0.         0.        ]
 [0.1        0.1        0.1        0.1        0.1        0.1
  0.1        0.1        0.1        0.1       ]
 [0.33333333 0.         0.33333333 0.33333333 0.         0.
  0.         0.         0.         0.        ]
 [1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.2        0.2        0.2        0.         0.         0.
  0.         0.2        0.         0.2       ]
 [0.         0.25       0.2

In [33]:
A.shape

(10, 10)

In [116]:
def pagerank(A, eps=0.0001, d=0.85):
    P = np.ones(len(A)) / len(A)
    count = 0
    while True:
        new_P = np.ones(len(A)) * (1 - d) / len(A) + d * A.T.dot(P)
        
        vec1 = np.ones(len(A)) * (1 - d) / len(A)
        print('vec1:',vec1)
        
        vec2 = A.T.dot(P)
        print('vec2:',vec2)
        
        vec4 = d * vec2
        print('vec4:',vec4)
        
        vec3 = vec1 + vec4
        print('vec3:',vec3)
        print('new_P:',new_P)
        
        delta = abs(new_P - P).sum()
        print('delta:',delta)
        print('\n')
        if count == 1 :
            break
        count = count+1
        if delta <= eps:
            return new_P
        P = new_P
 
results = pagerank(A)
 
print("Results:", results) # [ 0.13933698,  0.09044235,  0.1300934 ,  0.13148714,  0.08116465, 0.1305122 ,  0.09427366,  0.085402  ,  0.02301397,  0.09427366]
# print(sum(results)) # 1.0
# print([item[0] for item in sorted(enumerate(results), key=lambda item: -item[1])]) # [0, 3, 5, 2, 6, 9, 1, 7, 4, 8]
 

vec1: [0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015]
vec2: [0.16333333 0.11928571 0.08833333 0.10761905 0.07428571 0.07428571
 0.01       0.06928571 0.14928571 0.14428571]
vec4: [0.13883333 0.10139286 0.07508333 0.09147619 0.06314286 0.06314286
 0.0085     0.05889286 0.12689286 0.12264286]
vec3: [0.15383333 0.11639286 0.09008333 0.10647619 0.07814286 0.07814286
 0.0235     0.07389286 0.14189286 0.13764286]
new_P: [0.15383333 0.11639286 0.09008333 0.10647619 0.07814286 0.07814286
 0.0235     0.07389286 0.14189286 0.13764286]
delta: 0.3124761904761905


vec1: [0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015 0.015]
vec2: [0.11791905 0.13165119 0.0784369  0.07669524 0.0879869  0.0879869
 0.00781429 0.09257976 0.17067738 0.14825238]
vec4: [0.10023119 0.11190351 0.06667137 0.06519095 0.07478887 0.07478887
 0.00664214 0.0786928  0.14507577 0.12601452]
vec3: [0.11523119 0.12690351 0.08167137 0.08019095 0.08978887 0.08978887
 0.02164214 0.0936928  0.16007577 0.14101452]
new

In [94]:
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
 
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
#     print('vector1: ', vector1)
#     print('vector2: ', vector2)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
#     print('vector1: ', vector1)
#     print('vector2: ', vector2)
 
    return 1 - cosine_distance(vector1, vector2)
 
# One out of 5 words differ => 0.8 similarity
print(sentence_similarity("This is a good sentence".split(), "This is a bad sentence".split()))
 
# One out of 2 non-stop words differ => 0.5 similarity
print(sentence_similarity("This is a good sentence".split(), "This is a bad sentence".split(), stopwords.words('english')))
 
# 0 out of 2 non-stop words differ => 1 similarity (identical sentences)
print(sentence_similarity("This is a good sentence".split(), "This is a good sentence".split(), stopwords.words('english')))
 
# Completely different sentences=> 0.0
print(sentence_similarity("This is a good sentence".split(), "I want to go to the market".split(), stopwords.words('english')))
 

0.7999999999999998
0.4999999999999999
0.9999999999999998
0.0


In [78]:
def getContent() :
    filepath = '/Users/seeni-2328/Documents/Seeni/Datasets/zoho/connect/sridhar_post.txt'
    F = open(filepath,'r') 
    content = []
    for i in F :
        for j in i.split('. ') :
            j = j.strip().lower()
            if len(j) > 0 :
                content.append(j.split())
#                 print(j)
#     print(content)
    return content
# sentence = getContent()
# print(sentence)

In [117]:
import numpy as np
 
 
# Get a text from the Brown Corpus
# sentences = brown.sents('ca01')
sentences = getContent()
 
# print(sentences)
# [[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.'], [u'The', u'jury', u'further', u'said', u'in', u'term-end', u'presentments', u'that', u'the', u'City', u'Executive', u'Committee', u',', u'which', u'had', u'over-all', u'charge', u'of', u'the', u'election', u',', u'``', u'deserves', u'the', u'praise', u'and', u'thanks', u'of', u'the', u'City', u'of', u'Atlanta', u"''", u'for', u'the', u'manner', u'in', u'which', u'the', u'election', u'was', u'conducted', u'.'], ...]
 
print(len(sentences))  #  98
 
# get the english list of stopwords
stop_words = stopwords.words('english')
 
 
def build_similarity_matrix(sentences, stopwords=None):
    # Create an empty similarity matrix
    S = np.zeros((len(sentences), len(sentences)))
 
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:
                continue
 
            S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2])
#             S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
 
    # normalize the matrix row-wise
    for idx in range(len(S)):
        S[idx] /= S[idx].sum()
 
    return S
 
S = build_similarity_matrix(sentences, stop_words)    
print(S[0])
S.shape

26
[0.         0.07413297 0.         0.05222536 0.06187012 0.07937287
 0.07031055 0.04710984 0.08347556 0.01670336 0.04043617 0.05653181
 0.00570528 0.00746997 0.         0.00570528 0.08068491 0.05653181
 0.06249826 0.         0.03768787 0.04607433 0.05222536 0.
 0.04762372 0.01562457]


(26, 26)

In [118]:
from operator import itemgetter 
 
sentence_ranks = pagerank(S)
 
print(sentence_ranks)
 
# Get the sentences ordered by rank
ranked_sentence_indexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]
print(ranked_sentence_indexes)
 
# Suppose we want the 5 most import sentences
SUMMARY_SIZE = 5
SELECTED_SENTENCES = sorted(ranked_sentence_indexes[:SUMMARY_SIZE])
print(SELECTED_SENTENCES)
 
# Fetch the most important sentences
summary = itemgetter(*SELECTED_SENTENCES)(sentences)
 
# Print the actual summary
for sentence in summary:
    print(' '.join(sentence))

vec1: [0.00576923 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923
 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923
 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923
 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923 0.00576923
 0.00576923 0.00576923]
vec2: [0.04535467 0.05334193 0.00744825 0.0494695  0.05408504 0.05140178
 0.04399183 0.03618195 0.05678306 0.01515823 0.05112926 0.04762827
 0.02409259 0.06446579 0.01671087 0.02858216 0.05173996 0.04936679
 0.04045405 0.01591658 0.03089873 0.043672   0.04193135 0.00151732
 0.06919835 0.00947968]
vec4: [0.03855147 0.04534064 0.00633101 0.04204907 0.04597229 0.04369152
 0.03739305 0.03075466 0.0482656  0.0128845  0.04345987 0.04048403
 0.0204787  0.05479592 0.01420424 0.02429484 0.04397896 0.04196177
 0.03438594 0.0135291  0.02626392 0.0371212  0.03564165 0.00128972
 0.0588186  0.00805773]
vec3: [0.0443207  0.05110987 0.01210024 0.0478183  0.05174152 0.04946075
 0.04316228 0.03652389 0.05403483

TypeError: 'NoneType' object is not iterable

In [8]:
def textrank(sentences, top_n=5, stopwords=None):
    """
    sentences = a list of sentences [[w11, w12, ...], [w21, w22, ...], ...]
    top_n = how may sentences the summary should contain
    stopwords = a list of stopwords
    """
    S = build_similarity_matrix(sentences, stop_words) 
    sentence_ranks = pagerank(S)
 
    # Sort the sentence ranks
    ranked_sentence_indexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]
    selected_sentences = sorted(ranked_sentence_indexes[:top_n])
    summary = itemgetter(*selected_sentences)(sentences)
    return summary
 
for idx, sentence in enumerate(textrank(sentences, stopwords=stopwords.words('english'))):
    print("%s. %s" % ((idx + 1), ' '.join(sentence)))
 
# 1. `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
# 2. Nevertheless , `` we feel that in the future Fulton County should receive some portion of these available funds '' , the jurors said .
# 3. -- After a long , hot controversy , Miller County has a new school superintendent , elected , as a policeman put it , in the `` coolest election I ever saw in this county '' .
# 4. `` This was the coolest , calmest election I ever saw '' , Colquitt Policeman Tom Williams said .
# 5. `` Everything went real smooth '' , the sheriff said .
 

1. `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
2. Nevertheless , `` we feel that in the future Fulton County should receive some portion of these available funds '' , the jurors said .
3. -- After a long , hot controversy , Miller County has a new school superintendent , elected , as a policeman put it , in the `` coolest election I ever saw in this county '' .
4. `` This was the coolest , calmest election I ever saw '' , Colquitt Policeman Tom Williams said .
5. `` Everything went real smooth '' , the sheriff said .
