LSA_summary.py
import numpy, scipy.sparse
from sparsesvd import sparsesvd
import sys
import re
from collections import Counter
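
## LSA_summary.py: read an article from stdin, split it into sentences, build a
## word-by-sentence frequency matrix, run a truncated SVD, score the sentences with
## Ozsoy's Cross method, and print the top-ranked sentences in document order.
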
def tokenize():
    data = sys.stdin.read()
    sentence_dict = {}
    ## A sentence boundary is a period followed by whitespace and a capital letter.
    pattern = r'.*?\.\s*[A-Z]'
    p = re.compile(pattern)
    m = None
    for m in p.finditer(data):
        if m.start() != 0 and m.end() != len(data):
            sentence_dict[len(sentence_dict)] = data[m.start()-1:m.end()-1]
        elif m.start() == 0 and m.end() != len(data):
            sentence_dict[len(sentence_dict)] = data[m.start():m.end()-1]
        elif m.start() == 0 and m.end() == len(data):
            sentence_dict[len(sentence_dict)] = data
        else:
            sentence_dict[len(sentence_dict)] = data[m.start()-1:m.end()]
    ## Whatever follows the last matched boundary is the final sentence.
    if m is not None and m.end() < len(data):
        sentence_dict[len(sentence_dict)] = data[m.end()-1:]
    elif m is None and data.strip():
        sentence_dict[len(sentence_dict)] = data
    Word_dict = {}
    for sen in sentence_dict:
        words = sentence_dict[sen].split()
        for word in words:
            word = word.lower()
            if word not in Word_dict:
                Word_dict[word] = len(Word_dict)
    return [sentence_dict, Word_dict]

def Compute_TF_IDF_matrix_of_sentence(Sentence_dict, Word_dict):
    ## Build a |words| x |sentences| matrix of raw term counts.
    M = numpy.zeros([len(Word_dict), len(Sentence_dict)])
    for sen in Sentence_dict:
        words = Sentence_dict[sen].split()
        words = [word.lower() for word in words]
        freqs = Counter(words)
        for key in freqs:
            M[Word_dict[key]][sen] = freqs[key]
    ## Using just the frequency of the word in the sentence, as IDF is not available;
    ## one could also weight each word based on whether it is a noun, verb, etc.
    return M

def heapify(L, index, k):
    ## Sift the entry at 'index' down within the min-heap formed by the first k
    ## elements of L; entries are (score, sentence_id) tuples keyed on the score.
    while 2*index + 1 < k:
        left = 2*index + 1
        right = 2*index + 2
        smallest = left
        if right < k and L[right][0] < L[left][0]:
            smallest = right
        if L[index][0] <= L[smallest][0]:
            return L
        L[index], L[smallest] = L[smallest], L[index]
        index = smallest
    return L

def LSA(M, k):  ## returns the top k sentences as (score, sentence_id) pairs
    SM = scipy.sparse.csc_matrix(M)  # convert to sparse CSC format
    ut, s, vt = sparsesvd(SM, k+10)  # truncated SVD; vt is the concept-by-sentence matrix
    ## SVD is calculated at this stage; from here on, various approaches can be used
    ## to filter out the top k sentences. We use Ozsoy's Cross method: in each concept
    ## (row of vt), zero out the sentences whose value is below the row average, then
    ## score each sentence by its singular-value-weighted column sum.
    concepts, n = vt.shape
    Avg = numpy.average(vt, 1)
    for i in range(0, concepts):
        for j in range(0, n):
            if vt[i][j] < Avg[i]:
                vt[i][j] = 0
    Length = numpy.dot(s, vt)
    L = []
    for i in range(0, n):
        L.append((Length[i], i))
    if k >= len(L):
        return L
    ## Build a min-heap over the first k entries ...
    count = k // 2 - 1
    while count >= 0:
        L = heapify(L, count, k)
        count -= 1
    ## ... then replace the heap minimum whenever a larger score is found.
    for i in range(k, len(L)):
        if L[0][0] < L[i][0]:
            L[0] = L[i]
            L = heapify(L, 0, k)
    return L[:k]

if __name__ == "__main__":
    [Sentence_dict, Word_dict] = tokenize()
    M = Compute_TF_IDF_matrix_of_sentence(Sentence_dict, Word_dict)
    L = LSA(M, 5)
    L = sorted(L, key=lambda s: s[1])
    print(L)
    for i in L:
        print(str(i[1]) + ':::' + str(Sentence_dict[i[1]]).strip())
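
## Example invocation (assuming the article text is in a file such as article.txt,
## a hypothetical name; the script reads whatever arrives on stdin):
##     python LSA_summary.py < article.txt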