In [1]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import glob
import math
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances


In [39]:
numbers = re.compile(r'(\d+)')

In [40]:
def numericalSort(value):
    '''
    đọc theo số thứ tự file txt
    '''
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

In [41]:
def loc_chu_so(url):
    '''
    xóa các số trong văn bản
    '''
    txt = []
    for i in sorted(glob.glob(url),key=numericalSort):  #lọc chữ số
        with open(i) as f:
            lines = f.readlines()
            if len(lines) == 0: lines = " "
            char_str = ['' .join((z for z in lines[0] if not z.isdigit())).lower()]
            txt.extend(char_str)
    return txt

In [42]:
def tien_xu_ly(i):
    '''
    Xóa các stopwords, đưa về từ gốc, tách từ
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    ps = PorterStemmer()
    stop = set(stopwords.words("english"))

    temp = tokenizer.tokenize(i)
    output = [w for w in temp if not w in stop]
    temp_stem=[ps.stem(ps.stem(word)) for word in output]
    #final = [word for word in temp_stem if wordnet.synsets(word)]
  
    final_final = [word for word in temp_stem if not len(word)<3]
    return final_final

In [43]:
def make_docs(txt):
    docs = []
    for i in txt:
        docs.append(tien_xu_ly(i))
    return docs


# Vector Space

## Hàm

In [44]:
def make_term(txt_term):

    '''
    tạo thành các term có dạng là một dict, từ -> số lượng tài liệu chứa nó, tần số lặp từ đó trong tất cả tài liệu. Vd:
    'deepest' : {So_luong_tai_lieu: 1,
                 Tan_so: 2
                }
    '''
    words = {}
    for i in txt_term:
      
        final_final=tien_xu_ly(i)
        
        for word in final_final:
            
            if word not in words: 
                words[word] = {
                "So_luong_tai_lieu" : [txt_term.index(i)+1], 
                "Tan_so" : 1,
                "Posting": None
                }

            else:
                words[word]["Tan_so"]+=1
                current_index = txt_term.index(i)+1
                if current_index not in words[word]["So_luong_tai_lieu"]: 
                    words[word]["So_luong_tai_lieu"].append(current_index)
    for i in words:
        words[i]["So_luong_tai_lieu"] = len(words[i]["So_luong_tai_lieu"])
    return words



In [45]:
def make_detail_term(txt_term):
    '''
    Chi tiết về term đó, từ -> số Id docs: số lần lặp trong doc đó. Vd
    conceived: {
                1016: 1
                1268: 1
    }
    
    '''
    words = {}
    for i in txt_term:
        final_final=tien_xu_ly(i)
        current_index = txt_term.index(i)+1
        for word in final_final:
            if (word not in words):
                words[word] = [[current_index,1,0]]
             
            else:  
                if words[word][-1][0] == current_index: words[word][-1][1] +=1 
                else: words[word].append([current_index,1,0])
            
            #print(list(words[word][0,:]))
       

    return words         


In [46]:
def make_index(term1, detail_term1):
    for word in term1:
        term1[word]["Posting"] = detail_term1[word]
    return term1 
    

In [47]:
def make_weight_for_vector(index,number):
    normal = np.zeros(number)
    for word in index:
        for i in index[word]["Posting"]:
            tf = i[1]
            idf = np.log10((number+1)/index[word]["Tan_so"])
            i[2] = tf*idf
            normal[i[0]-1] += pow((tf*idf),2)

    return index, np.sqrt(normal)

In [48]:
def normal_weight_vector(index,normal):
    for word in index:
        for i in index[word]["Posting"]:
            i[2] /= normal[i[0]-1]
    return index


In [49]:
def vector_space(query,normal_weight,txt,term):
    query = make_term(tien_xu_ly(query))
    
    relevance_scores = {}
    for i in range (0,len(txt)):
        doc_id = i+1
        vector_doc = []
        vector_query = []

        for word in term:
            
            if word not in txt[i]:
                term_doc = 0
            else: 
                index = list(np.array(normal_weight[word]["Posting"])[:,0]).index(doc_id)
                term_doc = normal_weight[word]['Posting'][index][2]

            ######

            if word not in query:
                count = 0
            else: count = query[word]['Tan_so']
            
            vector_query.append(count * np.log10((len(txt)+1)/term[word]["Tan_so"]))
            vector_doc.append(term_doc)


        relevance_scores[doc_id] = cosine_similarity(np.array(vector_query).reshape(1,-1),np.array(vector_doc).reshape(1,-1))
    sorted_value = sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True)
    return sorted_value
    

## Main

In [50]:
txt = loc_chu_so('./Cranfield/*')
docs = make_docs(txt)

In [51]:
term = make_term(txt)
detail_term = make_detail_term(txt)

In [52]:
make_index(term,detail_term)

{'experi': {'So_luong_tai_lieu': 411,
  'Tan_so': 671,
  'Posting': [[1, 3, 0],
   [7, 1, 0],
   [11, 1, 0],
   [12, 1, 0],
   [15, 1, 0],
   [16, 1, 0],
   [17, 2, 0],
   [19, 1, 0],
   [25, 2, 0],
   [29, 1, 0],
   [30, 4, 0],
   [35, 1, 0],
   [37, 1, 0],
   [39, 1, 0],
   [40, 1, 0],
   [41, 3, 0],
   [42, 1, 0],
   [43, 1, 0],
   [47, 1, 0],
   [52, 2, 0],
   [53, 1, 0],
   [58, 1, 0],
   [69, 1, 0],
   [70, 1, 0],
   [74, 1, 0],
   [78, 2, 0],
   [84, 2, 0],
   [99, 2, 0],
   [101, 1, 0],
   [103, 2, 0],
   [109, 2, 0],
   [112, 1, 0],
   [115, 1, 0],
   [121, 1, 0],
   [123, 3, 0],
   [131, 1, 0],
   [137, 1, 0],
   [139, 1, 0],
   [140, 1, 0],
   [142, 1, 0],
   [154, 1, 0],
   [156, 1, 0],
   [165, 1, 0],
   [167, 1, 0],
   [168, 1, 0],
   [170, 4, 0],
   [171, 2, 0],
   [173, 2, 0],
   [175, 1, 0],
   [176, 1, 0],
   [179, 3, 0],
   [182, 1, 0],
   [183, 2, 0],
   [184, 1, 0],
   [186, 3, 0],
   [187, 3, 0],
   [188, 2, 0],
   [189, 1, 0],
   [191, 1, 0],
   [195, 3, 0],
   [

In [81]:
not_normal, normal = make_weight_for_vector(term,len(txt))
normal_weight = normal_weight_vector(not_normal,normal)

In [87]:
vector_space('what are the structural and aeroelastic problems associated with flight of high speed aircraft',normal_weight,docs,term)

[(12, array([[0.43170185]])),
 (746, array([[0.30285514]])),
 (51, array([[0.28965039]])),
 (184, array([[0.27114796]])),
 (884, array([[0.19283036]])),
 (14, array([[0.19165478]])),
 (100, array([[0.18776911]])),
 (1169, array([[0.1821953]])),
 (875, array([[0.16691922]])),
 (141, array([[0.16592846]])),
 (862, array([[0.16589393]])),
 (253, array([[0.15506816]])),
 (781, array([[0.14871988]])),
 (284, array([[0.14587222]])),
 (579, array([[0.1439533]])),
 (1331, array([[0.14114391]])),
 (1361, array([[0.1409666]])),
 (47, array([[0.13404147]])),
 (810, array([[0.1338951]])),
 (497, array([[0.12954381]])),
 (725, array([[0.12942913]])),
 (1263, array([[0.12921865]])),
 (606, array([[0.12740996]])),
 (1163, array([[0.12599869]])),
 (726, array([[0.12528027]])),
 (78, array([[0.121985]])),
 (712, array([[0.11855274]])),
 (1158, array([[0.11509797]])),
 (909, array([[0.1103988]])),
 (792, array([[0.10833938]])),
 (202, array([[0.10720483]])),
 (1089, array([[0.10637781]])),
 (288, array(

# BIM

## Hàm

In [69]:
def make_term_bim(txt_term):

    '''
    tạo thành các term có dạng là một dict, từ -> số lượng tài liệu chứa nó, tần số lặp từ đó trong tất cả tài liệu. Vd:
    'deepest' : {So_luong_tai_lieu: 1,
                 Tan_so: 2
                }
    '''
    words = {}
    for i in txt_term:
      
        final_final=tien_xu_ly(i)
        
        for word in final_final:
            
            if word not in words: 
                words[word] = {
                "So_luong_tai_lieu" : [txt_term.index(i)+1], 
                "Tan_so" : 1,
                "Posting": None,
                "Weight" : None
                }

            else:
                words[word]["Tan_so"]+=1
                current_index = txt_term.index(i)+1
                if current_index not in words[word]["So_luong_tai_lieu"]: 
                    words[word]["So_luong_tai_lieu"].append(current_index)
    for i in words:
        words[i]["So_luong_tai_lieu"] = len(words[i]["So_luong_tai_lieu"])
    return words

In [70]:
def make_detail_term(txt_term):
    '''
    Chi tiết về term đó, từ -> số Id docs: số lần lặp trong doc đó. Vd
    conceived: {
                1016: 1
                1268: 1
    }
    
    '''
    words = {}
    for i in txt_term:
        final_final=tien_xu_ly(i)
        current_index = txt_term.index(i)+1
        for word in final_final:
            if (word not in words):
                words[word] = [[current_index,1,0]]
             
            else:  
                if words[word][-1][0] == current_index: words[word][-1][1] +=1 
                else: words[word].append([current_index,1,0])
            
            #print(list(words[word][0,:]))
       

    return words    

In [83]:
def relevant_docs(query,term):
    docs_id = []
    
    for word in query:
        docs_id.append(id for id in np.array(term[word]["Posting"])[:,0] if id not in docs_id)
    return docs_id

In [125]:
def bim1(term,txt):
    '''
    trường hợp 1 chưa có thông tin
    '''
    weights = {}

    for word in term:
        p = 0.5
        p_= term[word]['So_luong_tai_lieu']/(len(txt))
        #w = np.log2(p/p_)
        w = p/p_
        weights[word] = w


    return weights

In [84]:
def bim2(term,txt,query):
        '''
        trường hợp 2
        '''
        weights = {}
        relevant = relevant_docs(query,term)
        Nr = len(relevant)
        for word in term:
            rtd = 0
            for id in np.array(term[word]["Posting"])[:,0]:
                 if id in relevant: rtd+=1
            Ntd = term[word]['So_luong_tai_lieu']
            p = (rtd + 0.5) / (Nr + 1)
            p_= (Ntd-rtd+0.5)/(len(txt)-Nr+1)
            #w = math.log(p/p_)
            w = p/p_

            weights[word] = w
        
    
        return weights

In [71]:
def make_index(term1, detail_term1,weight):
    for word in term1:
        term1[word]["Posting"] = detail_term1[word]
        term1[word]["Weight"] = weight[word]
    return term1 

In [130]:
def relevant_bim(term,query,txt):
    relevance_scores = {}
    doc_id = 0
    query = make_term(tien_xu_ly(query))
    for i in range (0,len(txt)):
        doc_id = i+1
        score = -1
        for word in query:
            if word in txt[i]:
                if score == -1: score = 1
                score *= term[word]["Weight"]
              

        relevance_scores[doc_id] = score
    sorted_value = sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True)
    return sorted_value

## Main

### Trường hợp 1

In [129]:
txt = loc_chu_so('./Cranfield/*')
docs = make_docs(txt)
term = make_term(txt)
detail_term = make_detail_term(txt)
weight = bim1(term,txt)
make_index(term,detail_term,weight)

{'experi': {'So_luong_tai_lieu': 411,
  'Tan_so': 671,
  'Weight': 1.70316301703163,
  'Posting': [[1, 3, 0],
   [7, 1, 0],
   [11, 1, 0],
   [12, 1, 0],
   [15, 1, 0],
   [16, 1, 0],
   [17, 2, 0],
   [19, 1, 0],
   [25, 2, 0],
   [29, 1, 0],
   [30, 4, 0],
   [35, 1, 0],
   [37, 1, 0],
   [39, 1, 0],
   [40, 1, 0],
   [41, 3, 0],
   [42, 1, 0],
   [43, 1, 0],
   [47, 1, 0],
   [52, 2, 0],
   [53, 1, 0],
   [58, 1, 0],
   [69, 1, 0],
   [70, 1, 0],
   [74, 1, 0],
   [78, 2, 0],
   [84, 2, 0],
   [99, 2, 0],
   [101, 1, 0],
   [103, 2, 0],
   [109, 2, 0],
   [112, 1, 0],
   [115, 1, 0],
   [121, 1, 0],
   [123, 3, 0],
   [131, 1, 0],
   [137, 1, 0],
   [139, 1, 0],
   [140, 1, 0],
   [142, 1, 0],
   [154, 1, 0],
   [156, 1, 0],
   [165, 1, 0],
   [167, 1, 0],
   [168, 1, 0],
   [170, 4, 0],
   [171, 2, 0],
   [173, 2, 0],
   [175, 1, 0],
   [176, 1, 0],
   [179, 3, 0],
   [182, 1, 0],
   [183, 2, 0],
   [184, 1, 0],
   [186, 3, 0],
   [187, 3, 0],
   [188, 2, 0],
   [189, 1, 0],
   [19

In [131]:
relevant_bim(term,'what are the structural and aeroelastic problems associated with flight of high speed aircraft',txt)

[(12, 200419.74413139475),
 (14, 30635.589460084622),
 (172, 8109.420739434165),
 (78, 6013.048088957651),
 (1380, 5153.650563378721),
 (486, 3920.45560714167),
 (1089, 3382.7869370211088),
 (746, 3107.324073808583),
 (184, 2508.3000599651914),
 (202, 2508.3000599651914),
 (51, 1737.5164756533975),
 (364, 1563.9597140337319),
 (141, 1433.8081083431032),
 (810, 1140.4824530528308),
 (658, 993.9183229373248),
 (729, 991.983074562505),
 (798, 822.5269607140366),
 (792, 787.7723004021759),
 (29, 663.9617805790213),
 (100, 663.9617805790213),
 (836, 663.9617805790213),
 (781, 609.894877594276),
 (284, 551.3600751007077),
 (1361, 551.3600751007077),
 (1263, 522.7274142855559),
 (195, 458.62231186965124),
 (747, 458.62231186965124),
 (724, 414.60592899671263),
 (908, 414.60592899671263),
 (416, 363.5006471855754),
 (1300, 363.5006471855754),
 (1309, 349.8759541835675),
 (625, 343.11124646928386),
 (1170, 334.4400079953588),
 (374, 328.6135881677648),
 (685, 276.5201351804556),
 (1169, 265.591

## Trường hợp 2

In [85]:
term = make_term_bim(example)
detail_term = make_detail_term(example)
weight = bim2(term,['today','big'],example)
make_index(term,detail_term,weight)
relevant_bim(term,['today','big'],example)

KeyError: 'today big'

# Test với thư viện

In [None]:
D = text

In [497]:
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import StandardAnalyzer

schema = Schema(docid=STORED(), content=TEXT(stored=True, analyzer=StandardAnalyzer()))
ix = create_in("ind", schema)
writer = ix.writer()
for i in range(len(D)):
  writer.add_document(docid="{}".format(i+1), content=D[i])
writer.commit()


In [86]:
from whoosh import qparser
from whoosh import scoring
import whoosh.index as index

ind = index.open_dir("ind")
searcher = ind.searcher(weighting=scoring.TF_IDF())

parser = qparser.QueryParser("content", ind.schema, group=qparser.OrGroup)
query = parser.parse('what are the structural and aeroelastic problems associated with flight of high speed aircraft')
results = searcher.search(query, limit=None)
for i in range(len(results)):
  print(results[i]['docid'])

12
51
792
1169
14
1147
588
746
100
1263
810
156
712
724
172
1144
884
579
606
92
1163
1170
114
576
82
77
253
1341
876
804
1042
453
141
184
1089
364
1379
416
798
658
578
47
883
805
806
193
729
726
78
1392
574
33
811
1168
329
781
700
624
1167
429
288
52
908
36
1157
593
833
721
747
69
1380
1268
315
1095
262
456
914
1246
368
1158
878
846
1295
101
1320
1303
1338
1385
29
211
216
252
202
1165
1166
204
280
430
435
1063
486
731
1217
1309
373
695
42
1072
896
76
607
997
1375
311
1015
374
1300
875
275
812
968
1065
1002
58
187
441
1294
195
209
328
345
1051
350
422
425
542
641
648
733
777
921
1201
244
266
85
1299
976
1271
1356
251
297
722
756
406
540
672
24
198
263
293
874
321
476
1043
1087
962
1088
1111
1244
108
285
552
28
1361
75
214
302
309
378
481
493
675
799
982
991
213
316
687
717
1226
236
1134
1248
595
844
1097
836
415
497
725
1197
1328
650
840
640
163
625
917
948
554
1013
220
70
699
189
466
638
701
702
1297
1347
1012
25
162
431
516
715
758
813
969
1000
1162
1270
1291
1331
395
870
899
11
97
18