## Indexing & Searching

In [1]:
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

### koleksi satu (sample = 5)

In [2]:
import xml.dom.minidom as minidom

In [3]:
# Parse data.xml (collection 1) into a minidom DOM document.
doc_xml_sample = minidom.parse("data.xml")

In [7]:
# Collect the <sym> and <disease> elements of collection 1.
# Later cells use the <sym> text as the document key of the proximity index
# and the <disease> text as the document body.
all_doc_no = doc_xml_sample.getElementsByTagName('sym')
all_text = doc_xml_sample.getElementsByTagName('disease')
# NOTE(review): the document count is taken from <sym> only, while all_text is
# indexed with it later — assumes both lists have the same length; TODO confirm.
N_DOC_sample = len(all_doc_no)

In [8]:
N_DOC_sample

200869

In [9]:
# Raw text of every document: a single leading space followed by the text of
# the first child node of the matching all_text element.
all_sentence_doc_sample = [
    ' ' + all_text[doc_idx].firstChild.data
    for doc_idx in range(N_DOC_sample)
]

In [10]:
all_sentence_doc_sample

[' Del (2) (p22-p21)',
 ' Del (2) (p22-p21)',
 ' Del (2) (p22-p21)',
 ' Del (2) (p22-p21)',
 ' Del (2) (p22-p21)',
 ' 2q22-q24 deletion',
 ' 2q22-q24 deletion',
 ' 2q22-q24 deletion',
 ' 2q22-q24 deletion',
 ' 2q22-q24 deletion',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Ablinism i syndrome',
 ' Hunter-macpherson syndrome',
 ' Hunter-macpherson syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Ventricular familial preexcitation syndrome',
 ' Abdominal cramps',
 ' Abdominal chemodectomas with cutaneous a

In [9]:
# Per-document token lists for collection 1 (one inner list per document).
token_doc_sample=[]

In [10]:
def remove_punc_tokenize(sentence_sample):
    """Strip URLs and punctuation from a text and split it into word tokens.

    Parameters
    ----------
    sentence_sample : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens of two or more word characters, in document order and with
        their original casing.
    """
    # URLs must be removed first: once ':' and '/' have been blanked out by
    # the punctuation step, no URL pattern can match any more.
    sentence_sample = re.sub(r'https?://\S+', ' ', sentence_sample)

    # Replace every ASCII punctuation character with a space in one pass.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentence_sample = sentence_sample.translate(punct_to_space)

    # Same pattern as CountVectorizer's default token_pattern, applied
    # directly so that no vectorizer object is built for every document.
    return re.findall(r'(?u)\b\w\w+\b', sentence_sample)

In [11]:
# Tokenize every document of collection 1.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not add a second copy of every document.
token_doc_sample = [
    remove_punc_tokenize(all_sentence_doc_sample[doc_idx])
    for doc_idx in range(N_DOC_sample)
]

In [12]:
token_doc_sample

[['He', 'likes', 'to', 'wink', 'he', 'likes', 'to', 'drink'],
 ['He', 'likes', 'to', 'drink', 'and', 'drink', 'and', 'drink'],
 ['The', 'thing', 'he', 'likes', 'to', 'drink', 'is', 'ink'],
 ['The', 'ink', 'he', 'likes', 'to', 'drink', 'is', 'pink'],
 ['He', 'likes', 'to', 'wink', 'and', 'drink', 'pink', 'ink']]

## Case Folding 

In [29]:
# Case folding: convert every term to lower case.
def to_lower(tokens_sample):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens_sample:
        lowered.append(token.lower())
    return lowered

In [30]:
# Replace each document's token list with its lower-cased version.
for doc_idx in range(N_DOC_sample):
    doc_tokens = token_doc_sample[doc_idx]
    token_doc_sample[doc_idx] = to_lower(doc_tokens)

In [31]:
# Flatten the per-document token lists into one list.
all_tokens_sample = [
    w
    for doc_idx in range(N_DOC_sample)
    for w in token_doc_sample[doc_idx]
]

# Join the tokens into one string, tokenize that string again and append the
# result, so every token the tokenizer keeps ends up in the list a second time.
new_sentence_sample = ' '.join(all_tokens_sample)
all_tokens_sample.extend(CountVectorizer().build_tokenizer()(new_sentence_sample))

## Stopping 

In [16]:
# Stopping: drop English stop words (NLTK list) from every document's tokens.
from nltk.corpus import stopwords
stop_words_sample = set(stopwords.words('english'))
def stop_word_token(tokens):
    """Return the tokens that are not in stop_words_sample, order preserved."""
    kept = []
    for w in tokens:
        if w not in stop_words_sample:
            kept.append(w)
    return kept

for doc_idx in range(N_DOC_sample):
    token_doc_sample[doc_idx] = stop_word_token(token_doc_sample[doc_idx])

In [17]:
# Drop every token that contains at least one digit character.
for doc_idx in range(N_DOC_sample):
    digit_free = []
    for w in token_doc_sample[doc_idx]:
        if not any(ch.isdigit() for ch in w):
            digit_free.append(w)
    token_doc_sample[doc_idx] = digit_free

## Normalization

In [32]:
# Normalization: reduce every term to its Porter stem.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):
    """Replace every token in ``tokens`` with its Porter stem.

    The list is modified in place and the same list object is returned.
    """
    for idx, token in enumerate(tokens):
        # Stem each token exactly once; the stemmer call is the costly part.
        tokens[idx] = stemmer.stem(token)
    return tokens


for doc_idx in range(N_DOC_sample):
    token_doc_sample[doc_idx] = stemming(token_doc_sample[doc_idx])

In [33]:
token_doc_sample

[['like', 'wink', 'like', 'drink'],
 ['like', 'drink', 'drink', 'drink'],
 ['thing', 'like', 'drink', 'ink'],
 ['ink', 'like', 'drink', 'pink'],
 ['like', 'wink', 'drink', 'pink', 'ink']]

In [34]:
# Flatten the per-document token lists into one list of terms.
all_tokens = [
    w
    for doc_idx in range(N_DOC_sample)
    for w in token_doc_sample[doc_idx]
]

# The joined text is kept for any cell that reads it. It is no longer
# re-tokenized: that pass only re-appended tokens already in the list, and
# the list is reduced to a set of distinct terms right afterwards.
new_sentence = ' '.join(all_tokens)

In [35]:
# Deduplicate: keep each distinct term once.
all_tokens = set(all_tokens)

Implementasi Indexing dengan preprocessing Standar

#### Proximity Index

In [36]:
from itertools import count
try: 
    from itertools import izip as zip
except ImportError:
    pass
# Positional (proximity) index: term -> {document key -> [1-based positions]}.
# Built in a single pass over the documents instead of rescanning every
# document for every vocabulary term.
proximity_index = {token: {} for token in all_tokens}
for n in range(N_DOC_sample):
    # Positions of each vocabulary term inside document n.
    positions_in_doc = {}
    for position, term in enumerate(token_doc_sample[n], start=1):
        if term in proximity_index:
            positions_in_doc.setdefault(term, []).append(position)
    if not positions_in_doc:
        continue  # nothing to index, so the document key is never read
    # NOTE(review): documents that share the same key overwrite each other's
    # positions — confirm the keys are unique per document.
    doc_key = all_doc_no[n].firstChild.data
    for term, positions in positions_in_doc.items():
        proximity_index[term][doc_key] = positions

In [37]:
import collections
# Re-order the index alphabetically by term, then print one
# "term postings" line per term.
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key in proximity_index:
    value = proximity_index[key]
    print(key, value)

drink {'1': [4], '2': [2, 3, 4], '3': [3], '4': [3], '5': [3]}
ink {'3': [4], '4': [1], '5': [5]}
like {'1': [1, 3], '2': [1], '3': [2], '4': [2], '5': [1]}
pink {'4': [4], '5': [4]}
thing {'3': [1]}
wink {'1': [2], '5': [2]}


In [38]:
# Write the index to a text file: each term on its own line, followed by one
# tab-indented "document: pos,pos,..." line per document and a blank line.
# The with-block closes the file even if a write fails part-way through.
with open('hasil indexing setelah preprocessing standar_sample.txt','w') as file:
    for term, postings in proximity_index.items():
        file.write(term+'\n')
        for doc_key, positions in postings.items():
            file.write('\t'+str(doc_key)+': ')
            file.write(','.join(str(position) for position in positions))
            file.write('\n')
        file.write('\n')

In [39]:
# Print a running number for each index entry; the last number printed is
# the number of terms in the index.
for entry_no in range(1, len(proximity_index) + 1):
    print(entry_no)

1
2
3
4
5
6


In [40]:
# Python 2/3 compatibility shim: try to rebind `zip` to future_builtins.zip,
# then to itertools.izip; if neither import succeeds the existing `zip` is kept.
try:
    from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
    try:
        from itertools import izip as zip # < 2.5 or 3.x
    except ImportError:
        pass

In [41]:
import itertools
# Use itertools.izip when the running Python provides it; otherwise keep the
# `zip` that is already bound.
zip = getattr(itertools, 'izip', zip)

### implementasi Searching

In [44]:
# Term-document incidence matrix: term -> list with '1' where the document
# contains the term and '0' where it does not (one entry per document).
# Each document is turned into a set once so every membership test is O(1)
# instead of a scan of the document's token list.
doc_term_sets = [set(token_doc_sample[i]) for i in range(N_DOC_sample)]
collection_matrix = {
    token: ['1' if token in terms else '0' for terms in doc_term_sets]
    for token in all_tokens
}

### koleksi 2 : trec.sample = 1000

In [45]:
import xml.dom.minidom as minidom

In [93]:
# Parse trec.sample.xml (collection 2) into a minidom DOM document.
doc_xml = minidom.parse("trec.sample.xml")

In [94]:
# Collect the DOCNO, HEADLINE and TEXT elements of collection 2.
# NOTE(review): all_doc_no and all_text are the same names used for
# collection 1, so after this cell the collection-1 cells that read them see
# collection-2 data — consider distinct names.
all_doc_no = doc_xml.getElementsByTagName('DOCNO')
all_headline = doc_xml.getElementsByTagName('HEADLINE')
all_text = doc_xml.getElementsByTagName('TEXT')
# Document count is taken from the DOCNO elements.
N_DOC = len(all_doc_no)

In [95]:
N_DOC

1000

In [96]:
# Raw text of every document: headline text, one space, then body text.
all_sentence_doc = [
    all_headline[doc_idx].firstChild.data + ' ' + all_text[doc_idx].firstChild.data
    for doc_idx in range(N_DOC)
]

In [97]:
all_sentence_doc[0]

"\nFT  14 MAY 91 / (CORRECTED) Jubilee of a jet that did what it was designed\nto do\n \nCorrection (published 16th May 1991) appended to this article.\n'FRANK, it flies]' shouted someone at Sir Frank Whittle during the maiden\nflight of a British jet. 'Of course it does,' replied Sir Frank, who\npatented the first aircraft gas turbine. 'That's what it was bloody well\ndesigned to do, wasn't it?'\nExactly 50 years ago yesterday, the first British jet made a brief 17-minute\nflight from RAF Cranwell in Lincolnshire. To celebrate the event, Mr Eric\n'Winkle' Brown, a 72-year-old test pilot of the prototype Gloster Whittle\njet, Mr Geoffrey Bone, a 73-year-old engineer, and Mr Charles McClure, a\n75-year-old pilot, returned to RAF Cranwell. They are seen in front of a\nrestored Meteor NF 11. Sir Frank was unable to attend because of ill-health.\nThe Gloster Whittle was not the first jet to fly: a Heinkel 178 had its\nmaiden flight in August 1939, 21 months before the British aircraft.\nCo

In [98]:
# Per-document token lists for collection 2 (one inner list per document).
tokens_doc = []

###### Remove Punctuation, URL & Tokenize

In [99]:
def remove_punc_tokenize(sentence):
    """Strip URLs and punctuation from a text and split it into word tokens.

    Parameters
    ----------
    sentence : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens of two or more word characters, in document order and with
        their original casing.
    """
    # URLs must be removed first: once ':' and '/' have been blanked out by
    # the punctuation step, no URL pattern can match any more.
    sentence = re.sub(r'https?://\S+', ' ', sentence)

    # Replace every ASCII punctuation character with a space in one pass.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentence = sentence.translate(punct_to_space)

    # Same pattern as CountVectorizer's default token_pattern, applied
    # directly so that no vectorizer object is built for every document.
    return re.findall(r'(?u)\b\w\w+\b', sentence)

In [100]:
# Tokenize every document of collection 2.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not add a second copy of every document.
tokens_doc = [
    remove_punc_tokenize(all_sentence_doc[doc_idx])
    for doc_idx in range(N_DOC)
]

In [101]:
tokens_doc[0]

['FT',
 '14',
 'MAY',
 '91',
 'CORRECTED',
 'Jubilee',
 'of',
 'jet',
 'that',
 'did',
 'what',
 'it',
 'was',
 'designed',
 'to',
 'do',
 'Correction',
 'published',
 '16th',
 'May',
 '1991',
 'appended',
 'to',
 'this',
 'article',
 'FRANK',
 'it',
 'flies',
 'shouted',
 'someone',
 'at',
 'Sir',
 'Frank',
 'Whittle',
 'during',
 'the',
 'maiden',
 'flight',
 'of',
 'British',
 'jet',
 'Of',
 'course',
 'it',
 'does',
 'replied',
 'Sir',
 'Frank',
 'who',
 'patented',
 'the',
 'first',
 'aircraft',
 'gas',
 'turbine',
 'That',
 'what',
 'it',
 'was',
 'bloody',
 'well',
 'designed',
 'to',
 'do',
 'wasn',
 'it',
 'Exactly',
 '50',
 'years',
 'ago',
 'yesterday',
 'the',
 'first',
 'British',
 'jet',
 'made',
 'brief',
 '17',
 'minute',
 'flight',
 'from',
 'RAF',
 'Cranwell',
 'in',
 'Lincolnshire',
 'To',
 'celebrate',
 'the',
 'event',
 'Mr',
 'Eric',
 'Winkle',
 'Brown',
 '72',
 'year',
 'old',
 'test',
 'pilot',
 'of',
 'the',
 'prototype',
 'Gloster',
 'Whittle',
 'jet',
 'Mr',


##### Case Folding to lower case

Case folding adalah tahapan mengubah term menjadi lower case.

In [102]:
def to_lower(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [103]:
# Replace each document's token list with its lower-cased version.
for doc_idx in range(N_DOC):
    doc_tokens = tokens_doc[doc_idx]
    tokens_doc[doc_idx] = to_lower(doc_tokens)

##### Stopping, remove number & Stemming

Stopping dan remove number merupakan tahapan untuk menghapus stop word (kata umum bahasa Inggris) serta term yang mengandung angka.

In [104]:
# Stopping: drop English stop words (NLTK list) from every document's tokens.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stop_word_token(tokens):
    """Return the tokens that are not in stop_words, order preserved."""
    kept = []
    for w in tokens:
        if w not in stop_words:
            kept.append(w)
    return kept

for doc_idx in range(N_DOC):
    tokens_doc[doc_idx] = stop_word_token(tokens_doc[doc_idx])

In [105]:
# Drop every token that contains at least one digit character.
for doc_idx in range(N_DOC):
    digit_free = []
    for w in tokens_doc[doc_idx]:
        if not any(ch.isdigit() for ch in w):
            digit_free.append(w)
    tokens_doc[doc_idx] = digit_free

### normalization

Stemming mengubah term ke bentuk kata dasar

In [106]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):
    """Replace every token in ``tokens`` with its Porter stem.

    The list is modified in place and the same list object is returned.
    """
    for idx, token in enumerate(tokens):
        # Stem each token exactly once; the stemmer call is the costly part.
        tokens[idx] = stemmer.stem(token)
    return tokens


for doc_idx in range(N_DOC):
    tokens_doc[doc_idx] = stemming(tokens_doc[doc_idx])

In [107]:
tokens_doc[0]

['ft',
 'may',
 'correct',
 'jubile',
 'jet',
 'design',
 'correct',
 'publish',
 'may',
 'append',
 'articl',
 'frank',
 'fli',
 'shout',
 'someon',
 'sir',
 'frank',
 'whittl',
 'maiden',
 'flight',
 'british',
 'jet',
 'cours',
 'repli',
 'sir',
 'frank',
 'patent',
 'first',
 'aircraft',
 'ga',
 'turbin',
 'bloodi',
 'well',
 'design',
 'exactli',
 'year',
 'ago',
 'yesterday',
 'first',
 'british',
 'jet',
 'made',
 'brief',
 'minut',
 'flight',
 'raf',
 'cranwel',
 'lincolnshir',
 'celebr',
 'event',
 'mr',
 'eric',
 'winkl',
 'brown',
 'year',
 'old',
 'test',
 'pilot',
 'prototyp',
 'gloster',
 'whittl',
 'jet',
 'mr',
 'geoffrey',
 'bone',
 'year',
 'old',
 'engin',
 'mr',
 'charl',
 'mcclure',
 'year',
 'old',
 'pilot',
 'return',
 'raf',
 'cranwel',
 'seen',
 'front',
 'restor',
 'meteor',
 'nf',
 'sir',
 'frank',
 'unabl',
 'attend',
 'ill',
 'health',
 'gloster',
 'whittl',
 'first',
 'jet',
 'fli',
 'heinkel',
 'maiden',
 'flight',
 'august',
 'month',
 'british',
 'aircr

In [108]:
# Flatten the per-document token lists into one list of terms.
all_tokens = [
    term
    for doc_idx in range(N_DOC)
    for term in tokens_doc[doc_idx]
]

# The joined text is kept for any cell that reads it. It is no longer
# re-tokenized: that pass only re-appended tokens already in the list, and
# the list is reduced to a set of distinct terms right afterwards.
new_sentences = ' '.join(all_tokens)

#### remove duplicate

In [109]:
# Deduplicate: keep each distinct term once.
all_tokens = set(all_tokens)

##### Proximity index

In [110]:
from itertools import count
try: 
    from itertools import izip as zip
except ImportError:
    pass
# Positional (proximity) index: term -> {DOCNO -> [1-based positions]}.
# Built in a single pass over the documents instead of rescanning every
# document for every vocabulary term.
proximity_index = {token: {} for token in all_tokens}
for n in range(N_DOC):
    # Positions of each vocabulary term inside document n.
    positions_in_doc = {}
    for position, term in enumerate(tokens_doc[n], start=1):
        if term in proximity_index:
            positions_in_doc.setdefault(term, []).append(position)
    if not positions_in_doc:
        continue  # nothing to index, so the document key is never read
    # NOTE(review): documents that share the same DOCNO overwrite each
    # other's positions — confirm DOCNO values are unique.
    doc_key = all_doc_no[n].firstChild.data
    for term, positions in positions_in_doc.items():
        proximity_index[term][doc_key] = positions

In [111]:
import collections
# Re-order the index alphabetically by term, then print one
# "term postings" line per term.
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key in proximity_index:
    value = proximity_index[key]
    print(key, value)

aa {'245': [53]}
aaf {'351': [835, 1299]}
ab {'19': [46]}
abalkin {'112': [114]}
abandon {'12': [153], '31': [97], '48': [162], '74': [8], '140': [645], '151': [108], '223': [151], '307': [357], '323': [212], '354': [93], '3363': [86], '3381': [144], '3407': [544], '3410': [83], '3448': [538], '3459': [423], '3504': [196], '3597': [103], '3637': [226], '3718': [74], '3783': [184], '3867': [324], '3932': [305]}
abat {'3926': [217]}
abb {'129': [175, 372, 383, 404, 420, 437, 484, 489, 511, 621, 661]}
abbado {'263': [155]}
abbott {'17': [70, 130, 168], '348': [78], '351': [1055], '3693': [947]}
abbrevi {'3442': [61]}
abc {'3337': [134], '3708': [8, 24, 47, 96], '3818': [9, 22, 47, 100, 125, 152, 193]}
abci {'3369': [582]}
abduct {'272': [160]}
abdul {'261': [49]}
abel {'264': [377]}
abela {'3401': [236]}
aberdeen {'141': [904], '3327': [105], '3330': [44, 65], '3828': [102]}
abhor {'48': [144]}
abid {'3407': [194], '3643': [362]}
abidin {'3939': [44]}
abil {'30': [102], '57': [50], '59': 

atr {'246': [44, 70]}
atrium {'342': [80], '350': [113]}
atroc {'3543': [90]}
attach {'139': [49], '198': [218], '242': [85], '307': [102], '309': [135], '337': [147], '352': [111, 141], '3366': [95], '3405': [59], '3407': [327], '3432': [302], '3645': [514], '3870': [98], '3889': [94], '3917': [396, 428], '3920': [24]}
attack {'40': [184], '57': [94], '87': [21], '90': [66], '95': [20, 58], '98': [107], '111': [20], '117': [28], '127': [209], '157': [8], '215': [5, 18], '225': [59, 183], '266': [337], '271': [78], '272': [6, 131], '286': [6, 13, 36, 192, 208], '3328': [24], '3329': [43, 65], '3364': [54], '3383': [174], '3396': [54, 85], '3405': [489], '3432': [78], '3456': [18], '3461': [26], '3493': [256], '3503': [59, 1057], '3507': [147], '3534': [84, 131], '3545': [295], '3561': [71, 301], '3562': [154], '3641': [265, 442], '3655': [110, 134], '3678': [6, 10], '3680': [5, 41], '3686': [24], '3691': [158], '3699': [393], '3726': [256, 389], '3737': [28], '3750': [4], '3760': [6], 

brackman {'3364': [49]}
bradburi {'3792': [70]}
bradford {'26': [60], '64': [70], '155': [58, 101], '274': [136, 236, 241], '351': [430, 754], '368': [165]}
bradi {'133': [617]}
bradley {'351': [218]}
bradshaw {'59': [14]}
bradstreet {'3459': [744], '3693': [740]}
braemar {'189': [153]}
brage {'3793': [71]}
brahm {'263': [170]}
brahmin {'218': [316]}
brain {'135': [839], '368': [42], '3375': [30, 68, 115]}
braithwait {'86': [181]}
brake {'3354': [158], '3403': [451], '3693': [860], '3771': [357]}
branch {'21': [487], '78': [208, 218], '126': [78], '134': [296], '199': [87], '348': [139], '372': [277, 283], '3336': [96, 142], '3337': [115], '3396': [96], '3437': [277], '3444': [489], '3494': [180], '3534': [97, 154], '3641': [279], '3715': [191], '3723': [73], '3776': [103], '3781': [104, 127, 154, 377], '3783': [314, 335, 345, 353], '3791': [127, 132, 151, 199, 279, 331, 417], '3792': [228, 494, 499, 506], '3819': [50, 59, 81], '3869': [94], '3876': [186]}
brand {'186': [217, 233, 280,

cinderella {'3871': [48]}
cindi {'3894': [95]}
cinema {'55': [141, 281, 285, 306], '247': [151], '3366': [6, 480]}
cinquecento {'3660': [47, 78, 107, 180]}
cir {'3569': [240], '3695': [297], '3819': [144]}
circl {'88': [27], '214': [263], '262': [64], '3449': [111], '3727': [310], '3734': [6]}
circuit {'103': [182], '130': [475, 479], '136': [373, 375, 419], '139': [354], '201': [143, 256], '367': [169], '3756': [134], '3916': [64]}
circuitri {'3916': [329]}
circul {'84': [23], '115': [115], '135': [749], '148': [166], '3369': [181, 204, 365, 529, 534, 537], '3397': [136], '3438': [217], '3445': [184], '3571': [242], '3701': [230], '3799': [200], '3911': [104], '3946': [119]}
circular {'94': [34], '140': [322], '350': [143]}
circumscrib {'65': [360]}
circumspect {'327': [502]}
circumst {'2': [33], '49': [164], '132': [505], '252': [271], '308': [488], '3331': [145], '3532': [72, 137], '3543': [293], '3549': [77], '3646': [83], '3654': [105, 235], '3745': [179], '3793': [172], '3857': [

cutter {'207': [134], '327': [231]}
cuyp {'3640': [42]}
cvm {'3805': [165, 184]}
cvr {'154': [27, 55, 88, 92, 111, 139, 162, 198, 224, 277, 308, 317, 354]}
cw {'3333': [10, 110, 141, 144, 161]}
cyanamid {'3459': [732], '3693': [724]}
cycl {'59': [428], '103': [148], '135': [636], '136': [265], '137': [587], '139': [477], '154': [52], '341': [462], '3346': [122], '3354': [203], '3361': [119], '3411': [165], '3414': [304], '3415': [246], '3417': [396, 468], '3487': [100], '3503': [287], '3506': [44], '3625': [190], '3654': [324], '3800': [68], '3849': [89], '3852': [117], '3906': [44], '3908': [564], '3915': [312]}
cyclic {'101': [343], '129': [206], '169': [622], '221': [89], '248': [98], '328': [401]}
cyclist {'3368': [27, 158, 202]}
cyclon {'124': [33]}
cymru {'125': [229], '3524': [164]}
cynic {'354': [550]}
cypru {'3408': [245, 289, 300, 314, 531, 568], '3421': [5, 74, 156]}
cyrpu {'3421': [186]}
czarist {'3637': [165]}
czech {'339': [16, 25], '3418': [430], '3856': [432], '3902': [

drunk {'3400': [109]}
drunkenli {'3637': [229]}
dsm {'3386': [10, 49]}
dt {'3911': [415]}
dti {'88': [136, 201], '133': [823, 847], '251': [113, 302], '309': [77, 296, 427], '3645': [478, 495]}
dtp {'3911': [12, 83, 154, 206, 208, 271, 277, 301, 318, 403], '3923': [226]}
du {'31': [170, 174], '194': [7, 15, 80], '3442': [251]}
dual {'210': [182], '225': [144], '3358': [70], '3450': [294], '3484': [245], '3693': [997], '3913': [299]}
dub {'3395': [240], '3561': [101], '3645': [136]}
dubai {'103': [10, 38, 89, 176, 242], '3739': [117]}
dubarri {'194': [245]}
dubiou {'169': [605], '192': [29], '198': [506], '265': [63], '309': [451], '3362': [155], '3792': [292]}
dublin {'32': [14], '57': [73], '85': [151, 157, 172], '249': [152, 171], '369': [142], '3550': [29], '3837': [23], '3940': [19]}
duce {'30': [13]}
duck {'368': [318], '3527': [6], '3757': [133], '3901': [59]}
due {'21': [31, 213, 279, 407], '25': [16], '28': [111], '31': [270], '32': [64], '55': [757], '76': [121, 150], '88': [1

felt {'58': [163], '65': [340], '101': [12], '111': [84], '196': [163], '214': [255], '272': [245, 399], '311': [175, 188, 601], '374': [138], '3342': [53], '3370': [656], '3409': [139], '3416': [220], '3459': [400], '3462': [107], '3479': [67], '3498': [20], '3535': [244], '3618': [42], '3639': [516, 575], '3728': [410], '3744': [140], '3786': [445], '3922': [274], '3923': [96], '3933': [71], '3939': [331]}
femal {'207': [15, 29, 59, 130], '3409': [331], '3886': [73]}
fen {'3439': [437, 441]}
fenc {'3724': [413]}
fenchurch {'351': [433]}
fenosa {'3602': [104, 172, 264, 362]}
ferdinand {'3796': [387]}
fernandez {'3374': [125]}
fernando {'355': [14]}
feroci {'56': [212]}
ferranti {'371': [185]}
ferri {'301': [18], '3365': [61]}
ferruzzi {'3603': [15]}
fertil {'3434': [186], '3436': [353]}
fertilis {'16': [322], '3439': [391]}
fervour {'3906': [240]}
festiv {'58': [8, 57], '194': [145], '196': [8, 57, 437], '263': [79], '265': [133, 413], '267': [6, 11, 42], '270': [38], '337': [7, 19], 

glossi {'3369': [623], '3395': [93], '3412': [570], '3871': [39]}
gloster {'1': [60, 89, 105]}
gloucest {'3578': [12]}
glove {'3415': [53]}
glow {'262': [384], '3368': [30], '3640': [331]}
gluck {'264': [18, 30]}
gluckian {'264': [183]}
glut {'3489': [151]}
glycol {'3703': [95]}
glyn {'3500': [57]}
glyndebourn {'265': [32, 46, 55]}
gm {'242': [27, 84, 97, 119], '3388': [296], '3660': [301, 326], '3931': [404]}
gmf {'129': [183]}
gmt {'52': [32]}
gnarl {'3640': [194]}
gnp {'287': [56, 116, 148, 172, 211, 435, 468, 476, 499, 507, 530, 538, 561, 569, 592, 600, 624, 632, 651, 687], '3405': [133, 152], '3440': [336], '3442': [275], '3443': [293], '3522': [41, 90, 122, 189], '3666': [115, 121], '3692': [352]}
go {'2': [50], '16': [121], '17': [156, 172], '34': [96], '36': [101], '42': [116], '48': [47, 96, 124], '51': [23], '59': [424], '78': [46], '97': [5], '101': [478], '127': [266], '131': [434], '132': [211], '133': [359], '134': [210, 556], '136': [284], '137': [583], '139': [316], '14

huge {'48': [319], '58': [238], '91': [68], '113': [49], '129': [270], '141': [407], '186': [97], '196': [238], '254': [83], '264': [483], '295': [85], '308': [390], '340': [13], '354': [490], '370': [276], '3398': [350], '3410': [46], '3436': [88], '3439': [122, 248, 403], '3443': [364], '3444': [267], '3598': [55], '3623': [213], '3641': [201], '3663': [58], '3724': [533], '3726': [105], '3785': [182], '3793': [411, 611]}
hugh {'27': [75], '60': [292], '78': [59, 299, 315], '126': [199], '366': [446], '3337': [99], '3368': [135], '3513': [36], '3795': [262]}
hugo {'162': [6, 13, 55, 81, 128, 147], '3514': [18], '3926': [567]}
huguenot {'3361': [122]}
hui {'3893': [54]}
hulett {'3693': [975]}
hull {'133': [217, 233, 310, 325, 378]}
hulm {'3512': [105], '3647': [202]}
hum {'3856': [367]}
human {'4': [19], '132': [400], '135': [19, 39, 65, 79, 104, 112, 138, 142, 176, 637, 645, 838], '140': [11, 663], '176': [100], '189': [122], '204': [36], '266': [177], '354': [28], '3356': [154], '33

jouster {'3366': [540]}
jox {'3545': [99]}
joy {'14': [11], '193': [227]}
joyn {'170': [23]}
joynson {'200': [641]}
joyou {'193': [180]}
jozef {'3856': [297]}
jp {'14': [179], '200': [160, 178, 212, 224, 279, 327, 508, 591], '220': [46], '3579': [167]}
ju {'3440': [211]}
jubil {'241': [635]}
jubile {'1': [4]}
judaism {'3360': [59, 341]}
judg {'78': [319], '109': [6, 65, 72], '116': [59], '169': [151], '186': [267], '194': [138], '226': [491], '255': [174, 517, 784], '269': [35], '286': [238], '309': [110, 143], '328': [545, 576], '335': [50], '366': [262], '3329': [260, 336], '3374': [119], '3377': [16, 44], '3418': [271], '3457': [21], '3495': [182], '3526': [5, 33, 201, 218], '3637': [80], '3644': [303, 376, 474], '3645': [279], '3646': [118, 174], '3670': [156], '3752': [10, 57, 75], '3849': [6], '3878': [78], '3895': [180], '3907': [284]}
judgement {'3843': [113]}
judgment {'73': [27, 37], '89': [82], '200': [633], '254': [150], '283': [133], '286': [217], '354': [484], '3360': [40

marshal {'349': [18, 22], '3449': [390], '3739': [148], '3856': [296], '3886': [143, 212], '3914': [478]}
marsham {'3533': [586], '3654': [351], '3734': [373]}
marston {'3633': [130]}
marszalkowska {'3357': [95]}
mart {'3570': [241]}
marten {'3677': [48]}
martian {'3910': [231]}
martin {'17': [69], '35': [228], '57': [174], '141': [244], '192': [8, 94], '207': [74], '339': [19], '347': [44, 92], '351': [119], '3459': [809], '3503': [950], '3505': [46], '3529': [146], '3623': [43], '3646': [104], '3694': [252], '3707': [6, 8], '3726': [7], '3769': [95], '3795': [228], '3844': [33], '3856': [687], '3908': [249, 293, 317], '3909': [459]}
martinu {'194': [18, 65, 86, 196, 277]}
martn {'100': [70]}
martynov {'112': [140]}
martyrdom {'3366': [5, 116]}
marvel {'193': [219], '3912': [342]}
marwick {'366': [139], '3795': [223]}
marxism {'339': [12], '3446': [496]}
marxist {'356': [119], '3437': [640], '3443': [597], '3444': [447]}
maryland {'3716': [335]}
marylebon {'3693': [959]}
masao {'135':

nasti {'3412': [73], '3416': [151], '3633': [20], '3913': [204]}
nat {'174': [59], '351': [989, 1040, 1171], '3459': [795, 913], '3693': [925, 1049]}
natali {'264': [193]}
nation {'15': [157], '16': [557], '28': [16], '40': [145, 190], '45': [164], '54': [164], '56': [372], '60': [216], '61': [11], '65': [151], '68': [68], '71': [46], '72': [41], '76': [171], '78': [15], '91': [32], '92': [52], '94': [78], '98': [88], '101': [189, 468], '102': [13], '104': [17], '112': [169], '116': [40, 207, 223], '123': [16], '126': [9], '127': [18], '129': [33], '130': [375, 392], '134': [510], '144': [275], '148': [196], '163': [160], '165': [166], '174': [16], '184': [27, 111], '186': [261, 386], '188': [328, 585], '193': [56, 147], '204': [12], '205': [11], '206': [49], '207': [79], '210': [36, 44, 194], '216': [208], '219': [89], '222': [77], '226': [189], '235': [13], '238': [20], '241': [469], '248': [315, 346], '251': [203], '252': [39, 127, 215], '255': [221, 354], '259': [54], '261': [95], 

pen {'3500': [28], '3576': [24], '3871': [152], '3935': [627, 639], '3938': [322]}
penal {'3644': [299]}
penalis {'262': [793], '3407': [435]}
penalti {'49': [200], '241': [445], '273': [93, 122], '3329': [697], '3353': [35], '3450': [167], '3484': [167], '3592': [88, 98, 111], '3704': [88, 98, 111], '3785': [341], '3923': [527]}
pencil {'283': [34], '3871': [141, 336]}
pend {'289': [41], '309': [376], '3331': [206], '3539': [84], '3549': [97], '3811': [164, 184], '3840': [18]}
pendant {'193': [95], '3728': [489]}
pendl {'3529': [172]}
pendleton {'274': [131, 187, 234]}
penetr {'129': [620], '226': [180], '3495': [512], '3640': [356], '3729': [377], '3907': [144], '3919': [45], '3921': [427], '3922': [424]}
peng {'3436': [254], '3437': [32], '3443': [335, 491, 554, 592], '3446': [191], '3448': [512, 589], '3449': [318]}
penguin {'3363': [166], '3912': [122]}
pennel {'31': [135]}
penni {'78': [87], '141': [207, 329, 618], '3348': [109], '3459': [627, 846], '3862': [44]}
pennin {'364': [

qualiti {'45': [141], '49': [43], '55': [778, 796, 876, 880], '96': [63], '130': [509], '131': [373, 599], '134': [199], '136': [32, 276], '140': [36, 52, 162, 169, 177, 210, 540], '141': [291], '149': [54], '150': [85], '194': [106], '212': [55], '226': [288, 502], '241': [790], '242': [99], '255': [779], '327': [569], '340': [37, 101], '341': [12, 71, 275, 661], '345': [64], '366': [276], '367': [141, 235], '371': [414, 432], '374': [187], '3360': [266], '3362': [75], '3370': [73], '3388': [89], '3396': [373], '3398': [109], '3406': [537], '3434': [505], '3435': [382], '3437': [428, 631], '3443': [563], '3444': [374], '3464': [78], '3522': [197], '3641': [252, 256, 303], '3643': [415], '3645': [553], '3646': [263], '3654': [173], '3723': [54], '3726': [221, 670], '3727': [296, 309], '3728': [223, 243], '3772': [210], '3782': [176, 255], '3786': [48, 164, 179, 440], '3790': [159], '3793': [514, 605], '3801': [63], '3828': [173], '3861': [55], '3867': [379, 391, 413], '3908': [22, 261]

savour {'290': [3], '368': [410]}
savoy {'351': [369, 615, 696, 908]}
saw {'21': [265], '35': [200], '55': [377], '56': [103], '101': [252], '176': [116], '193': [24], '207': [20, 155], '216': [106], '232': [22], '264': [266, 365], '283': [9, 126], '300': [27], '3324': [87], '3331': [179], '3342': [18], '3343': [23], '3347': [56, 63], '3366': [727], '3527': [443], '3537': [71], '3551': [152], '3570': [388], '3582': [117], '3589': [40], '3590': [122], '3601': [47], '3606': [83], '3624': [20], '3629': [177], '3639': [525], '3645': [87, 502], '3658': [86, 140], '3694': [333], '3695': [431, 457], '3715': [120], '3718': [197], '3726': [965], '3729': [382], '3733': [41], '3750': [12], '3775': [12], '3812': [63], '3827': [118], '3864': [76], '3865': [287], '3870': [152], '3895': [531], '3919': [421], '3920': [203], '3924': [247, 250, 259, 273]}
sawt {'3911': [247]}
saxena {'3543': [85, 292, 353]}
saxon {'3865': [112]}
saxoni {'3771': [203], '3901': [175]}
say {'17': [213, 237], '19': [103, 19

spectacular {'3396': [397], '3937': [248, 374], '3938': [233]}
spector {'3535': [102, 433]}
spectr {'282': [25], '3406': [184]}
spectrascan {'3826': [87]}
spectrum {'255': [294], '3507': [60]}
specul {'15': [77, 205], '35': [150], '66': [10], '141': [550], '144': [370], '145': [279], '186': [28], '215': [127], '282': [103], '304': [117], '337': [208], '374': [353, 366], '3336': [261], '3438': [484], '3459': [96], '3465': [193], '3473': [36], '3503': [79], '3566': [34], '3569': [57], '3598': [87, 213], '3693': [519, 534, 566, 676], '3695': [250], '3714': [67], '3790': [621], '3796': [189], '3797': [48], '3825': [73, 119]}
speech {'46': [144, 174], '102': [36], '282': [122], '305': [231], '375': [62], '3398': [394], '3427': [26], '3443': [605], '3458': [41], '3490': [152], '3546': [40, 72], '3653': [21], '3719': [226], '3804': [142], '3876': [113], '3894': [236], '3895': [236], '3896': [340], '3916': [596]}
speed {'16': [260], '48': [59, 116, 211], '55': [549], '129': [311, 357], '201': 

toilet {'3421': [137]}
toiletri {'3710': [17], '3812': [12]}
token {'3913': [357], '3918': [392, 423]}
tokio {'3570': [196]}
tokyo {'21': [451], '128': [32, 74], '135': [130, 258, 502], '144': [34], '151': [152, 198], '213': [113], '244': [107], '280': [151], '348': [133], '3353': [51], '3384': [6, 12, 74], '3412': [56, 143, 244, 584], '3468': [326], '3559': [32, 74], '3570': [45, 290], '3642': [118], '3701': [247, 261], '3756': [30], '3766': [29], '3774': [32, 74], '3796': [33], '3823': [115], '3913': [589], '3931': [251, 414], '3932': [104, 234, 458, 598], '3935': [114], '3945': [7]}
told {'38': [29], '46': [53], '48': [94], '63': [25], '67': [27], '74': [39, 82], '75': [76], '86': [11], '90': [155], '98': [80], '111': [112], '125': [184], '127': [45], '165': [84], '177': [43], '181': [33], '208': [77], '223': [43], '253': [30, 113, 133, 275], '264': [502], '296': [64], '298': [30], '327': [367], '341': [589], '3341': [36], '3344': [160], '3347': [82, 132], '3350': [34], '3351': [58,

vejsnik {'115': [21]}
veljnowski {'255': [188]}
veloc {'105': [58]}
velvet {'3415': [52]}
vendor {'130': [423], '139': [368], '174': [36], '189': [33], '3329': [515, 568, 652], '3357': [61], '3908': [109, 595], '3911': [370], '3914': [147, 200, 208, 274, 464], '3917': [194], '3919': [316, 361], '3921': [257], '3933': [344]}
venezuela {'2': [91], '215': [6, 9], '3466': [6, 10, 23, 71], '3808': [3, 8, 101]}
venezuelan {'25': [7, 14, 73], '215': [15], '3338': [102], '3382': [31, 52], '3466': [42, 72], '3808': [37]}
veng {'3409': [257]}
vengeanc {'3793': [199]}
venic {'3361': [131]}
venizelo {'3402': [57]}
venkitaramanan {'3544': [41]}
vent {'241': [360], '3693': [1033]}
ventur {'18': [88], '34': [114], '35': [105, 271], '91': [46], '129': [186], '141': [830], '143': [168, 179, 193], '150': [37, 107, 125], '174': [75], '180': [21, 51, 87, 100, 117], '182': [10, 34], '186': [549], '197': [185], '198': [448, 507], '226': [196, 220, 479], '247': [152], '285': [198], '337': [209], '374': [416]

In [114]:
# Write the index to a text file: each term on its own line, followed by one
# tab-indented "document: pos,pos,..." line per document and a blank line.
# The with-block closes the file even if a write fails part-way through.
with open('hasil indexing setelah preprocessing standar.txt','w') as file:
    for term, postings in proximity_index.items():
        file.write(term+'\n')
        for doc_key, positions in postings.items():
            file.write('\t'+str(doc_key)+': ')
            file.write(','.join(str(position) for position in positions))
            file.write('\n')
        file.write('\n')

In [115]:
# Print a running number for each index entry; the last number printed is
# the number of terms in the index.
for entry_no in range(1, len(proximity_index) + 1):
    print(entry_no)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475


6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474


9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512


12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
1243

13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
13916
13917
13918
13919
13920
13921
13922
13923
13924
13925
13926
13927
13928
13929
13930
13931
13932
13933
13934
13935
13936
13937
13938
1393

In [116]:
# Compatibility shim: try to rebind `zip` to an iterator-returning variant
# when one can be imported; otherwise leave the existing `zip` as it is.
try:
    from future_builtins import zip  # first choice; per the note below this needs Python 2.6+
except ImportError: # not 2.6+ or is 3.x
    try:
        from itertools import izip as zip # fallback when future_builtins is missing; izip does not exist on 3.x
    except ImportError:
        pass  # neither import worked (e.g. Python 3): keep the current zip untouched

In [117]:
import itertools
# Use itertools.izip when this interpreter provides it; otherwise keep
# whatever `zip` is already in scope.
zip = itertools.izip if hasattr(itertools, 'izip') else zip

Implementasi Searching

In [118]:
# Term-document incidence matrix: for every token, a list of N_DOC flags
# ('1' when the token occurs in that document's token sequence, else '0').
collection_matrix = {
    term: ['1' if term in tokens_doc[doc_idx] else '0' for doc_idx in range(N_DOC)]
    for term in all_tokens
}

q1: flicker

In [119]:
# Single-term lookup: list each document number holding 'flicker' together
# with the positions recorded for it in the index.
print('Hasil Query flicker Terdapat Pada Nomor Dokumen :')
flicker_postings = proximity_index['flicker']
for doc_id in flicker_postings:
    print (doc_id, flicker_postings[doc_id])

Hasil Query flicker Terdapat Pada Nomor Dokumen :
3438 [74]


q2: window

In [120]:
# Single-term lookup: list each document number holding 'window' together
# with the positions recorded for it in the index.
print('Hasil Query window Terdapat Pada Nomor Dokumen :')
window_postings = proximity_index['window']
for doc_id in window_postings:
    print (doc_id, window_postings[doc_id])

Hasil Query window Terdapat Pada Nomor Dokumen :
30 [25, 160]
55 [339]
59 [9, 106, 133, 149, 170, 193, 203, 226, 295, 323, 471, 482, 501, 513, 535, 552, 568, 583, 590, 594, 605, 619, 621, 625, 643, 663, 673, 675]
145 [159]
160 [128]
272 [63, 298, 433]
374 [346]
3463 [73, 641]
3639 [107]
3782 [367]
3788 [314]
3906 [291, 302, 312, 394]
3909 [377]
3910 [173, 181]
3917 [460]
3930 [6, 25, 61, 71, 84, 98, 251]


q3: withdraw

In [121]:
# Single-term lookup: list each document number holding 'withdraw' together
# with the positions recorded for it in the index.
print('Hasil Query withdraw Terdapat Pada Nomor Dokumen :')
withdraw_postings = proximity_index['withdraw']
for doc_id in withdraw_postings:
    print (doc_id, withdraw_postings[doc_id])

Hasil Query withdraw Terdapat Pada Nomor Dokumen :
76 [188]
110 [67]
132 [238]
187 [118]
205 [28]
223 [69]
286 [143]
288 [64, 85]
305 [123]
355 [18]
3329 [744]
3364 [88]
3393 [58]
3413 [39]
3496 [66]
3521 [165]
3539 [19]
3632 [16, 48, 57]
3887 [53, 185]
3944 [24]


q4: condemning

In [122]:
# Single-term lookup for the query "condemning". The index is read under the
# key 'condemn' -- presumably the stemmed form produced by preprocessing
# (TODO confirm against the stemming step).
print('Hasil Query condemning Terdapat Pada Nomor Dokumen :')
condemn_postings = proximity_index['condemn']
for doc_id in condemn_postings:
    print (doc_id, condemn_postings[doc_id])

Hasil Query condemning Terdapat Pada Nomor Dokumen :
113 [80]
309 [392]
354 [449, 480]
3374 [87]
3674 [137]
3678 [31]


Boolean Search

In [123]:
# Build the term-document incidence rows used by the boolean queries:
# collection_matrix[token] is a list of N_DOC '1'/'0' flags, one per document.
# NOTE(review): this is the same construction as the cell under
# "Implementasi Searching"; one of the two could be dropped.
collection_matrix = {}
for term in all_tokens:
    incidence_row = []
    for doc_idx in range(N_DOC):
        incidence_row.append('1' if term in tokens_doc[doc_idx] else '0')
    collection_matrix[term] = incidence_row

q5: wall OR street

In [126]:
# Boolean OR: combine the incidence rows of 'wall' and 'street' as binary numbers.
wall_biner = ''.join(collection_matrix['wall'])
street_biner = ''.join(collection_matrix['street'])
print ('query: wall OR street')
print (wall_biner+'\nOR\n'+street_biner)
# int() discards leading zeros, so the result is zero-padded back to the row
# length. Deriving the width from the row (instead of a hard-coded 1000)
# keeps bit i aligned with document i for any collection size.
str_result = format(int(wall_biner, 2) | int(street_biner, 2), '0{}b'.format(len(wall_biner)))
print ('=\n'+str_result+'\n hasil query wall OR street terdapat pada dokumen nomor : ')
str_result_list = list(str_result)
# enumerate yields the same (index, bit) pairs as zip(count(), ...) without
# requiring itertools.count to be in scope.
str_result_doc_no = [
    all_doc_no[n].firstChild.data
    for n, bit in enumerate(str_result_list)
    if bit == '1'
]
print (u', '.join(str_result_doc_no))

query: wall OR street
000000000001110000000000000000000000000000010000000000010000000000000000000000000000000000000000000000001000000000000100000000100000000000000001100000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000010000000000000000000000010000000000000000010000000000000000010000001010000000101000000000000000010000011000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000100000001000000000100000000000000000000000000000000000000000101000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000110000000000000000000000100100000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000101100000000000000000000100000000000010000000000000000000000000000000000000000000000000000000000000000110100000000000000000000000000000000000000000000000000000100000000000001000000000000000000000000010000000000000000000000000000000000

q6: wall AND street

In [127]:
# Boolean AND: intersect the incidence rows of 'wall' and 'street' as binary numbers.
wall_biner = ''.join(collection_matrix['wall'])
street_biner = ''.join(collection_matrix['street'])
print ('query: wall AND street')
print (wall_biner+'\nAND\n'+street_biner)
# int() discards leading zeros, so the result is zero-padded back to the row
# length. Deriving the width from the row (instead of a hard-coded 1000)
# keeps bit i aligned with document i for any collection size.
str_result = format (int(wall_biner, 2) & int(street_biner, 2), '0{}b'.format(len(wall_biner)))
print ('=\n'+str_result+'\nHasil query wall AND street terdapat pada dokumen nomor : ')
str_result_list = list(str_result)
# enumerate yields the same (index, bit) pairs as zip(count(), ...) without
# requiring itertools.count to be in scope.
str_result_doc_no = [
    all_doc_no[n].firstChild.data
    for n, bit in enumerate(str_result_list)
    if bit == '1'
]
print (u', '.join(str_result_doc_no))

query: wall AND street
00000000000111000000000000000000000000000001000000000001000000000000000000000000000000000000000000000000100000000000010000000010000000000000000110000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000001000000000000000000000001000000000000000001000000000000000001000000101000000010100000000000000001000001100000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000010000000100000000010000000000000000000000000000000000000000010100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000011000000000000000000000010010000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000010110000000000000000000010000000000001000000000000000000000000000000000000000000000000000000000000000011010000000000000000000000000000000000000000000000000000010000000000000100000000000000000000000001000000000000000000000000000000000

Phrase Search Bi-gram 

In [128]:
# Build bigrams ("first_second") from each document's token sequence.
#   bi_gram_tokens       -- every bigram occurrence across the collection, in order
#   bi_gram_sentence_doc -- per document, its bigrams joined by single spaces
bi_gram_tokens = []
bi_gram_sentence_doc = []

for n in range(N_DOC):
    doc_tokens = tokens_doc[n]
    # Pair each token with its successor; the last token has none.
    token = [
        doc_tokens[pos]+'_'+doc_tokens[pos+1]
        for pos in range(len(doc_tokens)-1)
    ]
    bi_gram_tokens.extend(token)
    bi_gram_sentence_doc.append(' '.join(token))

In [129]:
# Inverted index over bigrams: bi_gram_index[bigram] is the list of document
# numbers (in collection order) whose bigram sequence contains that bigram.
#
# Built in a single pass over the documents. Each document's space-joined
# bigram string is split back into whole bigrams so that matching is exact;
# a substring test on the joined string would also hit longer bigrams that
# merely contain the query text (e.g. 'base_rate' inside 'database_rates').
# Assumes individual tokens contain no whitespace, which holds when the
# strings come from ' '.join over tokenizer output -- TODO confirm.
bi_gram_index = {}
for i in range(N_DOC):
    doc_id = all_doc_no[i].firstChild.data
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so a
    # document is listed once per bigram and key order follows the text.
    for bigram_token in dict.fromkeys(bi_gram_sentence_doc[i].split()):
        bi_gram_index.setdefault(bigram_token, []).append(doc_id)

q7: "base rate"

In [130]:
print('Hasil Query base rate terdapat pada nomor dokumen :')
print (u',' .join(bi_gram_index['base_rate']))

Hasil Query base rate terdapat pada nomor dokumen :
12,15,21,55,118,125,148,268,304,313,328,331,3351,3458,3571,3799


q8: #10(base, rate)

In [131]:
def proximity_search(a,b,w):
    """Find documents where terms ``a`` and ``b`` occur close to each other.

    A document matches when it contains both terms and at least one recorded
    position of ``a`` differs from a recorded position of ``b`` by strictly
    less than ``w``.

    Parameters:
        a, b: index terms (keys of ``proximity_index``).
        w: window size; the position difference must be ``< w``.

    Returns:
        List of document numbers (``all_doc_no[i].firstChild.data``) in
        collection order, one entry per matching document. An empty list is
        returned when no document contains both terms.
    """
    prox_doc = []
    for i in range(N_DOC):
        if not (a in tokens_doc[i] and b in tokens_doc[i]):
            continue
        doc_id = all_doc_no[i].firstChild.data
        # Direct lookup of this document's postings instead of scanning every
        # (doc, positions) pair of both terms.
        positions_a = proximity_index[a].get(doc_id, ())
        positions_b = proximity_index[b].get(doc_id, ())
        # any() stops at the first close pair, so each document is reported
        # once rather than once per matching position of ``a``.
        if any(abs(v1-v2) < w for v1 in positions_a for v2 in positions_b):
            prox_doc.append(doc_id)
    return prox_doc

In [132]:
# Proximity query #10(base, rate): collapse the returned document numbers
# into a set and print them comma-separated (set order is arbitrary).
result = set(proximity_search(a='base', b='rate', w=10))
print('Hasil query #10(base, rate) terdapat pada nomor dokumen :')
separator = u', '
print (separator.join(result))

Hasil query #10(base, rate) terdapat pada nomor dokumen :
3799, 125, 328, 3494, 313, 3351, 304, 118, 3458, 3571, 21, 12, 65, 3659, 331, 55, 15, 3530, 148, 314, 268, 3535


q9: "wall street" AND open

In [133]:
# Query: "wall street" AND open.
# Turn the phrase hits from the bigram index into an incidence bit-string,
# then AND it with the incidence row of the single term 'open'.
result_wall_street = bi_gram_index['wall_street']
# Set membership keeps the per-document test constant-time.
wall_street_docs = set(result_wall_street)
biner_wall_street = ''.join(
    '1' if all_doc_no[i].firstChild.data in wall_street_docs else '0'
    for i in range(N_DOC)
)
biner_open = ''.join(collection_matrix['open'])
print ('wall street')
print (biner_wall_street)
print ('AND')
print ('open')
# Show the operand's bit-string (previously the literal 'wall_street' was printed here).
print (biner_open)
# int() discards leading zeros, so the result is zero-padded back to the
# bit-string length. Deriving the width from the operand (instead of a
# hard-coded 1000) keeps bit i aligned with document i for any collection size.
str_result = format(int(biner_wall_street, 2) & int(biner_open, 2), '0{}b'.format(len(biner_wall_street)))
print ('=\n'+str_result+'\nMaka Hasil query "wall street" AND open terdapat pada nomor dokumen : ')
str_result_list = list(str_result)
# enumerate yields the same (index, bit) pairs as zip(count(), ...) without
# requiring itertools.count to be in scope.
str_result_doc_no = [
    all_doc_no[n].firstChild.data
    for n, bit in enumerate(str_result_list)
    if bit == '1'
]
print (u', '.join(str_result_doc_no))

wall street
0000000000011100000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000011000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000010100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000001010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000001100000000000000000000001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001011000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000001001000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000