In [20]:
def tokenisasi(text):
    """Split a string into a list of whitespace-separated tokens.

    ``str.split()`` is called without a separator so that runs of spaces,
    tabs, newlines and leading/trailing whitespace never yield empty-string
    tokens. Splitting on a single ``" "`` would produce such empty tokens,
    and they would then be counted in document lengths and in the
    vocabulary.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in order of appearance; ``[]`` when ``text``
        is empty or contains only whitespace.
    """
    return text.split()

def stemming(text):
    """Return ``text`` stemmed with the Sastrawi stemmer.

    The stemmer is built on the first call and then reused: this function
    is called once per token, so constructing a new factory and stemmer on
    every call would repeat the same setup work for each word.

    Args:
        text: The string (a single token or a longer text) to stem.

    Returns:
        Whatever ``stemmer.stem(text)`` returns for the given input.
    """
    # The stemmer instance is cached as an attribute on the function itself,
    # so no module-level state or extra import is required.
    stemmer = getattr(stemming, "_stemmer", None)
    if stemmer is None:
        # Imported lazily so the dependency is only needed when stemming runs.
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer
    return stemmer.stem(text)

def stemming_sentence(text):
    """Stem every token of ``text`` and return them joined by single spaces.

    Args:
        text: The sentence to process; it is split with ``tokenisasi``.

    Returns:
        str: The result of ``stemming`` for each token, in the original
        order, separated by one space and without a trailing space.
    """
    stemmed_tokens = [stemming(token) for token in tokenisasi(text)]
    return " ".join(stemmed_tokens)

In [21]:
# Toy corpus: raw document titles keyed by document id.
doc_dict_raw = {
    'doc1': "pengembangan sistem informasi penjadwalan",
    'doc2': "pengembangan model analisis sentimen berita",
    'doc3': "analisis sistem input output",
    'doc4': "pengembangan sistem informasi akademik universitas",
    'doc5': "pengembangan sistem cari berita ekonomi",
    'doc6': "analisis sistem neraca nasional",
    'doc7': "pengembangan sistem informasi layanan statistik",
    'doc8': "pengembangan sistem pencarian skripsi di universitas",
    'doc9': "analisis sentimen publik terhadap pemerintah",
    'doc10': "pengembangan model klasifikasi sentimen berita",
}

# Stem every document so that matching is done on word stems.
doc_dict = {}
for doc_id, doc in doc_dict_raw.items():
    doc_dict[doc_id] = stemming_sentence(doc)

query = "sistem informasi statistik"
# The documents are stemmed, so the query has to go through the same
# normalization; otherwise an inflected query word could never equal a
# (stemmed) document token and would always get a zero count.
tokenized_query = tokenisasi(stemming_sentence(query))

In [22]:
# Standard (unsmoothed) query likelihood model:
# P(Q|D) = product over query terms q of  count(q in D) / |D|
likelihood_scores = {}
vocab = set()
for doc_id, doc_text in doc_dict.items():
    tokens = tokenisasi(doc_text)
    # Collect the vocabulary while the document tokens are at hand.
    vocab |= set(tokens)
    doc_length = len(tokens)
    score = 1
    for q in tokenized_query:
        # A query term missing from the document zeroes the whole product.
        score = score * tokens.count(q) / doc_length
    likelihood_scores[doc_id] = score

print(likelihood_scores)

from collections import OrderedDict

def exact_top_k(doc_dict, rank_score, k):
    """Return the k highest-scoring documents as a dict, best score first.

    Scores are paired with document ids by position: the i-th value of
    ``rank_score`` is assigned to the i-th key of ``doc_dict``. The keys of
    ``rank_score`` are not consulted.

    Args:
        doc_dict: Mapping whose keys are the document ids.
        rank_score: Mapping whose values are the scores, expected in the
            same order as the keys of ``doc_dict``.
        k: Number of leading entries to keep after sorting.

    Returns:
        dict: ``doc_id -> score`` pairs in descending score order, cut to
        the first ``k`` entries. Documents with equal scores keep their
        ``doc_dict`` order.

    Raises:
        IndexError: If ``rank_score`` has fewer values than ``doc_dict``
            has keys.
    """
    # Turn the score mapping into a list so it can be indexed by position.
    score_list = list(rank_score.values())
    paired = [(doc_id, score_list[pos]) for pos, doc_id in enumerate(doc_dict)]
    # sorted() is stable, so tied documents stay in their original order.
    ranked = sorted(paired, key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:k])

# Rank the documents by their unsmoothed likelihood and show the five best.
top_5 = exact_top_k(doc_dict, likelihood_scores, 5)
print("\nTop 5 Dokumen:", top_5, sep="\n")

{'doc1': 0.0, 'doc2': 0.0, 'doc3': 0.0, 'doc4': 0.0, 'doc5': 0.0, 'doc6': 0.0, 'doc7': 0.008, 'doc8': 0.0, 'doc9': 0.0, 'doc10': 0.0}

Top 5 Dokumen:
{'doc7': 0.008, 'doc1': 0.0, 'doc2': 0.0, 'doc3': 0.0, 'doc4': 0.0}


In [23]:
# Flatten all document tokens into one list (the collection, repeats kept)
# and derive the set of distinct terms from it.
tokenized_corpus = []
for doc_text in doc_dict.values():
    tokenized_corpus.extend(tokenisasi(doc_text))
vocab = set(tokenized_corpus)

print(vocab)

{'skripsi', 'jadwal', 'output', 'ekonomi', 'neraca', 'cari', 'statistik', 'universitas', 'di', 'perintah', 'informasi', 'kembang', 'nasional', 'analisis', 'model', 'berita', 'klasifikasi', 'sistem', 'akademik', 'sentimen', 'publik', 'layan', 'hadap', 'input'}


In [24]:
# Query likelihood model with Laplace (add-alpha) smoothing:
# P(q|D) = (count(q in D) + alpha) / (|D| + |V| * alpha)
alpha = 1
likelihood_scores = {}
for doc_id, doc_text in doc_dict.items():
    tokens = tokenisasi(doc_text)
    # The denominator is the same for every query term of this document.
    smoothed_length = len(tokens) + len(vocab) * alpha
    score = 1
    for q in tokenized_query:
        score = score * (tokens.count(q) + alpha) / smoothed_length
    likelihood_scores[doc_id] = score

print(likelihood_scores)

# Rank the documents by the smoothed likelihood and show the five best.
top_5 = exact_top_k(doc_dict, likelihood_scores, 5)
print("\nTop 5 Dokumen:", top_5, sep="\n")

{'doc1': 0.00018221574344023323, 'doc2': 4.1002091106646436e-05, 'doc3': 9.110787172011662e-05, 'doc4': 0.00016400836442658574, 'doc5': 8.200418221329287e-05, 'doc6': 9.110787172011662e-05, 'doc7': 0.0003280167288531715, 'doc8': 7.407407407407407e-05, 'doc9': 4.1002091106646436e-05, 'doc10': 4.1002091106646436e-05}

Top 5 Dokumen:
{'doc7': 0.0003280167288531715, 'doc1': 0.00018221574344023323, 'doc4': 0.00016400836442658574, 'doc3': 9.110787172011662e-05, 'doc6': 9.110787172011662e-05}


In [25]:
# Query likelihood model with Jelinek-Mercer smoothing:
# P(q|D) = lamda * count(q in D) / |D| + (1 - lamda) * count(q in C) / |C|
lamda = 0.5
likelihood_scores = {}
for doc_id, doc_text in doc_dict.items():
    tokens = tokenisasi(doc_text)
    score = 1
    for q in tokenized_query:
        # Document model and collection model, interpolated with weight lamda.
        doc_term = lamda * tokens.count(q) / len(tokens)
        corpus_term = (1 - lamda) * tokenized_corpus.count(q) / len(tokenized_corpus)
        score = score * (doc_term + corpus_term)
    likelihood_scores[doc_id] = score

print(likelihood_scores)

# Rank the documents by the smoothed likelihood and show the five best.
top_5 = exact_top_k(doc_dict, likelihood_scores, 5)
print("\nTop 5 Dokumen:", top_5, sep="\n")

{'doc1': 0.0003221299913194445, 'doc2': 2.3735894097222223e-05, 'doc3': 6.442599826388889e-05, 'doc4': 0.00023640950520833334, 'doc5': 5.628797743055555e-05, 'doc6': 6.442599826388889e-05, 'doc7': 0.0025059407552083337, 'doc8': 5.086263020833333e-05, 'doc9': 2.3735894097222223e-05, 'doc10': 2.3735894097222223e-05}

Top 5 Dokumen:
{'doc7': 0.0025059407552083337, 'doc1': 0.0003221299913194445, 'doc4': 0.00023640950520833334, 'doc3': 6.442599826388889e-05, 'doc6': 6.442599826388889e-05}


In [26]:
# Query likelihood model with Dirichlet smoothing:
# P(q|D) = (count(q in D) + miu * count(q in C) / |C|) / (|D| + miu)
miu = 2
likelihood_scores = {}
for doc_id, doc_text in doc_dict.items():
    tokens = tokenisasi(doc_text)
    score = 1
    for q in tokenized_query:
        # miu pseudo-counts distributed according to the collection model.
        pseudo_count = miu * tokenized_corpus.count(q) / len(tokenized_corpus)
        score = score * (tokens.count(q) + pseudo_count) / (len(tokens) + miu)
    likelihood_scores[doc_id] = score

print(likelihood_scores)

# Rank the documents by the smoothed likelihood and show the five best.
top_5 = exact_top_k(doc_dict, likelihood_scores, 5)
print("\nTop 5 Dokumen:", top_5, sep="\n")

{'doc1': 0.0002803096064814815, 'doc2': 4.428854875283447e-06, 'doc3': 3.114551183127572e-05, 'doc4': 0.00017652150145772597, 'doc5': 1.961350016196955e-05, 'doc6': 3.114551183127572e-05, 'doc7': 0.004413037536443149, 'doc8': 1.3139512803819445e-05, 'doc9': 4.428854875283447e-06, 'doc10': 4.428854875283447e-06}

Top 5 Dokumen:
{'doc7': 0.004413037536443149, 'doc1': 0.0002803096064814815, 'doc4': 0.00017652150145772597, 'doc3': 3.114551183127572e-05, 'doc6': 3.114551183127572e-05}
