In [2]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def termFrequencyInDoc(vocab, doc_dict):
    tf_docs = {}
    for doc_id in doc_dict.keys():
        tf_docs[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_docs[doc_id][word] = doc.count(word)
    return tf_docs

def tokenisasi(text):
    tokens = text.split(" ")
    return tokens


In [6]:

doc1_term = ["pengembangan", "sistem", "informasi", "penjadwalan"]
doc2_term = ["pengembangan", "model", "analisis", "sentimen", "berita"]
doc3_term = ["analisis", "sistem", "input", "output"]
corpus_term = [doc1_term, doc2_term, doc3_term]

inverted_index = {}
for i in range(len(corpus_term)):
    for item in corpus_term[i]:
        stemmer = StemmerFactory().create_stemmer()
        item = stemmer.stem(item)
        if item not in inverted_index:
            inverted_index[item] = []
        if item in inverted_index:
            inverted_index[item].append(i+1)

vocab=list(inverted_index.keys())
doc_dict = {}

#clean after stemming
doc_dict['doc1'] = "kembang sistem informasi jadwal"
doc_dict['doc2'] = "kembang model analisis sentimen berita"
doc_dict['doc3'] = "analisis sistem input output"

print(termFrequencyInDoc(vocab, doc_dict))

{}


In [4]:
def wordDocFre(vocab, doc_dict):
    df = {}
    for word in vocab:
        frq = 0
        for doc in doc_dict.values():
            if word in tokenisasi(doc):
                frq = frq + 1
        df[word] = frq
    return df

import numpy as np
def inverseDocFre(vocab,doc_fre,length):
    idf= {}
    for word in vocab:
        idf[word] = idf[word] = 1 + np.log((length + 1) / (doc_fre[word]+1))
    return idf

print(inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)))

{'kembang': 1.2876820724517808, 'sistem': 1.2876820724517808, 'informasi': 1.6931471805599454, 'jadwal': 1.6931471805599454, 'model': 1.6931471805599454, 'analisis': 1.2876820724517808, 'sentimen': 1.6931471805599454, 'berita': 1.6931471805599454, 'input': 1.6931471805599454, 'output': 1.6931471805599454}


In [5]:
def tfidf(vocab,tf,idf_scr,doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

tf_idf = tfidf(vocab, termFrequencyInDoc(vocab, doc_dict), inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)), doc_dict)
# Term - Document Matrix
TD = np.zeros((len(vocab), len(doc_dict)))
for word in vocab:
    for doc_id,doc in tf_idf.items():
        ind1 = vocab.index(word)
        ind2 = list(tf_idf.keys()).index(doc_id)
        TD[ind1][ind2] = tf_idf[doc_id][word]
print(TD)

[[1.28768207 1.28768207 0.        ]
 [1.28768207 0.         1.28768207]
 [1.69314718 0.         0.        ]
 [1.69314718 0.         0.        ]
 [0.         1.69314718 0.        ]
 [0.         1.28768207 1.28768207]
 [0.         1.69314718 0.        ]
 [0.         1.69314718 0.        ]
 [0.         0.         1.69314718]
 [0.         0.         1.69314718]]


In [7]:
def edit_distance(string1, string2):
    if len(string1) > len(string2):
        difference = len(string1) - len(string2)
        string1[:difference]
        n = len(string2)
    elif len(string2) > len(string1):
        difference = len(string2) - len(string1)
        string2[:difference]
        n = len(string1)
    for i in range(n):
        if string1[i] != string2[i]:
            difference += 1

    return difference

print(edit_distance(doc_dict['doc1'], doc_dict['doc2']))
print(edit_distance(doc_dict['doc1'], doc_dict['doc3']))

30
31


In [8]:
def jaccard_sim(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection

    return float(intersection) / union

print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc2'].split(" ")))
print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc3'].split(" ")))

0.125
0.14285714285714285


In [9]:
def euclidian_dist(vec1, vec2):
    # subtracting vector
    temp = vec1 - vec2
    # doing dot product
    # for finding
    # sum of the squares
    sum_sq = np.dot(temp.T, temp)
    # Doing squareroot and
    # printing Euclidean distance
    return np.sqrt(sum_sq)

print(euclidian_dist(TD[:, 0], TD[:, 1])) #doc1 & doc2
print(euclidian_dist(TD[:, 0], TD[:, 2])) #doc1 & doc3

4.201188773980276
3.844897884155026


In [10]:
import math
def cosine_sim(vec1, vec2):
    vec1 = list(vec1)
    vec2 = list(vec2)
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))

    return dot_prod / (mag_1 * mag_2)

print(cosine_sim(TD[:, 0], TD[:, 1])) #doc1 & doc2
print(cosine_sim(TD[:, 0], TD[:, 2])) #doc1 & doc3

0.15967058203849993
0.1832234081332565
