In [1]:
# TF-IDF

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import math

In [2]:
# Build one shared stemmer from the Sastrawi factory; the later cells call
# stemmer.stem(...) on every raw sentence and on the query.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [3]:
# Build the vocabulary: the unique tokenized and stemmed words of the corpus.

def get_stemmed_and_tokenize_word(list_of_sentence, stemmer_instance=None):
    """Return the unique stemmed tokens of the sentences, in first-seen order.

    Args:
        list_of_sentence: Iterable of raw sentence strings.
        stemmer_instance: Optional object exposing ``stem(text) -> str``.
            When omitted, the notebook-level ``stemmer`` is used, which is
            exactly what the original one-argument call did.

    Returns:
        A list of distinct tokens. Empty tokens are never included, so an
        empty sentence contributes nothing to the vocabulary.
    """
    # The global is only looked up when no stemmer was passed in.
    active_stemmer = stemmer if stemmer_instance is None else stemmer_instance

    seen = set()          # O(1) membership test instead of scanning the list
    list_of_word = []     # keeps first-seen order for the caller

    for sentence in list_of_sentence:
        # split() with no argument discards empty strings, whereas split(" ")
        # turns an empty stem result into [""] and pollutes the vocabulary.
        for word in active_stemmer.stem(sentence).split():
            if word not in seen:
                seen.add(word)
                list_of_word.append(word)

    return list_of_word

In [4]:
# Search query (keywords); the cells below score every document against it.
kata_kunci = "pengetahuan logistik"

In [5]:
# Documents: the three raw texts that make up the corpus.

dokumen_1 = "manajemen transaksi logistik"
dokumen_2 = "pengetahuan antar individu"
dokumen_3 = "dalam manajemen pengetahuan terdapat transfer pengetahuan logistik"

In [6]:
# List of all documents, in the order used for indexing further down.

list_of_sentence = [dokumen_1, dokumen_2, dokumen_3]

In [7]:
# Vocabulary: the unique words of the corpus after tokenizing and stemming.

list_of_word = get_stemmed_and_tokenize_word(list_of_sentence)

In [8]:
# Display the vocabulary (unique stemmed tokens in first-seen order).
list_of_word

['manajemen',
 'transaksi',
 'logistik',
 'tahu',
 'antar',
 'individu',
 'dalam',
 'dapat',
 'transfer']

In [9]:
# One zeroed term-count table per text: index 0 is reserved for the query and
# the remaining entries follow the order of list_of_sentence.
# The size is derived from the corpus instead of a hard-coded 4, so adding or
# removing a document does not require editing this cell.
term_frequency = [
    dict.fromkeys(list_of_word, 0)
    for _ in range(len(list_of_sentence) + 1)
]

In [10]:
# Display the freshly initialised tables: every count starts at 0.
term_frequency

[{'manajemen': 0,
  'transaksi': 0,
  'logistik': 0,
  'tahu': 0,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 0,
  'transaksi': 0,
  'logistik': 0,
  'tahu': 0,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 0,
  'transaksi': 0,
  'logistik': 0,
  'tahu': 0,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 0,
  'transaksi': 0,
  'logistik': 0,
  'tahu': 0,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0}]

In [11]:
# Count stemmed-token occurrences for the query (index 0) followed by every
# document. The texts are taken from list_of_sentence rather than re-listed by
# hand, so this loop cannot drift out of sync with the corpus.
for index, sentence in enumerate([kata_kunci, *list_of_sentence]):
    counts = term_frequency[index]

    # Reset first so that re-running this cell does not double the counts.
    for word in counts:
        counts[word] = 0

    for word in stemmer.stem(sentence).split(" "):
        # Query words that never occur in the corpus are not in the
        # vocabulary and are skipped.
        if word in counts:
            counts[word] += 1

In [12]:
# Display the counts: entry 0 is the query, the following entries are the documents.
term_frequency

[{'manajemen': 0,
  'transaksi': 0,
  'logistik': 1,
  'tahu': 1,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 1,
  'transaksi': 1,
  'logistik': 1,
  'tahu': 0,
  'antar': 0,
  'individu': 0,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 0,
  'transaksi': 0,
  'logistik': 0,
  'tahu': 1,
  'antar': 1,
  'individu': 1,
  'dalam': 0,
  'dapat': 0,
  'transfer': 0},
 {'manajemen': 1,
  'transaksi': 0,
  'logistik': 1,
  'tahu': 2,
  'antar': 0,
  'individu': 0,
  'dalam': 1,
  'dapat': 1,
  'transfer': 1}]

In [13]:
# Document-frequency table: one zeroed counter per vocabulary word.
document_frequency = dict.fromkeys(list_of_word, 0)

In [14]:
# Display the zero-initialised document-frequency table.
document_frequency

{'manajemen': 0,
 'transaksi': 0,
 'logistik': 0,
 'tahu': 0,
 'antar': 0,
 'individu': 0,
 'dalam': 0,
 'dapat': 0,
 'transfer': 0}

In [15]:
# Document frequency: for each term, the number of documents whose count for
# that term is non-zero. term_frequency[0] is the query and is excluded.
# Each value is recomputed from scratch (assignment, not +=), so re-running
# this cell cannot inflate the counts.
for term in document_frequency:
    document_frequency[term] = sum(
        1 for counts in term_frequency[1:] if counts[term]
    )

In [16]:
# Display how many documents contain each term.
document_frequency

{'manajemen': 2,
 'transaksi': 1,
 'logistik': 2,
 'tahu': 2,
 'antar': 1,
 'individu': 1,
 'dalam': 1,
 'dapat': 1,
 'transfer': 1}

In [17]:
# Will hold D / df per term: number of documents divided by document frequency.
d_df = {}

In [18]:
# Fill d_df with (number of documents) / (document frequency) for every term.
d_df.update(
    (term, len(list_of_sentence) / doc_count)
    for term, doc_count in document_frequency.items()
)

In [19]:
# Display the D / df ratio of every term.
d_df

{'manajemen': 1.5,
 'transaksi': 3.0,
 'logistik': 1.5,
 'tahu': 1.5,
 'antar': 3.0,
 'individu': 3.0,
 'dalam': 3.0,
 'dapat': 3.0,
 'transfer': 3.0}

In [20]:
# Will hold the inverse document frequency of each term.
idf = {}

In [21]:
# idf(t) = log10(D / df(t)), rounded to three decimals.
idf.update(
    (term, round(math.log10(ratio), 3))
    for term, ratio in d_df.items()
)

In [22]:
# Display the rounded idf value of every term.
idf

{'manajemen': 0.176,
 'transaksi': 0.477,
 'logistik': 0.176,
 'tahu': 0.176,
 'antar': 0.477,
 'individu': 0.477,
 'dalam': 0.477,
 'dapat': 0.477,
 'transfer': 0.477}

In [23]:
# Will hold one {term: tf * idf} dictionary per entry of term_frequency.
W_q_t = []

In [24]:
# TF-IDF weight of every term in every text: term count multiplied by idf.
# Entries keep the order of term_frequency (index 0 is the query).
# Slice assignment replaces the list contents in place, so re-running this
# cell does not leave extra empty dictionaries at the end of W_q_t the way
# repeated append() calls would.
W_q_t[:] = [
    {term: count * idf[term] for term, count in counts.items()}
    for counts in term_frequency
]

In [25]:
# Display the tf * idf weights: entry 0 is the query, the following entries are the documents.
W_q_t

[{'manajemen': 0.0,
  'transaksi': 0.0,
  'logistik': 0.176,
  'tahu': 0.176,
  'antar': 0.0,
  'individu': 0.0,
  'dalam': 0.0,
  'dapat': 0.0,
  'transfer': 0.0},
 {'manajemen': 0.176,
  'transaksi': 0.477,
  'logistik': 0.176,
  'tahu': 0.0,
  'antar': 0.0,
  'individu': 0.0,
  'dalam': 0.0,
  'dapat': 0.0,
  'transfer': 0.0},
 {'manajemen': 0.0,
  'transaksi': 0.0,
  'logistik': 0.0,
  'tahu': 0.176,
  'antar': 0.477,
  'individu': 0.477,
  'dalam': 0.0,
  'dapat': 0.0,
  'transfer': 0.0},
 {'manajemen': 0.176,
  'transaksi': 0.0,
  'logistik': 0.176,
  'tahu': 0.352,
  'antar': 0.0,
  'individu': 0.0,
  'dalam': 0.477,
  'dapat': 0.477,
  'transfer': 0.477}]

In [26]:
# Will hold, per document, the weights of just the query's terms.
bobot_kata_kunci = []

In [27]:
# For every document (W_q_t[0] is the query itself and is skipped), keep only
# the weights of the terms that appear in the query.
# The query is stemmed once instead of once per document.
query_terms = stemmer.stem(kata_kunci).split(" ")

# .get(..., 0.0) gives a zero weight to query terms that never occur in the
# corpus; indexing directly would raise KeyError for such a query.
# Slice assignment keeps the cell safe to re-run (no stray dictionaries are
# appended on a second execution).
bobot_kata_kunci[:] = [
    {word: weights.get(word, 0.0) for word in query_terms}
    for weights in W_q_t[1:]
]

In [28]:
# Display the query-term weights, one dictionary per document.
bobot_kata_kunci

[{'tahu': 0.0, 'logistik': 0.176},
 {'tahu': 0.176, 'logistik': 0.0},
 {'tahu': 0.352, 'logistik': 0.176}]

In [29]:
def get_bobot_dokumen(list_bobot_dokumen):
    """Sum the weights of each document into a single score.

    Args:
        list_bobot_dokumen: Sequence of ``{term: weight}`` dictionaries, one
            per document.

    Returns:
        A list with one single-key dictionary per input, keyed
        ``bobot_dokumen_<n>`` (n starting at 1) and holding the sum of that
        document's weights. An empty dictionary sums to 0.
    """
    def _total(weights):
        # Plain left-to-right accumulation starting from the integer 0.
        running = 0
        for weight in weights.values():
            running += weight
        return running

    return [
        {f"bobot_dokumen_{position}": _total(weights)}
        for position, weights in enumerate(list_bobot_dokumen, start=1)
    ]

In [30]:
# Final score per document: the sum of its weights for the query terms.
bobot_dokumen = get_bobot_dokumen(bobot_kata_kunci)

In [31]:
# Display the final per-document scores.
bobot_dokumen

[{'bobot_dokumen_1': 0.176},
 {'bobot_dokumen_2': 0.176},
 {'bobot_dokumen_3': 0.528}]