# Praktikum 4

Link to these material : https://github.com/peermohtaram/Vector-Space-Model/blob/master/Vector_Space_Model.ipynb

### Raw Term Frequency

In [24]:
def termFrequencyInDoc(vocab, doc_dict):
    tf_docs = {}
    for doc_id in doc_dict.keys():
        tf_docs[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_docs[doc_id][word] = doc.count(word)
    return tf_docs

In [25]:
doc1_term = ["pengembangan", "sistem", "informasi", "penjadwalan"]
doc2_term = ["pengembangan", "model", "analisis", "sentimen", "berita"]
doc3_term = ["analisis", "sistem", "input", "output"]
corpus_term = [doc1_term, doc2_term, doc3_term]

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

inverted_index = {}
for i in range(len(corpus_term)):
    for item in corpus_term[i]:
        item = stemmer.stem(item)
        if item not in inverted_index:
            inverted_index[item] = []
        if (item in inverted_index) and ((i+1) not in inverted_index[item]):
            inverted_index[item].append(i+1)

print("--- Inverted Index ---")
print(inverted_index)
print("\n")

vocab = list(inverted_index.keys())
doc_dict = {}
#clean after stemming
doc_dict['doc1'] = "kembang sistem informasi jadwal"
doc_dict['doc2'] = "kembang model analisis sentimen berita"
doc_dict['doc3'] = "analisis sistem input output"

print("--- Term Frequency -- ")
print(termFrequencyInDoc(vocab, doc_dict))

--- Inverted Index ---
{'kembang': [1, 2], 'sistem': [1, 3], 'informasi': [1], 'jadwal': [1], 'model': [2], 'analisis': [2, 3], 'sentimen': [2], 'berita': [2], 'input': [3], 'output': [3]}


--- Term Frequency -- 
{'doc1': {'kembang': 1, 'sistem': 1, 'informasi': 1, 'jadwal': 1, 'model': 0, 'analisis': 0, 'sentimen': 0, 'berita': 0, 'input': 0, 'output': 0}, 'doc2': {'kembang': 1, 'sistem': 0, 'informasi': 0, 'jadwal': 0, 'model': 1, 'analisis': 1, 'sentimen': 1, 'berita': 1, 'input': 0, 'output': 0}, 'doc3': {'kembang': 0, 'sistem': 1, 'informasi': 0, 'jadwal': 0, 'model': 0, 'analisis': 1, 'sentimen': 0, 'berita': 0, 'input': 1, 'output': 1}}


Penjelasan kode di atas : 

Inverted index untuk list apa saja kata yang mau dicari di suatu document

Nah, term frequencynya itu biar kita cari setiap kata yang ada di inverted index di suatu document tuh muncul berapa kalii. Gitu bro.

### Document Frequency

In [26]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def wordDocFre(vocab, doc_dict):
    df = {}
    for word in vocab:
        frq = 0
        for doc in doc_dict.values():
          #if word in doc.lower().split():
            if word in word_tokenize(doc.lower().strip()):
                frq = frq + 1
        df[word] = frq
    return df

### Inverse Document Frequency

Formula : IDF = log (N/Df)

In [27]:
import numpy as np
def inverseDocFre(vocab,doc_fre,length):
    idf= {}
    for word in vocab:
        idf[word] = idf[word] = 1 + np.log((length + 1) / (doc_fre[word]+1))
    return idf

In [28]:
print(inverseDocFre(vocab, wordDocFre(vocab, doc_dict),len(doc_dict)))

{'kembang': 1.2876820724517808, 'sistem': 1.2876820724517808, 'informasi': 1.6931471805599454, 'jadwal': 1.6931471805599454, 'model': 1.6931471805599454, 'analisis': 1.2876820724517808, 'sentimen': 1.6931471805599454, 'berita': 1.6931471805599454, 'input': 1.6931471805599454, 'output': 1.6931471805599454}


### Vector Space Model

Function dibawah ini menghasilkan w = TF*IDF

In [29]:
def tfidf(vocab,tf,idf_scr,doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

Term-Document Matrix
<br>
    doc1  doc2   doc3
<br>
t1 |w11   w12    w13|
<br>
t2 |w21   w22    w23|
<br>
t3 |w31   w32    w33|

In [30]:
tf_idf = tfidf(vocab, termFrequencyInDoc(vocab, doc_dict), inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)), doc_dict)
# Term - Document Matrix
TD = np.zeros((len(vocab), len(doc_dict)))
for word in vocab:
    for doc_id,doc in tf_idf.items():
        ind1 = vocab.index(word)
        ind2 = list(tf_idf.keys()).index(doc_id)
        TD[ind1][ind2] = tf_idf[doc_id][word]
print(TD)

[[1.28768207 1.28768207 0.        ]
 [1.28768207 0.         1.28768207]
 [1.69314718 0.         0.        ]
 [1.69314718 0.         0.        ]
 [0.         1.69314718 0.        ]
 [0.         1.28768207 1.28768207]
 [0.         1.69314718 0.        ]
 [0.         1.69314718 0.        ]
 [0.         0.         1.69314718]
 [0.         0.         1.69314718]]


### Text Similarity

#### Edit Distace

Ref : https://www.w3resource.com/python-exercises/challenges/1/python-challenges-1-exercise-52.php

In [31]:
def edit_distance(string1, string2):
    if len(string1) > len(string2):
        difference = len(string1) - len(string2)
        string1[:difference]
        n = len(string2)
    elif len(string2) > len(string1):
        difference = len(string2) - len(string1)
        string2[:difference]
        n = len(string1)
    for i in range(n):
        if string1[i] != string2[i]:
            difference += 1
            
    return difference


In [32]:
print(edit_distance(doc_dict['doc1'], doc_dict['doc2']))
print(edit_distance(doc_dict['doc1'], doc_dict['doc3']))

30
31


#### Jaccard Similarity

Ref : https://www.w3resource.com/python-exercises/extended-data-types/python-extended-data-types-index-counter-exercise-9.php

In [33]:
def jaccard_sim(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    
    return float(intersection) / union

In [34]:
print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc2'].split(" ")))
print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc3'].split(" ")))

0.125
0.14285714285714285


### Euclidian Distance

In [35]:
def euclidian_dist(vec1, vec2):
    # subtracting vector
    temp = vec1 - vec2
    # doing dot product
    # for finding
    # sum of the squares
    sum_sq = np.dot(temp.T, temp)
    # Doing squareroot and
    # printing Euclidean distance
    
    return np.sqrt(sum_sq)

In [36]:
print(euclidian_dist(TD[:, 0], TD[:, 1])) #doc1 & doc2
print(euclidian_dist(TD[:, 0], TD[:, 2])) #doc1 & doc3

4.201188773980275
3.844897884155026


### Cosine Similarity

Ref : https://algoritmaonline.com/kemiripan-teks/

In [37]:
import math
def cosine_sim(vec1, vec2):
    vec1 = list(vec1)
    vec2 = list(vec2)
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))

    return dot_prod / (mag_1 * mag_2)

In [38]:
print(cosine_sim(TD[:, 0], TD[:, 1])) #doc1 & doc2
print(cosine_sim(TD[:, 0], TD[:, 2])) #doc1 & doc3

0.15967058203849993
0.1832234081332565


### Penugasan

##### 1. Buat vector space model dengan menggunakan sekumpulan dokumen pada folder ”berita”

In [46]:
import os
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize

# Inisialisasi Stemmer dan Stopword Remover
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

# Inisialisasi inverted index dan dokumen yang sudah ditemukan
inverted_index = {}
documents_found = set()
doc_dict = {}  # Dictionary to store the contents of documents
vocab = set()  # Vocabulary set

# Path ke folder "berita"
path = "C:/Users/FEZA/My Drive/00. Drive PC/1.STIS/5. Semester 5/Information Retrieval [IR] P/Pertemuan 3/berita"

# Iterasi ke semua file di folder berita
for file in os.listdir(path):
    if os.path.isfile(os.path.join(path, file)):
        with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
            content = f.read().lower()  # Case folding

            # Tokenization
            words = content.split(" ")

            # Menghilangkan stopword dan dilakukan stemming
            filtered_words = [stemmer.stem(stopword_remover.remove(word)) for word in words]

            # Membangun inverted index
            for term in filtered_words:
                if term not in inverted_index:
                    inverted_index[term] = [file]
                elif file not in inverted_index[term]:
                    inverted_index[term].append(file)

            # Store the document content in doc_dict
            doc_dict[file] = " ".join(filtered_words)

            # Update the vocabulary set
            vocab.update(filtered_words)
# print(doc_dict)

print(termFrequencyInDoc(vocab, doc_dict))  #ini gara2 spasinya ke token
idf_scr = inverseDocFre(vocab,wordDocFre(vocab, doc_dict), len(doc_dict))
tf_idf = tfidf(vocab, termFrequencyInDoc(vocab, doc_dict), idf_scr, doc_dict)

# Term - Document Matrix
TD = np.zeros((len(vocab), len(doc_dict)))
for word in vocab:
    for doc_id, doc in tf_idf.items():
        ind1 = list(vocab).index(word)
        ind2 = list(tf_idf.keys()).index(doc_id)
        TD[ind1][ind2] = tf_idf[doc_id][word]

# Now, TD contains the Term-Document Matrix


{'berita1.txt': {'': 571, 'batas': 1, 'data': 0, 'januari': 1, 'nasional': 0, 'pasti': 1, '34': 2, 'kasus dalam': 0, 'akhir': 0, 'shellclassinfo iconresource c program': 0, 'rencana': 1, 'banyak': 0, 'lawan': 0, 'jadi': 0, 'kaji turut': 1, 'signifikan': 1, 'vaksinasi': 0, 'https health detik com berita-detikhealth d-5812940 alert-kasus-varian-delta-covid-19-di-dki-meningkat': 0, 'dr': 1, 'catat': 0, 'masih': 0, 'perlu': 0, 'ketua': 0, 'ppkm': 3, 'sebut': 0, 'kendali': 1, '2022': 0, 'puncak': 0, 'namun': 1, 'umum': 1, 'turun': 0, 'nasihat': 0, 'hitung': 1, 'bakal': 2, '2': 7, 'awal': 0, 'beri': 1, 'jauh': 0, 'cegah': 1, 'cs jakarta': 0, 'dadak https health detik com berita-detikhealth d-5813949 corona-di-as-mendadak-naik-lagi-usai-serangan-delta-sempat-mereda': 0, 'bebas': 2, 'gantung': 0, 'tahap': 1, 'pasien': 0, 'kamu': 2, 'ingat': 0, 'naik': 1, 'area': 0, 'asal': 0, 'delta': 0, 'tular': 1, '1 327': 0, '-': 16, '90': 1, 'sekali': 0, 'per': 1, 'reda jakarta': 0, '1': 6, 'dosis': 0, 'ba

In [48]:
print(TD)

[[1682.11469511 1699.790156   2035.623913   1608.46694138 1693.89833571
   291.64510476]
 [   2.25276297    0.            0.            0.            0.
     0.        ]
 [   0.            0.            0.            2.25276297    0.
     0.        ]
 ...
 [   2.25276297    0.            0.            0.            0.
     0.        ]
 [   0.            0.            0.            0.            2.25276297
     0.        ]
 [   0.            0.            0.            2.25276297    0.
     0.        ]]


#### 2. Dari 5 file pada folder ”berita”, hitung skor kemiripan antara berita yang satu dan lainnya masing-masing dengan edit distance, jaccard similarity, euclidian distance, dan cosine similarity