In [7]:
import os
import spacy
from spacy.lang.id.stop_words import STOP_WORDS
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import defaultdict

def termFrequencyInDoc(vocab, doc_dict):
    tf_docs = {}
    for doc_id in doc_dict.keys():
        tf_docs[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_docs[doc_id][word] = doc.count(word)
    return tf_docs

preprocessed_docs = {}

# Inisialisasi model bahasa Spacy untuk Bahasa Indonesia
nlp = spacy.blank("id")

# Inisialisasi Stemmer dari Sastrawi
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

path = "C:/Users/Afi/Downloads/berita~/berita"

def read_text_file(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
    return content

def tokenize_and_remove_stopwords(text):
    # Tokenisasi dengan Spacy
    doc = nlp(text)
    tokens = [token.text for token in doc]
    
    # Eliminasi stopword
    tokens = [token for token in tokens if token not in STOP_WORDS]
    
    return tokens

def preprocess_text(text, stem=True, lowercase=False):
    # Tokenisasi dan eliminasi stopword
    tokens = tokenize_and_remove_stopwords(text)
    
    # Stemming jika diperlukan
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    
    # Case folding
    if lowercase:
        text = ' '.join(tokens).lower()
    else:
        text = ' '.join(tokens).upper()
    
    return text

# Inisialisasi inverted index
inverted_index = {}

# Loop untuk membaca dan memproses setiap dokumen
for i, file in enumerate(os.listdir(path)):
    if file.endswith(".txt"):
        file_path = os.path.join(path, file)
        content = read_text_file(file_path)
        preprocessed_text = preprocess_text(content)
        preprocessed_docs[i + 1] = preprocessed_text
        
        # Tokenisasi teks yang telah diproses
        tokens = preprocessed_text.split()
        
        # Membuat inverted index
        for term in tokens:
            if term not in inverted_index:
                inverted_index[term] = []
            if (term in inverted_index) and ((i+1) not in inverted_index[term]):
                inverted_index[term].append(i+1)
                


vocab=list(inverted_index.keys())
tf_docs = termFrequencyInDoc(vocab, preprocessed_docs)

for doc_id, tf in tf_docs.items():
    print(f"Document {doc_id} term frequencies:")
    for term, frequency in tf.items():
        print(f"{term}: {frequency}")

Document 1 term frequencies:
WILAYAH: 2
KAMU: 2
SUDAH: 2
BEBAS: 2
COVID: 3
-: 16
19: 3
CEK: 2
34: 2
KAB: 2
KOTA: 2
ZONA: 2
HIJAU: 2
BARU: 2
JAKARTA: 1
PERINTAH: 1
RENCANA: 1
TERAP: 2
LAKU: 1
BATAS: 1
GIAT: 1
MASYARAKAT: 1
PPKM: 3
LEVEL: 3
3: 5
HITUNG: 1
24: 1
DESEMBER: 1
2021: 2
2: 7
JANUARI: 1
NAMUN: 1
MENTERI: 1
SEHAT: 2
RI: 6
BIJAK: 1
TAHAP: 1
KAJI: 1
TURUT: 1
DIREKTUR: 1
CEGAH: 1
KENDALI: 1
SAKIT: 1
TULAR: 1
LANGSUNG: 1
P2PML: 1
KEMENKES: 1
DR: 1
SITI: 1
NADIA: 1
TARMIZI: 1
SIGNIFIKAN: 1
HAL: 1
PICU: 1
TINGKAT: 1
MOBILITAS: 1
LONGGAR: 1
PROTOKOL: 1
HTTPS: 1
HEALTH: 2
DETIK: 2
COM: 1
BERITA-DETIKHEALTH: 1
D-5816690: 1
WILAYAH-KAMU-SUDAH-BEBAS-COVID-19-CEK-34-KABKOTA-ZONA-HIJAU-TERBARU: 1
VAKSIN: 0
BAKAL: 0
RUTIN: 0
TIAP: 0
TAHUN: 0
GANTUNG: 0
INI: 0
JELAS: 0
BERI: 1
BOOSTER: 0
DOSIS: 0
TIGA: 0
INDONESIA: 0
2022: 0
LANTAS: 0
ADA: 0
VAKSINASI: 0
INFLUENZA: 0
KETUA: 0
SATGAS: 0
IKAT: 0
DOKTER: 0
IDI: 0
PROF: 0
ZUBAIRI: 0
DJOERBAN: 0
PASTI: 0
KAIT: 0
D-5816582: 0
VAKSIN-COVID-19-BAKAL-R

In [12]:
#NO 1
def wordDocFre(vocab, doc_dict):
    df = {}
    for word in vocab:
        frq = 0
        for doc in doc_dict.values():
            if word in tokenisasi(doc):
                frq = frq + 1
        df[word] = frq
    return df

def tokenisasi(text):
    # Memisahkan teks menjadi kata-kata berdasarkan spasi
    tokens = text.split()
    return tokens

import numpy as np
def inverseDocFre(vocab,doc_fre,length):
    idf= {}
    for word in vocab:
        idf[word] = idf[word] = 1 + np.log((length + 1) / (doc_fre[word]+1))
    return idf

def tfidf(vocab,tf,idf_scr,doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

tf_idf = tfidf(vocab, termFrequencyInDoc(vocab, preprocessed_docs), inverseDocFre(vocab, wordDocFre(vocab, preprocessed_docs), len(preprocessed_docs)), preprocessed_docs)

# Term - Document Matrix
TD = np.zeros((len(vocab), len(preprocessed_docs)))
for word in vocab:
    for doc_id,doc in tf_idf.items():
        ind1 = vocab.index(word)
        ind2 = list(tf_idf.keys()).index(doc_id)
        TD[ind1][ind2] = tf_idf[doc_id][word]
print(TD)

[[ 3.38629436  0.          0.          0.          1.69314718]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 3.          7.          4.          2.          3.        ]
 [16.         18.         20.         12.         16.        ]
 [ 3.          7.          4.          2.          3.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          2.09861229  0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 4.19722458  0.          0.          0.          0.        ]
 [ 2.81093022  0.          1.40546511  1.40546511  0.        ]
 [ 1.          1.          1.          4.          1.        ]
 [ 2.09861229  0.          0.          0.          0.  

In [2]:
#NO 2
def edit_distance(string1, string2):
    if len(string1) > len(string2):
        difference = len(string1) - len(string2)
        string1[:difference]
        n = len(string2)
    elif len(string2) > len(string1):
        difference = len(string2) - len(string1)
        string2[:difference]
        n = len(string1)
    for i in range(n):
        if string1[i] != string2[i]:
            difference += 1
    return difference

def jaccard_sim(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def euclidian_dist(vec1, vec2):
    # subtracting vector
    temp = vec1 - vec2
    # doing dot product
    # for finding
    # sum of the squares
    sum_sq = np.dot(temp.T, temp)
    # Doing squareroot and
    # printing Euclidean distance
    return np.sqrt(sum_sq)

import math
def cosine_sim(vec1, vec2):
    vec1 = list(vec1)
    vec2 = list(vec2)
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))
    return dot_prod / (mag_1 * mag_2)

import os
import nltk
nltk.download('punkt')
import numpy as np
from nltk.metrics import edit_distance
from scipy.spatial.distance import jaccard, euclidean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math

def preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    tokens = nltk.word_tokenize(content)
    return ' '.join(tokens)

def calculate_similarity_matrix(documents, similarity_metric):
    if similarity_metric == 'Edit Distance':
        similarity_matrix = np.zeros((len(documents), len(documents)))
        for i in range(len(documents)):
            for j in range(i + 1, len(documents)):
                similarity_matrix[i][j] = edit_distance(documents[i], documents[j])
                similarity_matrix[j][i] = similarity_matrix[i][j]
    elif similarity_metric == 'Jaccard Similarity':
        similarity_matrix = np.zeros((len(documents), len(documents)))
        for i in range(len(documents)):
            for j in range(i + 1, len(documents)):
                set1 = set(documents[i].split())
                set2 = set(documents[j].split())
                intersection = len(set1.intersection(set2))
                union = len(set1.union(set2))
                similarity_matrix[i][j] = intersection / union
                similarity_matrix[j][i] = similarity_matrix[i][j]
    elif similarity_metric == 'Euclidean Distance':
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        similarity_matrix = np.zeros((len(documents), len(documents)))
        for i in range(len(documents)):
            for j in range(i + 1, len(documents)):
                similarity_matrix[i][j] = euclidean(tfidf_matrix[i].toarray(), tfidf_matrix[j].toarray())
                similarity_matrix[j][i] = similarity_matrix[i][j]
    elif similarity_metric == 'Cosine Similarity':
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        similarity_matrix = cosine_similarity(tfidf_matrix)
    else:
        similarity_matrix = None
    return similarity_matrix

folder_path = "C:/Users/Afi/Downloads/berita~/berita"
file_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

documents = [preprocess_text(file_path) for file_path in file_paths]

while True:
    print("\nBandingkan similarity matrix dengan:")
    print("1. Edit Distance")
    print("2. Jaccard Similarity")
    print("3. Euclidean Distance")
    print("4. Cosine Similarity")
    print("5. Keluar dari program")
    
    choice = input("Nomor menu yang ingin anda lakukan: ")
    
    if choice == '1':
        similarity_metric = 'Edit Distance'
    elif choice == '2':
        similarity_metric = 'Jaccard Similarity'
    elif choice == '3':
        similarity_metric = 'Euclidean Distance'
    elif choice == '4':
        similarity_metric = 'Cosine Similarity'
    elif choice == '5':
        print("Keluar dari program.")
        break
    else:
        print("Pilihan tidak valid. Silakan masukkan nomor yang sesuai.")
        continue
    
    similarity_matrix = calculate_similarity_matrix(documents, similarity_metric)
    
    if similarity_matrix is not None:
        print(f"Similarity Matrix using {similarity_metric}:")
        print(similarity_matrix)
   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Afi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.



Bandingkan similarity matrix dengan:
1. Edit Distance
2. Jaccard Similarity
3. Euclidean Distance
4. Cosine Similarity
5. Keluar dari program
Nomor menu yang ingin anda lakukan: 1
Similarity Matrix using Edit Distance:
[[  0. 541. 616. 483. 523.]
 [541.   0. 554. 519. 525.]
 [616. 554.   0. 606. 604.]
 [483. 519. 606.   0. 486.]
 [523. 525. 604. 486.   0.]]

Bandingkan similarity matrix dengan:
1. Edit Distance
2. Jaccard Similarity
3. Euclidean Distance
4. Cosine Similarity
5. Keluar dari program
Nomor menu yang ingin anda lakukan: 2
Similarity Matrix using Jaccard Similarity:
[[0.         0.10447761 0.10666667 0.1171875  0.11267606]
 [0.10447761 0.         0.248      0.10833333 0.0962963 ]
 [0.10666667 0.248      0.         0.15267176 0.12162162]
 [0.1171875  0.10833333 0.15267176 0.         0.13492063]
 [0.11267606 0.0962963  0.12162162 0.13492063 0.        ]]

Bandingkan similarity matrix dengan:
1. Edit Distance
2. Jaccard Similarity
3. Euclidean Distance
4. Cosine Similarity
5. 