In [10]:
import pandas as pd
import numpy as np
import math


In [8]:

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog is fast and the cat is not.",
    "Quick brown foxes leap over lazy dogs in summer.",
    "The sun shines bright in the sky.",
    "Birds fly high in the clear blue sky."
]


In [13]:


# Step 1: Calculate term frequencies (TF)
def calculate_tf(document, term):
    term_count = document.count(term)
    return term_count / len(document.split())

# Step 2: Calculate inverse document frequencies (IDF)
def calculate_idf(documents, term):
    num_documents = len(documents)
    num_documents_with_term = sum(1 for doc in documents if term in doc)
    return np.log(num_documents / (1 + num_documents_with_term))

# Step 3: Calculate TF-IDF
def calculate_tfidf(documents, term):
    idf = calculate_idf(documents, term)
    tfs = [calculate_tf(doc, term) for doc in documents]
    return [tf * idf for tf in tfs]


In [17]:
# Step 4: Collect all unique terms
all_terms = set()
for doc in documents:
    all_terms.update(doc.lower().split())

# Step 5: Calculate TF-IDF for all terms
tfidf_matrix = []
for term in all_terms:
    tfidf_scores = calculate_tfidf(documents, term)
    tfidf_matrix.append(tfidf_scores)

# Step 6: Create Pandas DataFrame
all_terms_list = list(all_terms)
tfidf_df = pd.DataFrame(tfidf_matrix, index=all_terms_list, columns=[f"Doc{i+1}" for i in range(len(documents))])
tfidf_df


Unnamed: 0,Doc1,Doc2,Doc3,Doc4,Doc5
bright,0.0,0.0,0.0,0.130899,0.0
fast,0.0,0.10181,0.0,0.0,0.0
foxes,0.0,0.0,0.10181,0.0,0.0
blue,0.0,0.0,0.0,0.0,0.114536
brown,0.056758,0.0,0.056758,0.0,0.0
over,0.056758,0.0,0.056758,0.0,0.0
high,0.0,0.0,0.0,0.0,0.114536
lazy,0.056758,0.0,0.056758,0.0,0.0
dogs,0.0,0.0,0.10181,0.0,0.0
leap,0.0,0.0,0.10181,0.0,0.0


In [19]:
# Step 3: Calculate BM25
def calculate_bm25(documents, term, avg_doc_length, k1=1.5, b=0.75):
    idf = calculate_idf(documents, term)
    tfs = [calculate_tf(doc, term) for doc in documents]
    doc_lengths = [len(doc.split()) for doc in documents]
    bm25_scores = []
    for tf, doc_length in zip(tfs, doc_lengths):
        numerator = (k1 + 1) * tf
        denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
        bm25_scores.append(idf * (numerator / denominator))
    return bm25_scores


In [20]:

# Step 5: Calculate BM25 for all terms
bm25_matrix = []
avg_doc_length = sum(len(doc.split()) for doc in documents) / len(documents)
for term in all_terms:
    bm25_scores = calculate_bm25(documents, term, avg_doc_length)
    bm25_matrix.append(bm25_scores)

In [25]:
# Step 6: Create Pandas DataFrame
bm25_df = pd.DataFrame(bm25_matrix, index=list(all_terms), columns=[f"Doc{i+1}" for i in range(len(documents))])
bm25_df

Unnamed: 0,Doc1,Doc2,Doc3,Doc4,Doc5
bright,0.0,0.0,0.0,0.224857,0.0
fast,0.0,0.150476,0.0,0.0,0.0
foxes,0.0,0.0,0.150476,0.0,0.0
blue,0.0,0.0,0.0,0.0,0.182217
brown,0.083889,0.0,0.083889,0.0,0.0
over,0.083889,0.0,0.083889,0.0,0.0
high,0.0,0.0,0.0,0.0,0.182217
lazy,0.083889,0.0,0.083889,0.0,0.0
dogs,0.0,0.0,0.150476,0.0,0.0
leap,0.0,0.0,0.150476,0.0,0.0


In [26]:
# Step 7: Save the DataFrame to a CSV file
bm25_df.to_csv('bm25_results.csv', index=True)

In [27]:
# Save DataFrame to CSV
tfidf_df.to_csv('tfidf_results.csv', index=True)

print("TF-IDF results saved to tfidf_results.csv")

TF-IDF results saved to tfidf_results.csv


In [28]:
# Hitung cosine similarity secara manual
def calculate_cosine_similarity(tfidf_df):
    # Normalisasi TF-IDF
    tfidf_df_norm = tfidf_df.div(tfidf_df.pow(2).sum(axis=0).pow(0.5), axis=1)
    
    # Hitung cosine similarity
    cosine_sim = tfidf_df_norm.T.dot(tfidf_df_norm)
    
    return cosine_sim

# Hitung cosine similarity
cosine_sim = calculate_cosine_similarity(tfidf_df)

print("Cosine Similarity Matrix:")
print(cosine_sim)

Cosine Similarity Matrix:
          Doc1      Doc2      Doc3      Doc4      Doc5
Doc1  1.000000  0.010071  0.271183  0.000000  0.000000
Doc2  0.010071  1.000000  0.009022  0.000000  0.000000
Doc3  0.271183  0.009022  1.000000  0.027195  0.012252
Doc4  0.000000  0.000000  0.027195  1.000000  0.109052
Doc5  0.000000  0.000000  0.012252  0.109052  1.000000


In [31]:
def calculate_euclidean_distance_similarity(tfidf_df):
    # Normalisasi TF-IDF
    tfidf_df_norm = tfidf_df.div(tfidf_df.pow(2).sum(axis=0).pow(0.5), axis=1)
    
    # Hitung euclidean distance similarity
    distances = np.sqrt(np.sum((tfidf_df_norm.values[:, None] - tfidf_df_norm.values)**2, axis=-1))
    euclidean_distance_sim = 1 - (distances / distances.max())
    
    return pd.DataFrame(euclidean_distance_sim, index=tfidf_df.index, columns=tfidf_df.index)

# Hitung euclidean distance similarity
euclidean_distance_sim = calculate_euclidean_distance_similarity(tfidf_df)

print("Euclidean Distance Similarity Matrix:")
print(euclidean_distance_sim)

Euclidean Distance Similarity Matrix:
           bright      fast     foxes      blue     brown      over      high  \
bright   1.000000  0.277750  0.224201  0.189906  0.272324  0.272324  0.189906   
fast     0.277750  1.000000  0.368238  0.326568  0.428362  0.428362  0.326568   
foxes    0.224201  0.368238  1.000000  0.269431  0.626155  0.626155  0.269431   
blue     0.189906  0.326568  0.269431  1.000000  0.320752  0.320752  1.000000   
brown    0.272324  0.428362  0.626155  0.320752  1.000000  1.000000  0.320752   
over     0.272324  0.428362  0.626155  0.320752  1.000000  1.000000  0.320752   
high     0.189906  0.326568  0.269431  1.000000  0.320752  0.320752  1.000000   
lazy     0.272324  0.428362  0.626155  0.320752  1.000000  1.000000  0.320752   
dogs     0.224201  0.368238  1.000000  0.269431  0.626155  0.626155  0.269431   
leap     0.224201  0.368238  1.000000  0.269431  0.626155  0.626155  0.269431   
sky.     0.597116  0.397162  0.333942  0.587508  0.390672  0.390672  0.

In [30]:
# Hitung Euclidean distance secara manual
def calculate_euclidean_distance(tfidf_df):
    # Normalisasi TF-IDF
    tfidf_df_norm = tfidf_df.div(tfidf_df.pow(2).sum(axis=0).pow(0.5), axis=1)
    
    # Hitung Euclidean distance
    euclidean_dist = np.sqrt(2 - 2 * tfidf_df_norm.T.dot(tfidf_df_norm))
    
    return euclidean_dist

# Hitung Euclidean distance
euclidean_dist = calculate_euclidean_distance(tfidf_df)

print("Euclidean Distance Matrix:")
print(euclidean_dist)

Euclidean Distance Matrix:
          Doc1          Doc2      Doc3      Doc4          Doc5
Doc1  0.000000  1.407074e+00  1.207326  1.414214  1.414214e+00
Doc2  1.407074  1.490116e-08  1.407820  1.414214  1.414214e+00
Doc3  1.207326  1.407820e+00  0.000000  1.394852  1.405524e+00
Doc4  1.414214  1.414214e+00  1.394852       NaN  1.334877e+00
Doc5  1.414214  1.414214e+00  1.405524  1.334877  2.107342e-08


  result = func(self.values, **kwargs)
