<a href="https://colab.research.google.com/github/rarizzt/IR_Project/blob/main/2_Modelling_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import drive
# Mount Google Drive so files under MyDrive are reachable from this Colab runtime.
drive.mount('/content/drive')

# Read the preprocessed dataset CSV from Drive into a DataFrame.
df = pd.read_csv(
    '/content/drive/MyDrive/TKI/dataset_ir_250_preprocessed_lengkap.csv'
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#1. INSTALASI & IMPORT ---
!pip install rank_bm25

import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from tqdm import tqdm

print("Library siap digunakan.")

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Library siap digunakan.


In [None]:
# --- 2. LOAD DATASET ---
path = '/content/drive/MyDrive/TKI/dataset_ir_250_preprocessed_lengkap.csv'

# Read the CSV, then replace missing clean_text values with empty strings.
df = pd.read_csv(path)
df = df.assign(clean_text=df['clean_text'].fillna(''))

# The CSV stores each token list as its string repr; parse it back into a list.
print("Mengonversi token kembali ke format List...")
df['tokens'] = df['tokens'].map(ast.literal_eval)

print(f"Data berhasil dimuat: {len(df)} baris.")

# Show the first five tokens of the first document as a sanity check.
first_doc_tokens = df['tokens'].iloc[0]
print("Contoh Tokens:", first_doc_tokens[:5])

Mengonversi token kembali ke format List...
Data berhasil dimuat: 250 baris.
Contoh Tokens: ['qatar', 'airways', 'rute', 'balidoha', 'batal']


In [None]:
# --- 3. TRAINING MODEL ---

# TF-IDF: fit the vectorizer on the cleaned text and keep the resulting
# document-term matrix for later similarity queries.
print("Melatih Model TF-IDF...")
vectorizer = TfidfVectorizer()
clean_documents = df['clean_text']
tfidf_matrix = vectorizer.fit_transform(clean_documents)
print("TF-IDF Selesai.")

# BM25: index the pre-tokenized documents (one list of tokens per row).
print("Melatih Model BM25...")
tokenized_corpus = list(df['tokens'])
bm25 = BM25Okapi(tokenized_corpus)
print("BM25 Selesai.")

Melatih Model TF-IDF...
TF-IDF Selesai.
Melatih Model BM25...
BM25 Selesai.


In [None]:
# --- 4. RUN THE SEARCH (TF-IDF & BM25) ---

queries = {
    "Q1": "halal",
    "Q2": "transmart",
    "Q3": "rupiah",
    "Q4": "belanja",
    "Q5": "perang",
    "Q6": "menteri",
    "Q7": "minyak",
    "Q8": "energi",
    "Q9": "saham",
    "Q10": "tarif"
}

# Column layout of the result table. Passing it explicitly keeps the schema
# (and lets the sort below run) even when no document matches any query.
result_columns = [
    'Query_ID', 'Query_Keyword', 'Doc_ID', 'Doc_Title', 'TFIDF_Score', 'BM25_Score'
]

final_results = []

print("Mulai memproses 10 Query...")

for q_id, keyword in tqdm(queries.items()):
    normalized_query = keyword.lower()

    # TF-IDF: cosine similarity between the query vector and every document.
    query_vec = vectorizer.transform([normalized_query])
    tfidf_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # BM25: score every document against the whitespace-split query.
    query_tokens = normalized_query.split()
    bm25_scores = np.asarray(bm25.get_scores(query_tokens))

    # Keep a document when at least one of the two models scores it above zero.
    # Selecting the hits with a boolean mask avoids a Python-level pass over
    # every document for every query.
    matched_ids = np.flatnonzero((tfidf_scores > 0) | (bm25_scores > 0))
    for idx in matched_ids:
        final_results.append({
            'Query_ID': q_id,
            'Query_Keyword': keyword,
            'Doc_ID': int(idx),
            'Doc_Title': df['title'].iloc[idx],
            'TFIDF_Score': round(tfidf_scores[idx], 4),
            'BM25_Score': round(bm25_scores[idx], 4)
        })


df_final = pd.DataFrame(final_results, columns=result_columns)

# Order queries as declared in `queries` (Q1, Q2, ..., Q10). A plain string
# sort on Query_ID would place "Q10" between "Q1" and "Q2".
query_order = {query_id: position for position, query_id in enumerate(queries)}
df_final = df_final.sort_values(
    by=['Query_ID', 'BM25_Score'],
    ascending=[True, False],
    key=lambda col: col.map(query_order) if col.name == 'Query_ID' else col,
)

print("\nSelesai!")
print(f"Total baris relevan yang ditemukan: {len(df_final)}")

Mulai memproses 10 Query...


100%|██████████| 10/10 [00:00<00:00, 225.02it/s]


Selesai!
Total baris relevan yang ditemukan: 199





In [None]:
# --- 5. SAVE TO CSV ---
output_path = '/content/drive/MyDrive/TKI/hasil_ranking_final5.csv'

# Write the ranking table without the DataFrame index column.
df_final.to_csv(output_path, index=False)

# Confirmation banner: the message and path framed by two 40-character rules.
separator = "=" * 40
print(separator, "FILE FINAL BERHASIL DIBUAT", f"Lokasi: {output_path}", separator, sep="\n")

# Preview the first ten rows, leaving out the Doc_ID column.
preview_columns = ['Query_ID', 'Query_Keyword', 'Doc_Title', 'TFIDF_Score', 'BM25_Score']
print("\nPreview Data:")
print(df_final[preview_columns].head(10))

FILE FINAL BERHASIL DIBUAT
Lokasi: /content/drive/MyDrive/TKI/hasil_ranking_final5.csv

Preview Data:
   Query_ID Query_Keyword                                          Doc_Title  \
1        Q1         halal  IIHF 2025, ESQ Halal Center Rekrut Ribuan P3H ...   
13       Q1         halal  LPPOM Dukung IIHF 2025, Wujudkan Indonesia Jad...   
6        Q1         halal  BPJPH Buka 10 Ribu Lowongan Jadi Pendamping Se...   
2        Q1         halal  ID Survey Edukasi Pengunjung soal Ruang Lingku...   
0        Q1         halal  Alfamart di IIHF 2025, Dukung Visi Halal Globa...   
4        Q1         halal  BPJPH Tengah Susun Aturan Baru Mudahkan Sertif...   
5        Q1         halal  BPJPH Sebut Banyak Pelaku Usaha Tak Tertib Ser...   
8        Q1         halal  Bos BPJP: Ayam Goreng Widuran Belum Pernah Daf...   
9        Q1         halal  Bos BPJPH Wanti-wanti Label 'No Pork No Lard' ...   
10       Q1         halal  Babe Haikal Prediksi Ekspor Produk Halal RI Te...   

    TFIDF_Score  

**EVALUASI**

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the dataset; missing clean_text values become empty strings
path = '/content/drive/MyDrive/TKI/dataset_ir_250_preprocessed_lengkap.csv'
df = pd.read_csv(path)
df['clean_text'] = df['clean_text'].fillna('')

# 2. Fit TF-IDF on the cleaned text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# 3. Score every document against the query "transmart"
query = "transmart"
query_vec = vectorizer.transform([query])
scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

# 4. Take the five highest-scoring documents (descending score order)
top_indices = scores.argsort()[::-1][:5]

# One table row per top document, skipping any whose score is not positive.
# Ranks are 1-based positions within the top five.
results = [
    {
        'Peringkat': rank,
        'ID Dokumen': idx,
        'Judul Berita': df['title'].iloc[idx],
        'Skor TF-IDF': round(scores[idx], 4),
    }
    for rank, idx in enumerate(top_indices, start=1)
    if scores[idx] > 0
]

# 5. Show the table
df_table_42 = pd.DataFrame(results)
print(df_table_42.to_string(index=False))

 Peringkat  ID Dokumen                                                          Judul Berita  Skor TF-IDF
         1          30    Jangan Lupa ke Transmart Full Day Sale Besok, Ada Diskon 50% + 20%       0.4156
         2         110 Buah Segar Diskon 20% di Transmart Full Day Sale, Jeruk Wokam Rp2.792       0.4007
         3         111   Diskon Besar Transmart Kota Kasablanka, Sepeda Listrik Jadi Incaran       0.3942
         4         201       Belanja Baju Anak di Transmart Full Day Sale, Diskon hingga 70%       0.3857
         5         204     Sepeda Diskon Lagi di Transmart Full Day Sale, Jadi Cuma Rp1 Juta       0.3773


In [7]:
import pandas as pd
import ast
from rank_bm25 import BM25Okapi

# 1. Load the dataset
path = '/content/drive/MyDrive/TKI/dataset_ir_250_preprocessed_lengkap.csv'
df = pd.read_csv(path)

# Make sure every tokens cell is a list rather than its string repr.
# Only string cells are parsed; values that are already lists pass through
# unchanged. A blanket try/except would hide a malformed cell and leave the
# whole column as raw strings, which BM25 would then presumably index
# character by character. Here a malformed cell raises immediately.
df['tokens'] = df['tokens'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# 2. Build the BM25 index over the tokenized documents
tokenized_corpus = df['tokens'].tolist()
bm25 = BM25Okapi(tokenized_corpus)

# 3. Score every document against the query "transmart"
# Lower-case before splitting, matching how the main search cell builds its
# BM25 query tokens.
query = "transmart"
query_tokens = query.lower().split()
scores = bm25.get_scores(query_tokens)

# 4. Take the five highest-scoring documents (descending score order)
top_indices = scores.argsort()[::-1][:5]

results = []
for rank, idx in enumerate(top_indices, start=1):
    # Skip documents that do not match the query at all.
    if scores[idx] > 0:
        results.append({
            'Peringkat': rank,
            'ID Dokumen': idx,
            'Judul Berita': df['title'].iloc[idx],
            'Skor BM25': round(scores[idx], 4)
        })

# 5. Show the table
df_table_43 = pd.DataFrame(results)
print(df_table_43.to_string(index=False))

 Peringkat  ID Dokumen                                                          Judul Berita  Skor BM25
         1          30    Jangan Lupa ke Transmart Full Day Sale Besok, Ada Diskon 50% + 20%     3.6391
         2         110 Buah Segar Diskon 20% di Transmart Full Day Sale, Jeruk Wokam Rp2.792     3.4793
         3         201       Belanja Baju Anak di Transmart Full Day Sale, Diskon hingga 70%     3.4485
         4         111   Diskon Besar Transmart Kota Kasablanka, Sepeda Listrik Jadi Incaran     3.3885
         5         169      Jeruk, Apel, hingga Anggur Diskon 20% di Transmart Full Day Sale     3.3885


In [8]:
import pandas as pd
import numpy as np

# A. QUERY LIST
# Query IDs Q1..Q10 are assigned by position in this keyword list.
queries = {
    f"Q{number}": keyword
    for number, keyword in enumerate(
        [
            "halal", "transmart", "rupiah", "belanja", "perang",
            "menteri", "minyak", "energi", "saham", "tarif",
        ],
        start=1,
    )
}

# B. MANUAL GROUND TRUTH
# For each query ID, the hand-labelled list of relevant document IDs.
manual_ground_truth = dict(
    # halal
    Q1=[1, 28, 31, 33, 38, 39, 40, 46, 49, 51, 55, 84, 105, 109],
    # transmart
    Q2=[
        13, 17, 30, 32, 35, 37, 41, 43, 110, 111, 113, 116, 166, 167, 169,
        170, 171, 190, 194, 195, 197, 198, 200, 201, 202, 204, 247, 248, 249,
    ],
    # rupiah
    Q3=[13, 41, 113, 167, 170, 194, 197, 249],
    # belanja
    Q4=[81, 85, 87, 120, 124, 198, 201],
    # perang
    Q5=[3, 5, 8, 87, 95, 234],
    # menteri
    Q6=[
        2, 3, 8, 10, 15, 21, 22, 23, 24, 27, 36, 60, 62, 68, 85, 91, 94,
        100, 104, 108, 113, 121, 122, 123, 125, 131, 133, 134, 137, 141,
        143, 144, 160, 183, 184, 196, 214, 217, 221, 222, 225, 227, 229,
        230, 232, 233, 236, 243, 245,
    ],
    # minyak
    Q7=[7, 10, 21, 22, 23, 87, 112, 132, 144, 156],
    # energi
    Q8=[
        2, 8, 10, 11, 16, 21, 22, 23, 50, 57, 64, 86, 126, 144, 145, 160,
        196, 217, 221, 235,
    ],
    # saham
    Q9=[26, 47, 64, 70, 168, 182, 186, 188, 246],
    # tarif
    Q10=[18, 65, 72, 119, 210, 211, 215, 228, 234],
)

# C. COMPUTE EVALUATION (AUTOMATIC)
metrics_data = list()

def get_ap_precision_recall(model_type, query_text, relevant_ids, k=5):
    """Score one query with one model and return (Precision@k, Recall@k, AP).

    Relies on the notebook-level objects `vectorizer`, `tfidf_matrix`,
    `cosine_similarity` and `bm25` created in earlier cells.

    Args:
        model_type: 'TF-IDF' selects cosine similarity over the TF-IDF matrix;
            any other value selects BM25.
        query_text: Raw query string.
        relevant_ids: Iterable of relevant document IDs, compared against the
            positional indices of the score array. Duplicates are ignored.
        k: Cut-off for Precision@k and Recall@k. Must be positive.

    Returns:
        Tuple (precision, recall, ap). Returns (0, 0, 0) when `relevant_ids`
        is empty.

    Raises:
        ValueError: If `k` is not positive (and `relevant_ids` is non-empty).
    """
    if not relevant_ids:
        return 0, 0, 0
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # A set gives O(1) membership tests and drops duplicate IDs, which would
    # otherwise inflate the recall and AP denominators.
    relevant = set(relevant_ids)

    if model_type == 'TF-IDF':
        q_vec = vectorizer.transform([query_text])
        scores = cosine_similarity(q_vec, tfidf_matrix).flatten()
    else:
        # Lower-case before splitting, matching how the search cell builds its
        # BM25 query tokens.
        scores = bm25.get_scores(query_text.lower().split())

    # Document indices from highest to lowest score, as plain Python ints.
    ranking = np.asarray(scores).argsort()[::-1].tolist()

    # Precision@k and Recall@k
    hits = len(relevant.intersection(ranking[:k]))
    precision = hits / k
    recall = hits / len(relevant)

    # Average Precision: mean of precision@rank taken at each relevant hit,
    # normalised by the number of relevant documents.
    precision_sum = 0.0
    found = 0
    for position, doc_id in enumerate(ranking, start=1):
        if doc_id in relevant:
            found += 1
            precision_sum += found / position
            if found == len(relevant):
                break  # every relevant document has been seen
    ap = precision_sum / len(relevant)

    return precision, recall, ap

# Collect per-query metrics, then average them per model.
tfidf_p, tfidf_r, tfidf_ap = [], [], []
bm25_p, bm25_r, bm25_ap = [], [], []

# Model name -> its (precision, recall, AP) accumulator lists, in the same
# order as the tuple returned by get_ap_precision_recall.
collectors = {
    'TF-IDF': (tfidf_p, tfidf_r, tfidf_ap),
    'BM25': (bm25_p, bm25_r, bm25_ap),
}

for q_id, keyword in queries.items():
    rel = manual_ground_truth[q_id]

    # Evaluate TF-IDF first, then BM25, with the same cut-off k=5.
    for model_name, buckets in collectors.items():
        query_metrics = get_ap_precision_recall(model_name, keyword, rel, k=5)
        for bucket, value in zip(buckets, query_metrics):
            bucket.append(value)

# Each model column holds the mean over all queries of P@5, R@5 and AP.
data_final = {
    'Metrik': ['Rata-rata Precision@5', 'Rata-rata Recall@5', 'Mean Average Precision (MAP)'],
    'Model TF-IDF': [np.mean(values) for values in collectors['TF-IDF']],
    'Model BM25': [np.mean(values) for values in collectors['BM25']],
}

df_table_44 = pd.DataFrame(data_final)
print(df_table_44.round(4).to_string(index=False))

                      Metrik  Model TF-IDF  Model BM25
       Rata-rata Precision@5        0.9600      0.9600
          Rata-rata Recall@5        0.4625      0.4625
Mean Average Precision (MAP)        0.9559      0.9530
