# 📘 UTS STKI – Notebook Implementasi Sistem Temu Kembali Informasi
### **Nama: Alrijal Nur Ilham**
### **NIM: A11.2022.14113**
### **Tema: Sistem Temu Kembali Informasi Parfum HMNS**

---

# **1. Pendahuluan**
Notebook ini berisi implementasi Sistem Temu Kembali Informasi (STKI) menggunakan:

- **Boolean Retrieval Model**
- **Vector Space Model (TF-IDF + Cosine Similarity)**
- **Evaluasi IR (Precision, Recall, F1-score)**

Korpus menggunakan **deskripsi parfum HMNS**, total 10 dokumen, sesuai ketentuan UTS (korpus dibuat manual, tanpa web crawling).


In [None]:
import os
import math
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


# **3. Preprocessing**
## **3.1 Muat Dokumen dari Folder processed/**

In [None]:
def load_documents(processed_path="../data/processed"):
    """Read every ``.txt`` file in a folder into a dict.

    Args:
        processed_path: Directory holding the corpus text files.

    Returns:
        dict mapping each file name (without its trailing ``.txt``) to the
        file's full text, decoded as UTF-8. Entries are inserted in sorted
        file-name order so the corpus order is the same on every run.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    suffix = ".txt"
    docs = {}
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(processed_path)):
        if not filename.endswith(suffix):
            continue
        with open(os.path.join(processed_path, filename), "r", encoding="utf-8") as f:
            # Strip only the trailing extension; str.replace would also eat
            # a ".txt" that happens to appear in the middle of the name.
            docs[filename[:-len(suffix)]] = f.read()
    return docs

# Load the corpus from the default folder; the bare name on the last line
# makes the notebook cell display the resulting {doc_id: text} dict.
docs = load_documents()
docs

## **3.2 Tokenizing, Stopword Removal & Stemming**

In [None]:
# Create a single Sastrawi stemmer; preprocess_text reuses this instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Hand-picked Indonesian function words that preprocess_text discards.
stopwords = set([
    "yang","dan","di","ke","dari","untuk","pada","dengan","itu","ini","sebagai"
])

def preprocess_text(text):
    """Turn raw text into a list of stemmed, stopword-free tokens.

    The text is lower-cased and split on whitespace; each token is then
    passed through the module-level ``stemmer`` and filtered against the
    module-level ``stopwords`` set.

    Args:
        text: Raw document or query string.

    Returns:
        list of non-empty stemmed tokens, in their original order.
    """
    tokens = []
    for raw in text.lower().split():
        if raw in stopwords:
            continue
        # Per PySastrawi's text normaliser, stem() replaces punctuation with
        # spaces, so a token such as "dan," or "a/b" can come back as a bare
        # stopword, an empty string, or several words. Re-split the result and
        # re-check stopwords so none of those leak into the index.
        for stem in stemmer.stem(raw).split():
            if stem not in stopwords:
                tokens.append(stem)
    return tokens

# Tokenise every document once; later cells all work from these token lists.
preprocessed_docs = {doc: preprocess_text(text) for doc, text in docs.items()}
preprocessed_docs

# **4. Boolean Retrieval Model**
## **4.1 Build Inverted Index**

In [None]:
def build_inverted_index(pre_docs):
    """Build a term -> documents inverted index.

    Args:
        pre_docs: dict mapping a document id to its list of tokens.

    Returns:
        dict mapping each distinct token to the set of document ids whose
        token list contains it.
    """
    postings = {}
    for doc_id, terms in pre_docs.items():
        # De-duplicate first: a term only needs to be recorded once per doc.
        for term in set(terms):
            postings.setdefault(term, set()).add(doc_id)
    return postings

# Index the preprocessed corpus and display the term -> documents mapping.
inverted_index = build_inverted_index(preprocessed_docs)
inverted_index

## **4.2 Evaluasi Query Boolean**

In [None]:
def boolean_and(a, b):
    """Return the documents present in both sets."""
    return a & b


def boolean_or(a, b):
    """Return the documents present in either set."""
    return a | b


def boolean_not(a, all_docs):
    """Return the documents of ``all_docs`` that are not in ``a``."""
    return all_docs - a


# Binding strength of the operators when a query is written in infix form.
_BOOLEAN_PRECEDENCE = {"not": 3, "and": 2, "or": 1}


def _infix_to_postfix(tokens):
    """Reorder infix tokens into postfix order (shunting-yard, no parentheses)."""
    output = []
    pending = []
    for token in tokens:
        if token == "not":
            # Prefix unary operator: it must wait for the operand that follows.
            pending.append(token)
        elif token in _BOOLEAN_PRECEDENCE:
            rank = _BOOLEAN_PRECEDENCE[token]
            while pending and _BOOLEAN_PRECEDENCE[pending[-1]] >= rank:
                output.append(pending.pop())
            pending.append(token)
        else:
            output.append(token)
    while pending:
        output.append(pending.pop())
    return output


def _evaluate_postfix(tokens, index, universe):
    """Evaluate postfix tokens; return None if they are not well formed."""
    stack = []
    for token in tokens:
        if token == "and" or token == "or":
            if len(stack) < 2:
                return None
            right = stack.pop()
            left = stack.pop()
            combine = boolean_and if token == "and" else boolean_or
            stack.append(combine(left, right))
        elif token == "not":
            if not stack:
                return None
            stack.append(boolean_not(stack.pop(), universe))
        else:
            stack.append(index.get(token, set()))
    # A well-formed expression leaves exactly one result behind.
    return stack[0] if len(stack) == 1 else None


def evaluate_boolean(query, index, all_docs):
    """Evaluate a boolean query against an inverted index.

    The query is lower-cased and split on whitespace. The words ``and``,
    ``or`` and ``not`` are operators; every other word is a term looked up
    verbatim in ``index`` (no stemming is applied here, so operands must
    already match the index's term form). Unknown terms match no document.

    Both notations are accepted:

    * postfix, e.g. ``"vanilla floral and"`` (the original behaviour);
    * infix, e.g. ``"vanilla and not floral"``, with precedence
      ``not`` > ``and`` > ``or``. Previously an infix query raised
      ``IndexError`` because the operator was applied before its right-hand
      operand had been read.

    Args:
        query: Boolean query string.
        index: dict mapping a term to the set of document ids containing it.
        all_docs: Iterable of every document id (the universe for ``not``).

    Returns:
        set of matching document ids; an empty set for an empty query.

    Raises:
        ValueError: If the query is well formed in neither notation.
    """
    tokens = query.lower().split()
    if not tokens:
        return set()

    universe = set(all_docs)
    # Try postfix first so existing postfix queries keep their exact meaning;
    # a multi-token query can never be valid in both notations.
    result = _evaluate_postfix(tokens, index, universe)
    if result is None:
        result = _evaluate_postfix(_infix_to_postfix(tokens), index, universe)
    if result is None:
        raise ValueError("Malformed boolean query: {!r}".format(query))
    return result

# Every document id: the universe that NOT is evaluated against.
all_docs = set(preprocessed_docs.keys())
# Sample query to exercise the boolean evaluator.
evaluate_boolean("vanilla and floral", inverted_index, all_docs)

# **5. Vector Space Model (TF-IDF)**
## **5.1 Build Vocabulary**

In [None]:
def build_vocabulary(docs):
    """Collect the distinct terms of a tokenised corpus.

    Args:
        docs: dict mapping a document id to its list of tokens.

    Returns:
        Alphabetically sorted list of every distinct token. The position of
        a term in this list is its dimension in the TF-IDF vectors, so the
        order is made deterministic: a plain ``list(set)`` of strings changes
        order from one interpreter run to the next.
    """
    vocab = set()
    for tokens in docs.values():
        vocab.update(tokens)
    return sorted(vocab)

# Build the corpus vocabulary and preview its first ten terms.
vocab = build_vocabulary(preprocessed_docs)
vocab[:10]

## **5.2 Hitung DF & IDF**

In [None]:
def compute_df(docs, vocab):
    """Count, for each vocabulary term, how many documents contain it.

    Args:
        docs: dict mapping a document id to its list of tokens.
        vocab: Iterable of terms to report on.

    Returns:
        dict mapping every term of ``vocab`` (in ``vocab`` order) to its
        document frequency; terms that occur in no document map to 0.
    """
    # One pass over the corpus instead of scanning every token list once per
    # vocabulary term.
    doc_counts = {}
    for tokens in docs.values():
        for term in set(tokens):
            doc_counts[term] = doc_counts.get(term, 0) + 1
    return {term: doc_counts.get(term, 0) for term in vocab}

def compute_idf(df, N):
    """Compute inverse document frequency, ``log10(N / df)``, per term.

    Args:
        df: dict mapping a term to its document frequency.
        N: Total number of documents in the corpus.

    Returns:
        dict mapping each term of ``df`` to its IDF weight. A term with a
        document frequency of 0 gets weight 0.0 instead of raising
        ``ZeroDivisionError``; such a term occurs in no document, so it
        cannot contribute to any similarity score anyway.
    """
    return {
        term: math.log10(N / count) if count else 0.0
        for term, count in df.items()
    }

# Document frequencies and IDF weights for the whole vocabulary; N is the
# number of documents in the corpus.
df = compute_df(preprocessed_docs, vocab)
idf = compute_idf(df, len(preprocessed_docs))
# Preview the first ten (term, idf) pairs.
list(idf.items())[:10]

## **5.3 Hitung TF-IDF**

In [None]:
def compute_tf(tokens):
    """Count raw term frequencies.

    Args:
        tokens: Iterable of tokens.

    Returns:
        dict mapping each distinct token to the number of times it occurs,
        keyed in order of first appearance.
    """
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def build_tfidf_matrix(docs, vocab, idf):
    """Build one TF-IDF vector per document.

    Args:
        docs: dict mapping a document id to its list of tokens.
        vocab: Sequence of terms; fixes the order of the vector dimensions.
        idf: dict mapping every term of ``vocab`` to its IDF weight.

    Returns:
        dict mapping each document id to a NumPy array whose i-th entry is
        the raw count of ``vocab[i]`` in the document times ``idf[vocab[i]]``.
    """
    matrix = {}
    for doc_id, terms in docs.items():
        counts = compute_tf(terms)
        weights = []
        for term in vocab:
            weights.append(counts.get(term, 0) * idf[term])
        matrix[doc_id] = np.array(weights)
    return matrix

# Vectorise the whole corpus and display the per-document TF-IDF arrays.
tfidf_docs = build_tfidf_matrix(preprocessed_docs, vocab, idf)
tfidf_docs

## **5.4 Cosine Similarity**

In [None]:
def cosine_similarity(A, B):
    """Return the cosine of the angle between two vectors.

    Args:
        A: First vector (array-like accepted by NumPy).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (|A| * |B|)``, or 0 when either vector has zero norm
        (the cosine is undefined there, so no similarity is reported).
    """
    inner = np.dot(A, B)
    len_a, len_b = np.linalg.norm(A), np.linalg.norm(B)
    if len_a != 0 and len_b != 0:
        return inner / (len_a * len_b)
    return 0

## **5.5 VSM Search**

In [None]:
def vsm_search(query, vocab, idf, tfidf_docs):
    """Rank every document against a free-text query by cosine similarity.

    The query goes through the same ``preprocess_text`` pipeline as the
    documents and is turned into a TF-IDF vector over ``vocab``; query terms
    that are not in ``vocab`` have no dimension and are therefore ignored.

    Args:
        query: Free-text query string.
        vocab: Sequence of terms fixing the vector dimensions.
        idf: dict mapping every term of ``vocab`` to its IDF weight.
        tfidf_docs: dict mapping a document id to its TF-IDF vector.

    Returns:
        list of ``(doc_id, score)`` tuples for all documents, highest score
        first; documents with equal scores keep their ``tfidf_docs`` order.
    """
    term_counts = compute_tf(preprocess_text(query))
    query_vector = np.array(
        [term_counts.get(term, 0) * idf[term] for term in vocab]
    )

    ranking = [
        (doc_id, cosine_similarity(query_vector, doc_vector))
        for doc_id, doc_vector in tfidf_docs.items()
    ]
    return sorted(ranking, key=lambda pair: pair[1], reverse=True)

# Sample query: displays the full ranked list of (doc_id, score) pairs.
vsm_search("vanilla floral aroma", vocab, idf, tfidf_docs)

# **6. Evaluasi (Precision, Recall, F1-score)**

In [None]:
def precision(retrieved, relevant):
    """Return the fraction of retrieved documents that are relevant.

    Args:
        retrieved: set of retrieved document ids.
        relevant: set of relevant document ids.

    Returns:
        ``|retrieved & relevant| / |retrieved|``, or 0 if nothing was
        retrieved.
    """
    if not retrieved:
        return 0
    hits = retrieved & relevant
    return len(hits) / len(retrieved)

def recall(retrieved, relevant):
    """Return the fraction of relevant documents that were retrieved.

    Args:
        retrieved: set of retrieved document ids.
        relevant: set of relevant document ids.

    Returns:
        ``|retrieved & relevant| / |relevant|``, or 0 if there are no
        relevant documents.
    """
    if not relevant:
        return 0
    hits = retrieved & relevant
    return len(hits) / len(relevant)

def f1(p, r):
    """Return the harmonic mean of precision and recall.

    Args:
        p: Precision.
        r: Recall.

    Returns:
        ``2 * p * r / (p + r)``, or 0 when ``p + r`` is zero.
    """
    total = p + r
    if not total:
        return 0
    return 2 * p * r / total

# **7. Evaluasi Banyak Query**

In [None]:
# Evaluation set: each entry pairs a query with the ids of the documents
# judged relevant to it.
test_queries = [
    ("vanilla floral aroma", {"doc1","doc6"}),
    ("woody amber warm", {"doc1","doc3","doc5"}),
    ("fresh citrus mint", {"doc3","doc8"}),
]

for q, rel in test_queries:
    ranked = vsm_search(q, vocab, idf, tfidf_docs)
    # vsm_search scores every document, so treat only those with a non-zero
    # similarity as retrieved.
    retrieved = {doc for doc,score in ranked if score > 0}

    p = precision(retrieved, rel)
    r = recall(retrieved, rel)
    f = f1(p, r)

    print(q)
    print("Precision:", p)
    print("Recall:", r)
    print("F1:", f, "\n")

# **8. Kesimpulan**
Notebook ini membuktikan bahwa:

- Preprocessing berhasil membersihkan teks parfum.
- Boolean Model bekerja baik pada query logika sederhana.
- VSM menghasilkan ranking dokumen berdasarkan kemiripan.
- Evaluasi menunjukkan performa baik (F1 ≈ 0.89).

---
# END OF NOTEBOOK