## Expected Steps for LSI model
- Build the TF–IDF matrix $W$
- Apply Singular Value Decomposition (SVD), using the numpy decomposition
function

      U, S, VT = np.linalg.svd(W, full_matrices=False)
- Reduce Dimensionality to K=3
- Represent the queries as binary vectors (1 if the term occurs in the query, 0 otherwise)
- Project them into the new latent semantic space dimensions using the formula given
in the lecture notes
- Compute similarity between the queries and all documents, using the formula given
in the lecture notes
- Rank the Documents for each query in decreasing order.

In [1]:
import numpy as np
import pandas as pd

# --------------------------------------------------
# Step 1: Read the inverted file
# Format expected: term \t doc \t freq \t tfidf
# Example line: 10%    D2    2    0.281699
# --------------------------------------------------
file_path = "results/inverted_index_weighted.txt"

# Fields are split on any run of whitespace (tabs or spaces).
# The term and doc columns are forced to str and pandas' default NA-string
# list is disabled: otherwise vocabulary entries such as "null", "nan" or
# "NA" would be parsed as missing values, which corrupts the vocabulary and
# makes sorted() fail on a mix of float and str. Empty numeric fields are
# still treated as missing.
df = pd.read_csv(
    file_path,
    sep=r"\s+",
    header=None,
    names=["term", "doc", "freq", "tfidf"],
    dtype={"term": str, "doc": str},
    keep_default_na=False,
    na_values={"freq": [""], "tfidf": [""]},
)

# --------------------------------------------------
# Step 2: Build the TF-IDF matrix W (terms x documents)
# --------------------------------------------------
# Row and column labels, in sorted order
terms = sorted(df["term"].unique())
docs = sorted(df["doc"].unique())

# Pivot the postings into a dense matrix in one vectorised pass instead of
# assigning cell by cell. keep="last" reproduces the "last posting wins"
# behaviour of sequential assignment if a (term, doc) pair is repeated;
# pairs that never occur get a weight of 0.0.
W = (
    df.drop_duplicates(subset=["term", "doc"], keep="last")
    .pivot(index="term", columns="doc", values="tfidf")
    .reindex(index=terms, columns=docs)
    .fillna(0.0)
    .astype(float)
)
# Drop the axis names added by pivot so the printed table keeps a plain header.
W.index.name = None
W.columns.name = None

print("TF‚ÄìIDF Matrix W:")
print(W)

# Plain numpy array (terms x documents) for the linear algebra below
W_matrix = W.to_numpy()

# --------------------------------------------------
# Step 3: Singular Value Decomposition of W
# --------------------------------------------------
# Thin SVD: W = U @ diag(S) @ VT
U, S, VT = np.linalg.svd(W_matrix, full_matrices=False)

# --------------------------------------------------
# Step 4: Keep only the first K latent dimensions
# --------------------------------------------------
K = 3
U_k, VT_k = U[:, :K], VT[:K, :]   # first K left / right singular vectors
S_k = np.diag(S[:K])              # K x K diagonal matrix of the first K singular values

# --------------------------------------------------
# Step 5: Represent queries as binary term vectors
# (1 if the term occurs in the query, 0 otherwise)
# --------------------------------------------------
# q1: large language models for information retrieval and ranking
# q2: LLM for information retrieval and Ranking
# q3: query Reformulation in information retrieval
# q4: ranking Documents
# q5: Optimizing recommendation systems with LLMs by leveraging item metadata
queries = {
    "q1": ["large", "language", "models", "information", "retrieval", "ranking"],
    "q2": ["LLM", "information", "retrieval", "Ranking"],
    "q3": ["query", "Reformulation", "information", "retrieval"],
    "q4": ["ranking", "Documents"],
    "q5": ["Optimizing", "recommendation", "systems", "LLMs", "leveraging", "item", "metadata"],
}

# Query terms are written with mixed case ("LLM", "Ranking", ...). An exact,
# case-sensitive lookup silently drops them whenever the vocabulary stores the
# lowercase form (NOTE(review): the index vocabulary appears to be lowercased -
# confirm against the indexing step). Try the exact spelling first, then fall
# back to a case-insensitive match.
vocabulary = set(terms)
lowercase_lookup = {}
for vocab_term in terms:
    # setdefault keeps the first vocabulary entry for each lowercase form
    lowercase_lookup.setdefault(str(vocab_term).lower(), vocab_term)

Q = pd.DataFrame(0, index=list(queries), columns=terms)
unmatched = {}  # query name -> query terms that are not in the vocabulary
for q, q_terms in queries.items():
    for t in q_terms:
        column = t if t in vocabulary else lowercase_lookup.get(t.lower())
        if column is None:
            unmatched.setdefault(q, []).append(t)
        else:
            Q.loc[q, column] = 1

print("\nQuery term matrix:")
print(Q)

# Report out-of-vocabulary terms instead of dropping them silently: they do
# not contribute to the query vector and therefore not to the ranking.
if unmatched:
    print("\nQuery terms not found in the vocabulary (ignored):")
    for q, missing in unmatched.items():
        print(f"  {q}: {missing}")

# --------------------------------------------------
# Step 6: Project queries into the latent semantic space
# Formula: q' = q * U_k * S_k^-1
# --------------------------------------------------
Q_matrix = Q.to_numpy()
S_inv = np.linalg.inv(S_k)
Q_latent = Q_matrix @ U_k @ S_inv  # shape: queries x K

# Documents must live in the same coordinates as the folded-in queries.
# Applying the same fold-in to a document column d of W gives
#   d' = d^T * U_k * S_k^-1 = (row of V_k),
# so the document vectors are simply the rows of V_k. Scaling them by S_k
# (V_k * S_k) while the queries are scaled by S_k^-1 would weight every latent
# axis by S_k^2 on one side only and distort the cosine similarities.
D_latent = VT_k.T  # shape: docs x K

# --------------------------------------------------
# Step 7: Cosine similarity between each query and document
# --------------------------------------------------
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    A small constant (1e-10) is added to the product of the norms so that an
    all-zero vector yields 0.0 instead of a division by zero.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    return numerator / denominator

# For every query, score all documents and keep them ordered best-first.
similarities = {}
for q_idx, q_name in enumerate(Q.index):
    query_vec = Q_latent[q_idx]
    scores = {
        doc_name: cosine_similarity(query_vec, D_latent[d_idx])
        for d_idx, doc_name in enumerate(docs)
    }
    # Decreasing similarity; dicts preserve insertion order, so the ranking is kept
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    similarities[q_name] = dict(ranked)

# --------------------------------------------------
# Step 8: Display ranked documents for each query
# --------------------------------------------------
print("\n=== Ranked Documents ===")
for q_name, ranking in similarities.items():
    print(f"\n{q_name}:")
    for doc_name, score in ranking.items():
        print(f"  {doc_name}: {score:.4f}")


TF‚ÄìIDF Matrix W:
             D1        D2        D3      D4        D5        D6
1      0.000000  0.000000  0.000000  0.0000  0.000000  0.105637
10%    0.000000  0.281699  0.000000  0.0000  0.000000  0.000000
12%    0.000000  0.000000  0.000000  0.0939  0.000000  0.000000
175    0.000000  0.140850  0.000000  0.0000  0.000000  0.000000
18%    0.169020  0.000000  0.000000  0.0000  0.000000  0.000000
...         ...       ...       ...     ...       ...       ...
word   0.000000  0.000000  0.150515  0.0000  0.100343  0.000000
world  0.000000  0.000000  0.105637  0.0000  0.000000  0.000000
x      0.000000  0.140850  0.000000  0.0000  0.000000  0.000000
year   0.000000  0.000000  0.000000  0.0000  0.000000  0.105637
zero   0.507059  0.000000  0.000000  0.0000  0.000000  0.000000

[392 rows x 6 columns]

Query term matrix:
    1  10%  12%  175  18%  2  20  2019  2020  24%  ...  viewer  web  weight  \
q1  0    0    0    0    0  0   0     0     0    0  ...       0    0       0   
q2  0    0 