In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os

# Make the parent of the current working directory importable so that the
# project-local `src` package imported below can be resolved.
# The path is relative to the working directory the kernel was started in.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.inforet import download_and_extract, load_split, batched_topk_indices, precision_at_k


In [3]:
# NOTE(review): presumably fetches and unpacks the dataset if it is not
# already on disk — confirm in src.inforet.
download_and_extract()

# Each split is unpacked as a (documents, labels) pair. The train split is
# what gets indexed; the test split supplies the queries.
(X_train, y_train), (X_test, y_test) = (
    load_split(split_name) for split_name in ("train", "test")
)

print(f"Train: {len(X_train)} docs | Test (queries): {len(X_test)} docs")

Train: 25000 docs | Test (queries): 25000 docs


In [4]:
# Learn the TF-IDF vocabulary and IDF weights from the training documents
# only, then map the test queries into that same feature space.
vectorizer = TfidfVectorizer(
    max_features=50,       # cap the vocabulary at the 50 most frequent terms
    stop_words="english",  # drop scikit-learn's built-in English stop words
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Number of training documents retrieved for every test query.
K = 10

# One row per test query; each entry is a row index into the training set.
# NOTE(review): the similarity measure and the ordering within a row are
# decided inside batched_topk_indices — confirm in src.inforet.
topk = batched_topk_indices(
    X_train_tfidf,
    X_test_tfidf,
    k=K,
)

print(topk)

[[11568  2879  5527 ... 23234 23726 10764]
 [15573 18477  4416 ...  2434 13186 19149]
 [10211  3710 14641 ...   413 21732 10747]
 ...
 [11661 12649 10275 ... 14729 15870 18551]
 [ 9203 10155 22374 ... 10650  7909  1264]
 [21417  2683 22143 ...  4680 22838 19984]]


In [6]:
# Element-wise selectors over the test queries, split by ground-truth label
# (label 1 is reported as "pos" further down, label 0 as "neg").
neg_mask = y_test == 0
pos_mask = y_test == 1

# Precision@K over all test queries at once.
overall_p_at_k = precision_at_k(topk, y_train, y_test)

In [7]:
# Same metric as the overall score, restricted first to the positive queries
# and then to the negative ones (evaluated in that order).
p_at_k_pos, p_at_k_neg = (
    precision_at_k(topk[mask], y_train, y_test[mask])
    for mask in (pos_mask, neg_mask)
)

In [8]:
def _sentiment_name(label):
    """Return "pos" for label 1 and "neg" for any other label."""
    return "pos" if label == 1 else "neg"


# Label-match precision: overall, then per query class, one line each.
print(
    f"\nPrecision@{K} (label match):",
    f" Overall: {overall_p_at_k:.4f}",
    f" Pos queries -> Pos retrieved: {p_at_k_pos:.4f}",
    f" Neg queries -> Neg retrieved: {p_at_k_neg:.4f}",
    sep="\n",
)

# Optional: inspect one query next to the labels of the documents retrieved
# for it. Only the first 200 characters of the query text are shown, with
# newlines flattened to spaces.
qi = 0
retrieved = topk[qi]
print("\nExample query:")
print(" Query label: ", _sentiment_name(y_test[qi]))
print(" Query text: ", X_test[qi][:200].replace("\n", " "), "...")
print(" Top retrieved labels: ", [_sentiment_name(y_train[doc_idx]) for doc_idx in retrieved])



Precision@10 (label match):
 Overall: 0.5844
 Pos queries -> Pos retrieved: 0.5806
 Neg queries -> Neg retrieved: 0.5881

Example query:
 Query label:  pos
 Query text:  I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. ...
 Top retrieved labels:  ['pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos']


## Exercise 1:
