In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os
from sklearn.utils import shuffle

sys.path.append(os.path.abspath(os.path.join('..')))

from src.inforet import download_and_extract, load_split, batched_topk_indices, precision_at_k, predicted_sentiment


In [2]:
download_and_extract()

X_train, y_train = load_split("train")
X_test, y_test = load_split("test")

print(f"Train: {len(X_train)} docs | Test (queries): {len(X_test)} docs")

Train: 25000 docs | Test (queries): 25000 docs


In [3]:
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [4]:
# TF-IDF index on TRAIN
vectorizer = TfidfVectorizer(stop_words="english", max_features=50)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Evaluate retrieval-by-sentiment
K = 10
topk = batched_topk_indices(X_train_tfidf, X_test_tfidf, k = K)

print(topk)

[[ 2586  5570  6341 ...  1177 20747  3629]
 [15341 14828  3518 ...  1170 17333 21044]
 [21090   125 15619 ...  8322 24278  7710]
 ...
 [ 9703 15964  6302 ... 12837 16021 10163]
 [22054 12922  1909 ...  8581 22278 14232]
 [14999  2216 15857 ...  7676 15024 18175]]


In [6]:
overall_p_at_k = precision_at_k(topk, y_train, y_test)

pos_mask = (y_test == 1)
neg_mask = (y_test == 0)

In [7]:
p_at_k_pos = precision_at_k(topk[pos_mask], y_train, y_test[pos_mask])
p_at_k_neg = precision_at_k(topk[neg_mask], y_train, y_test[neg_mask])

In [8]:
print(f"\nPrecision@{K} (label match):")
print(f" Overall: {overall_p_at_k:.4f}")
print(f" Pos queries -> Pos retrieved: {p_at_k_pos:.4f}")
print(f" Neg queries -> Neg retrieved: {p_at_k_neg:.4f}")

# Optional : show one example query + retrieved labels

qi = 0
retrieved = topk[qi]
print("\nExample query:")
print(" Query label: ", "pos" if y_test[qi] == 1 else "neg")
print(" Query text: ", X_test[qi][:200].replace("\n", " "), "...")
print(" Top retrieved labels: ", ["pos" if y_train[i] == 1 else "neg" for i in retrieved])



Precision@10 (label match):
 Overall: 0.5846
 Pos queries -> Pos retrieved: 0.5808
 Neg queries -> Neg retrieved: 0.5884

Example query:
 Query label:  pos
 Query text:  I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. ...
 Top retrieved labels:  ['pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos']


## Exercise 1:
To find the score of a review we could use:
 - a similarity score : how simmilar the new query is to current positive/negative reviews.
 - predicterd sentiments: a 0 or 1 label based on the majority/weighted average of the top_k neighbors

In [9]:
score = predicted_sentiment("", vectorizer, X_train_tfidf, y_train)
score

np.float64(0.5454545454545454)

In [10]:
debug_query = vectorizer.transform(["This movie was an absolute masterpiece"])
print(f"Non-zero elements: {debug_query.nnz}")

Non-zero elements: 1


In [11]:
# Create a vector for your query
query_vec = vectorizer.transform(["This movie was an absolute masterpiece !"])

# Get the indices manually
indices = batched_topk_indices(X_train, query_vec, k=10)

print(f"Retrieved Indices: {indices}")
print(f"Labels of neighbors: {y_train[indices[0]]}")

# Check the similarity scores
# (We manually calc dot product for these specific neighbors to see if they are 0)
sims = X_train[indices[0]].dot(query_vec.T).toarray()
print(f"Similarities:\n{sims}")

AttributeError: 'list' object has no attribute 'shape'