In [69]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
import os
from pathlib import Path
from typing import Tuple, Dict
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [71]:
def load_data_and_create_matrices(  train_file: Path,
                                    test_file: Path ) -> Tuple[sp.csr_matrix, sp.csr_matrix, Dict[int, int], Dict[int, int]]:

    raw_data = []

    def parse_file(filepath: Path, dataset_type: str) -> None:
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        with filepath.open("r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 2:
                    continue

                u_id = int(parts[0])
                items = [int(i) for i in parts[1:]]

                for i_id in items:
                    raw_data.append(
                        {
                            "user": u_id,
                            "item": i_id,
                            "type": dataset_type,  # "train" or "test"
                        }
                    )

    parse_file(train_file, "train")
    parse_file(test_file, "test")

    df = pd.DataFrame(raw_data)
    print(f"Loaded {len(df):,} total interactions.")

    df["user_idx"] = df["user"].astype("category").cat.codes
    df["item_idx"] = df["item"].astype("category").cat.codes

    # Internal index -> original ID
    user_map: Dict[int, int] = dict(zip(df["user_idx"], df["user"]))
    item_map: Dict[int, int] = dict(zip(df["item_idx"], df["item"]))

    n_users = len(user_map)
    n_items = len(item_map)

    print(f"Matrix dimensions: {n_users:,} users x {n_items:,} items")

    def build_csr(dataset_type: str) -> sp.csr_matrix:
        subset = df[df["type"] == dataset_type]

        rows = subset["user_idx"].values
        cols = subset["item_idx"].values
        data = np.ones(len(subset), dtype=np.float32)

        return sp.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

    train_matrix = build_csr("train")
    test_matrix = build_csr("test")

    print(f"Train nnz: {train_matrix.nnz:,} | Test nnz: {test_matrix.nnz:,}")

    return train_matrix, test_matrix, user_map, item_map

In [73]:
class ItemKNNRecommender:
    """
    Item-Based Collaborative Filtering.
    Logic: "Users who liked Item A also liked Item B."
    """
    def __init__(self, n_neighbors=20):
        self.n_neighbors = n_neighbors
        self.model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
        self.train_matrix = None
        self.item_vectors = None

    def fit(self, train_matrix):
        """
        Trains the KNN model.
        We want to calculate distance between ITEMS (rows).
        """
        print(f"Training Item-KNN (k={self.n_neighbors})...")
        self.train_matrix = train_matrix
        
        self.item_vectors = train_matrix.T
        
        # Fit the model on the Item Vectors
        self.model.fit(self.item_vectors)

    def predict(self, user_idx, top_k=20):
        """
        Predicts items based on the user's history.
        """
        # 1. Get User's History
        user_history_indices = self.train_matrix[user_idx].indices
        
        if len(user_history_indices) == 0:
            return np.array([])

        query_indices = user_history_indices

        query_vectors = self.item_vectors[query_indices]
        distances, neighbor_indices = self.model.kneighbors(query_vectors)

        candidate_scores = {}
        for i in range(len(query_indices)):
            for j in range(self.n_neighbors):
                neighbor_idx = neighbor_indices[i, j]
                similarity = 1.0 - distances[i, j]
                
                candidate_scores[neighbor_idx] = candidate_scores.get(neighbor_idx, 0) + similarity

        for seen_idx in user_history_indices:
            if seen_idx in candidate_scores:
                del candidate_scores[seen_idx]

        sorted_candidates = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
        top_indices = [idx for idx, score in sorted_candidates[:top_k]]
        
        return np.array(top_indices)

In [75]:
def ndcg_at_k(predicted_indices, true_indices, k=20):
    if len(true_indices) == 0:
        return 0.0
        
    # Slice to Top K
    top_k_preds = predicted_indices[:k]
    
    relevance = np.isin(top_k_preds, true_indices).astype(float)
    
    discounts = np.log2(np.arange(2, len(relevance) + 2))
    dcg = np.sum(relevance / discounts)
    
    # Ideal DCG
    ideal_relevance = np.ones(min(len(true_indices), k))
    ideal_discounts = np.log2(np.arange(2, len(ideal_relevance) + 2))
    idcg = np.sum(ideal_relevance / ideal_discounts)
    
    return dcg / idcg if idcg > 0 else 0.0

In [41]:
def knn_pipeline(train_matrix, test_matrix, user_map, item_map, top_k=20, n_neighbors=20):
    """
    Orchestrates the training and evaluation of the Item-KNN model.
    Returns: output_df, avg_ndcg_score
    """

    model = ItemKNNRecommender(n_neighbors=n_neighbors)
    model.fit(train_matrix)

    ndcg_scores = []
    output_rows = []

    test_users = np.unique(test_matrix.nonzero()[0])
    n_test_users = len(test_users)

    print(f"Evaluating {n_test_users:,} users")

    for i, user_idx in tqdm(enumerate(test_users, 1), total=n_test_users, desc="Predicting"):
        
        top_indices = model.predict(user_idx, top_k=top_k)
        
        true_items = test_matrix[user_idx].indices
        score = ndcg_at_k(top_indices, true_items, k=top_k)
        ndcg_scores.append(score)
        
        real_user_id = user_map.get(user_idx, f"User_{user_idx}")
        real_item_ids = [item_map.get(idx, f"Item_{idx}") for idx in top_indices]
        
        output_rows.append({
            'user_id': real_user_id,
            'recommended_items': real_item_ids
        })
        
        if i % 5000 == 0:
            current_avg = np.mean(ndcg_scores)
            tqdm.write(f"Processed {i} users. Current Avg NDCG: {current_avg:.4f}")

    avg_ndcg = np.mean(ndcg_scores)
    
    print(f"RESULTS:")
    print("="*40)
    print(f"Evaluated Users: {len(ndcg_scores):,}")
    print(f"Average NDCG@{top_k}: {avg_ndcg:.4f}")
    
    return pd.DataFrame(output_rows), avg_ndcg


In [43]:
K_VAL = 5

INPUT_DIR = Path('Test_Train_Data')

TRAIN_PATH = INPUT_DIR / f"data_k{K_VAL}_train.txt"
TEST_PATH = INPUT_DIR / f"data_k{K_VAL}_test.txt"

train_matrix, test_matrix, user_map, item_map = load_data_and_create_matrices(TRAIN_PATH, TEST_PATH)

Loaded 2,372,615 total interactions.
Matrix dimensions: 52,642 users x 88,416 items
Train nnz: 1,918,235 | Test nnz: 454,380


In [49]:
output_df, final_score = knn_pipeline(train_matrix, test_matrix, user_map, item_map, top_k=20, n_neighbors=20)

Training Item-KNN (k=20)...
Evaluating 52,642 users


Predicting:   9%|█████▉                                                         | 5000/52642 [12:03<1:34:56,  8.36it/s]

Processed 5000 users. Current Avg NDCG: 0.0830


Predicting:  19%|███████████▊                                                  | 10001/52642 [22:31<1:24:55,  8.37it/s]

Processed 10000 users. Current Avg NDCG: 0.0822


Predicting:  28%|█████████████████▋                                            | 15002/52642 [32:29<1:01:38, 10.18it/s]

Processed 15000 users. Current Avg NDCG: 0.0830


Predicting:  38%|████████████████████████▎                                       | 20000/52642 [42:32<51:49, 10.50it/s]

Processed 20000 users. Current Avg NDCG: 0.0885


Predicting:  47%|█████████████████████████████▍                                | 25001/52642 [52:01<1:07:31,  6.82it/s]

Processed 25000 users. Current Avg NDCG: 0.0935


Predicting:  57%|███████████████████████████████████▎                          | 30002/52642 [1:01:37<38:05,  9.90it/s]

Processed 30000 users. Current Avg NDCG: 0.1006


Predicting:  66%|█████████████████████████████████████████▏                    | 35001/52642 [1:10:45<35:55,  8.18it/s]

Processed 35000 users. Current Avg NDCG: 0.1078


Predicting:  76%|███████████████████████████████████████████████               | 40001/52642 [1:19:41<20:47, 10.13it/s]

Processed 40000 users. Current Avg NDCG: 0.1146


Predicting:  85%|█████████████████████████████████████████████████████         | 45002/52642 [1:28:28<13:27,  9.46it/s]

Processed 45000 users. Current Avg NDCG: 0.1206


Predicting:  95%|██████████████████████████████████████████████████████████▉   | 50001/52642 [1:37:05<04:33,  9.66it/s]

Processed 50000 users. Current Avg NDCG: 0.1263


Predicting: 100%|██████████████████████████████████████████████████████████████| 52642/52642 [1:41:24<00:00,  8.65it/s]

RESULTS:
Evaluated Users: 52,642
Average NDCG@20: 0.1292





In [85]:
def save_recommendations(df: pd.DataFrame, filename_base: str = "recommendations", output_dir: str = "."):

    os.makedirs(output_dir, exist_ok=True)
    
    csv_path = os.path.join(output_dir, f"{filename_base}.csv")
    df.to_csv(csv_path, index=False)

    txt_path = os.path.join(output_dir, f"{filename_base}.txt")
    
    with open(txt_path, 'w') as f:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Writing TXT"):
            user_id = row['user_id']
            items_str = " ".join(map(str, row['recommended_items']))
            f.write(f"{user_id} {items_str}\n")
            
    print(f"Saved results as TXT")

In [87]:
save_recommendations(output_df, filename_base="results")

Writing TXT: 100%|█████████████████████████████████████████████████████████████| 52642/52642 [00:07<00:00, 7439.95it/s]

Saved results as TXT



