In [104]:
import pandas as pd
import scipy.sparse as sp
import numpy as np
from pathlib import Path
from typing import Tuple, Dict

In [106]:
# Default minimum-interaction threshold used to pick the split files.
# It must be defined before the f-strings below are evaluated, otherwise this
# cell raises NameError on a fresh kernel (Restart & Run All). The experiment
# sections further down reassign K_VAL and rebuild the paths.
K_VAL = 2

INPUT_DIR = Path('Test_Train_Data')

TRAIN_PATH = INPUT_DIR / f"data_k{K_VAL}_train.txt"
TEST_PATH = INPUT_DIR / f"data_k{K_VAL}_test.txt"

In [108]:
def load_data_and_create_matrices(
    train_file: Path,
    test_file: Path,
) -> Tuple[sp.csr_matrix, sp.csr_matrix, Dict[int, int], Dict[int, int]]:
    """Load train/test interaction files into aligned binary CSR matrices.

    Each non-empty line of an input file is whitespace-separated integers:
    the first token is a user ID and every following token is an item ID that
    user interacted with. Lines with fewer than two tokens are skipped.

    User and item IDs from *both* files are mapped to contiguous internal
    indices (assigned in ascending ID order), so the two returned matrices
    share the same shape and the same row/column meaning.

    Args:
        train_file: Path to the training interactions file.
        test_file: Path to the test interactions file.

    Returns:
        A tuple ``(train_matrix, test_matrix, user_map, item_map)`` where the
        matrices are ``float32`` CSR matrices of shape ``(n_users, n_items)``
        holding 1.0 for every observed (user, item) pair, and the maps go
        from internal index to original ID.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If a token is not an integer, or if the two files
            together contain no interactions at all.
    """
    # Flat, parallel lists are far lighter than one dict per interaction.
    users: list = []
    items: list = []

    def parse_file(filepath: Path) -> None:
        """Append every (user, item) pair found in ``filepath``."""
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        with filepath.open("r") as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue

                u_id = int(parts[0])
                line_items = [int(token) for token in parts[1:]]

                users.extend([u_id] * len(line_items))
                items.extend(line_items)

    # Accept plain strings as well as Path objects.
    train_file = Path(train_file)
    test_file = Path(test_file)

    parse_file(train_file)
    n_train = len(users)  # everything appended after this belongs to "test"
    parse_file(test_file)

    print(f"Loaded {len(users):,} total interactions.")

    if not users:
        raise ValueError(
            f"No interactions found in {train_file} or {test_file}"
        )

    # np.unique returns the sorted distinct IDs plus, for every interaction,
    # the position of its ID in that sorted array -- i.e. the internal index.
    user_values, user_idx = np.unique(np.asarray(users), return_inverse=True)
    item_values, item_idx = np.unique(np.asarray(items), return_inverse=True)

    # Internal index -> original ID
    user_map: Dict[int, int] = dict(enumerate(user_values.tolist()))
    item_map: Dict[int, int] = dict(enumerate(item_values.tolist()))

    n_users = len(user_map)
    n_items = len(item_map)

    print(f"Matrix dimensions: {n_users:,} users x {n_items:,} items")

    def build_csr(rows: np.ndarray, cols: np.ndarray) -> sp.csr_matrix:
        """Build a binary interaction matrix from index pairs."""
        data = np.ones(len(rows), dtype=np.float32)
        matrix = sp.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))
        # Repeated (user, item) pairs are summed by SciPy; collapse them and
        # reset the values so each stored entry is exactly 1.0.
        matrix.sum_duplicates()
        matrix.data[:] = 1.0
        return matrix

    train_matrix = build_csr(user_idx[:n_train], item_idx[:n_train])
    test_matrix = build_csr(user_idx[n_train:], item_idx[n_train:])

    print(f"Train nnz: {train_matrix.nnz:,} | Test nnz: {test_matrix.nnz:,}")

    return train_matrix, test_matrix, user_map, item_map


In [110]:
import numpy as np
import pandas as pd

def evaluate_popularity_model(train_matrix, test_matrix, user_map, item_map, top_k=20):
    """Evaluate a global-popularity recommender with NDCG@K.

    Item popularity is the column sum of ``train_matrix``. Every user that
    has at least one non-zero entry in ``test_matrix`` is recommended the
    ``top_k`` most popular items they have not interacted with in training.
    Popularity ties are broken deterministically in favour of the lower
    internal item index.

    Args:
        train_matrix: SciPy sparse user x item matrix of training interactions.
        test_matrix: SciPy sparse user x item matrix of held-out interactions,
            with the same shape and index meaning as ``train_matrix``.
        user_map: Mapping from internal user index to original user ID.
        item_map: Mapping from internal item index to original item ID.
        top_k: Number of items to recommend per user (must be >= 1).

    Returns:
        ``(recommendations, avg_ndcg)`` where ``recommendations`` is a
        DataFrame with columns ``user_id`` and ``recommended_items`` (a list
        of original item IDs, best first; shorter than ``top_k`` only when the
        user has fewer than ``top_k`` unseen items) and ``avg_ndcg`` is the
        mean NDCG@K over the evaluated users (``nan`` if there are none).

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    print("Calculating item popularity")

    train_csr = train_matrix.tocsr()
    test_csr = test_matrix.tocsr()

    # float64 keeps the negation below safe for any input dtype.
    global_item_scores = np.asarray(train_csr.sum(axis=0)).ravel().astype(np.float64)

    # The popularity ranking is identical for every user, so sort once.
    popularity_order = np.argsort(-global_item_scores, kind="stable")

    # Position discounts 1/log2(rank + 1) for ranks 1..top_k.
    discounts = 1.0 / np.log2(np.arange(2, top_k + 2))

    # 2. Evaluation Loop
    ndcg_scores = []
    output_rows = []

    test_users = np.unique(test_csr.nonzero()[0])

    print(f"Processing {len(test_users):,} users with test data")

    for user_idx in test_users:
        row_start, row_end = train_csr.indptr[user_idx], train_csr.indptr[user_idx + 1]
        known_indices = train_csr.indices[row_start:row_end]

        # The first top_k + len(known) ranked items always contain the top_k
        # unseen ones (when that many exist), so the rest need not be scanned.
        candidates = popularity_order[: top_k + len(known_indices)]
        top_indices = candidates[~np.isin(candidates, known_indices)][:top_k]

        row_start, row_end = test_csr.indptr[user_idx], test_csr.indptr[user_idx + 1]
        true_items = test_csr.indices[row_start:row_end]

        relevance = np.isin(top_indices, true_items).astype(np.float64)

        dcg = float(np.sum(relevance * discounts[: relevance.size]))
        idcg = float(np.sum(discounts[: min(len(true_items), top_k)]))
        score = (dcg / idcg) if idcg > 0 else 0.0

        ndcg_scores.append(score)

        output_rows.append({
            'user_id': user_map[int(user_idx)],
            'recommended_items': [item_map[int(i)] for i in top_indices],
        })

    # Averaging an empty list would emit a RuntimeWarning; report nan quietly.
    avg_ndcg = float(np.mean(ndcg_scores)) if ndcg_scores else float("nan")
    print("RESULT:")
    print(f"Average NDCG@{top_k}: {avg_ndcg:.4f}")

    recommendations = pd.DataFrame(output_rows, columns=['user_id', 'recommended_items'])
    return recommendations, avg_ndcg

## Min Interactions = 2

In [113]:
# Experiment setup for the split whose file names carry the "k2" tag.
K_VAL = 2
INPUT_DIR = Path("Test_Train_Data")

# Train and test files follow one naming scheme; only the suffix differs.
TRAIN_PATH, TEST_PATH = (
    INPUT_DIR / f"data_k{K_VAL}_{split_name}.txt" for split_name in ("train", "test")
)

# Rebinds the shared matrices and ID maps used by the evaluation cell below.
(train_matrix, test_matrix, user_map, item_map) = load_data_and_create_matrices(
    TRAIN_PATH,
    TEST_PATH,
)

Loaded 2,379,949 total interactions.
Matrix dimensions: 52,643 users x 90,818 items
Train nnz: 1,924,114 | Test nnz: 455,835


In [114]:
# Score the popularity baseline on the matrices loaded in the previous cell.
output_df, score = evaluate_popularity_model(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    user_map=user_map,
    item_map=item_map,
    top_k=20,
)

Calculating item popularity
Processing 52,643 users with test data
RESULT:
Average NDCG@20: 0.0079


## Min Interactions = 3

In [116]:
# Experiment setup for the split whose file names carry the "k3" tag.
K_VAL = 3
INPUT_DIR = Path("Test_Train_Data")

# Train and test files follow one naming scheme; only the suffix differs.
TRAIN_PATH, TEST_PATH = (
    INPUT_DIR / f"data_k{K_VAL}_{split_name}.txt" for split_name in ("train", "test")
)

# Rebinds the shared matrices and ID maps used by the evaluation cell below.
(train_matrix, test_matrix, user_map, item_map) = load_data_and_create_matrices(
    TRAIN_PATH,
    TEST_PATH,
)

Loaded 2,378,453 total interactions.
Matrix dimensions: 52,643 users x 90,070 items
Train nnz: 1,922,908 | Test nnz: 455,545


In [117]:
# Score the popularity baseline on the matrices loaded in the previous cell.
output_df, score = evaluate_popularity_model(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    user_map=user_map,
    item_map=item_map,
    top_k=20,
)

Calculating item popularity
Processing 52,643 users with test data
RESULT:
Average NDCG@20: 0.0080


## Min Interactions = 5

In [119]:
# Experiment setup for the split whose file names carry the "k5" tag.
K_VAL = 5
INPUT_DIR = Path("Test_Train_Data")

# Train and test files follow one naming scheme; only the suffix differs.
TRAIN_PATH, TEST_PATH = (
    INPUT_DIR / f"data_k{K_VAL}_{split_name}.txt" for split_name in ("train", "test")
)

# Rebinds the shared matrices and ID maps used by the evaluation cell below.
(train_matrix, test_matrix, user_map, item_map) = load_data_and_create_matrices(
    TRAIN_PATH,
    TEST_PATH,
)

Loaded 2,372,615 total interactions.
Matrix dimensions: 52,642 users x 88,416 items
Train nnz: 1,918,235 | Test nnz: 454,380


In [120]:
# Score the popularity baseline on the matrices loaded in the previous cell.
output_df, score = evaluate_popularity_model(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    user_map=user_map,
    item_map=item_map,
    top_k=20,
)

Calculating item popularity
Processing 52,642 users with test data
RESULT:
Average NDCG@20: 0.0081
