## Imports

In [77]:

import pandas as pd
import numpy as np
import random as rnd
from typing import Callable
import cupy as cp
import torch
from torch import nn
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pickle
import os
from sklearn.metrics import ndcg_score



## Initialization variables
# t_reduce_data_size responsible for reducing datasize and feasible for testing training and best hyperparameters training 
# reduce_data_size_n_times change the scale factor to reduce data size. Important: Some user ids for submission might be missing due to reduces size
# t_to_save_models if you want to save your results make it true
# t_to_load_models if you want to load your results make it true

In [78]:
t_reduce_data_size = True
reduce_data_size_n_times = 10
t_to_save_models = False
t_to_load_models = False
path_to_save_model = r'C:\Users\azatv\Jupyter\JupyterProjects\Learning User-Generated Data\recommendations\rec_models.pkl'
path_to_load_model = r'C:\Users\azatv\Jupyter\JupyterProjects\Learning User-Generated Data\recommendations\rec_models.pkl'
path_to_user_ids = r"C:\Users\azatv\Jupyter\JupyterProjects\Learning User-Generated Data\lfm-challenge\test_indices.txt"

## All the fucntions

In [79]:
def get_dcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK: int = 10) -> float:
    """
    predictions - 2D np.ndarray, predictions of the recommendation algorithm for each user;
    test_interaction_matrix - 2D np.ndarray, test interaction matrix for each user;
    
    returns - float, mean dcg score over all user;
    """
    num_users = predictions.shape[0]
    dcgs = np.zeros(num_users)

    for user_id in range(num_users):
        gains = np.zeros(topK)
        
        for rank in range(min(topK, predictions.shape[1])):
            item_id = predictions[user_id, rank]
            relevance = test_interaction_matrix[user_id, item_id]
            # 1/log(rank+1)
            if relevance == 1:
                if rank == 0:
                    gains[rank] = 1
                else:
                    gains[rank] = 1 / np.log2(rank + 1)
            
        dcgs[user_id] = np.sum(gains)

    mean_dcg = np.mean(dcgs)
    return mean_dcg

def get_ndcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    predictions - np.ndarray, predictions of the recommendation algorithm for each user;
    test_interaction_matrix - np.ndarray, test interaction matrix for each user;
    topK - int, topK recommendations should be evaluated;
    
    returns - float, average ndcg score over all users;
    """
    num_users = predictions.shape[0]
    ndcgs = np.zeros(num_users)

    for user_id in range(num_users):
        user_predictions = predictions[user_id, :topK]
        relevances = test_interaction_matrix[user_id, user_predictions]

        actual_topK = min(topK, len(user_predictions))
        
        if actual_topK > 1:
            discounts = np.log2(np.arange(2, actual_topK + 1))
            dcg = relevances[0] + np.sum(relevances[1:actual_topK] / discounts)
        else:
            dcg = relevances[0]

        sorted_relevances = np.sort(test_interaction_matrix[user_id])[-actual_topK:][::-1]
        if actual_topK > 1:
            idcg = sorted_relevances[0] + np.sum(sorted_relevances[1:actual_topK] / discounts)
        else:
            idcg = sorted_relevances[0] 
        
        if idcg == 0:
            ndcgs[user_id] = 0
        else:
            ndcgs[user_id] = dcg / idcg
    
    average_ndcg = np.mean(ndcgs)
    return average_ndcg


# REC Class
def inter_matr_implicit(users: pd.DataFrame,
                        items: pd.DataFrame,
                        interactions: pd.DataFrame,
                        dataset_name: str,
                        threshold=1) -> np.ndarray:
    res = None

    interactions = interactions.copy()

    # getting number of users and items from the respective files to be on the safe side
    n_users = len(users.index)
    n_items = len(items.index)
    print(f"users.index: {users.index}")
    print(f"items.index: {items.index}")
    # preparing the output matrix
    res = np.zeros([n_users, n_items], dtype=np.int8)
    print(f"res shape: {np.shape(res)}")
    # for every interaction assign 1 to the respective element of the matrix
    if dataset_name == 'lfm-ismir':
        inter_column_name = 'listening_events'
    elif dataset_name == 'ml-1m':
        inter_column_name = 'rating'
    elif dataset_name == 'lfm-tiny-tunes':
        inter_column_name = 'listening_events'
    elif dataset_name == 'lfm-challenge':
        inter_column_name = 'count'
    else:
        raise ValueError(f"Invalid dataset name: {dataset_name} ")
    print(f"dataset name: {inter_column_name}")
    print(f"user ids pd:{interactions['user_id'].head(10)}")
    print(f"item ids pd:{interactions['item_id'].head(10)}")
    print(f"listening_events pd:{interactions[inter_column_name].head(10)}")
    
    row = interactions["user_id"].to_numpy()
    col = interactions["item_id"].to_numpy()
    print(f"row user ids np:{row[:10]} shape: {np.shape(row)}")
    print(f"col item ids np:{col[:10]} shape: {np.shape(col)}")
    data = interactions[inter_column_name].to_numpy()
    print(f"listening_events np:{data[:10]}")

    data[data < threshold] = 0
    data[data >= threshold] = 1

    
    res[row, col] = data

    return res


def recTopKPop(inter_matr: np.ndarray,
               user: int,
               top_k: int) -> np.array:
    '''
    inter_matr - np.ndarray, from the task 1;
    user - int, user_id;
    top_k - int, expected length of the resulting list;

    returns - list/array, of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''


    top_pop = None

    item_pop = inter_matr.sum(axis=0)

    items_seen = np.nonzero(inter_matr[user])

    item_pop[items_seen] = 0

    top_pop = np.full((top_k,), -1)

    t_pop = (-item_pop).argsort()[:top_k]
    top_pop[:len(t_pop)] = t_pop

    return top_pop


def svd_decompose(inter_matr: np.ndarray, f: int = 50) -> (np.ndarray, np.ndarray):
    """
    inter_matr - np.ndarray, interaction matrix to construct svd from;
    f - int, expected size of embeddings;

    returns - 2D np.ndarray, U_final &  2D np.ndarray, V_final (as above) user-/item-embeddings of given length f;
    """

    U_final = None
    V_final = None


    U, s, Vh = np.linalg.svd(inter_matr, full_matrices=False)
    U_final = U[:, :f] @ np.diag(s[:f] ** 0.5)  # users x features
    V_final = (np.diag(s[:f] ** 0.5) @ Vh[:f, :]).T  # items x features

    return U_final, V_final


def svd_recommend_to_list(user_id: int, seen_item_ids: list, U: np.ndarray, V: np.ndarray, topK: int) -> np.ndarray:
    """
    Recommend with svd to selected users

    user_id - int, id of target user;
    seen_item_ids - list[int], ids of items already seen by the users (to exclude from recommendation);
    U and V - 2D np.ndarray & 2D np.ndarray, user- and item-embeddings;
    topK - int, number of recommendations per user to be returned;

    returns - np.ndarray, list of ids of recommended items in the order of descending score
                           use -1 as a place holder item index, when it is impossible to recommend topK items;
    """
    recs = None

    scores = U @ V.T
    u_scores = scores[user_id]
    u_scores[seen_item_ids] = -np.inf
    m = min(topK, scores.shape[1])
    recs = (-u_scores).argsort()[:m]

    return np.array(recs)


def jaccard_score(a: np.ndarray, b: np.ndarray) -> float:
    """
    a, b - 1D np.ndarray, vectors of the same length corresponding to the two items;

    returns - float, jaccard similarity score for a and b;
    """
    score = None
    c = a + b
    intersection = np.zeros_like(c)
    intersection[c > 1] = 1
    union = np.zeros_like(c)
    union[c >= 1] = 1

    score = np.sum(intersection) / np.sum(union)

    return float(score)


def calculate_sim_scores(similarity_measure: Callable[[np.ndarray, np.ndarray], float],
                         inter: np.ndarray,
                         target_vec: np.ndarray) -> np.ndarray:
    """
    similarity_measure - Callable, function that measures similarity, it gets called using your jaccard_score function from above - as always do not directly call your function, but use the passed parameter;
    inter - np.ndarray, interaction matrix - calculate similarity between each item and the target item (see below);
    target_vec - np.ndarray, target item vector;

    returns - np.ndarray, similarities between every item from <inter> and <target_vec> in the respective order;
    """

    item_similarities = None
    item_similarities = np.zeros((inter.shape[1],))

    for item in range(inter.shape[1]):
        inter_items = inter[:, item]
        item_similarities[item] = similarity_measure(inter_items, target_vec)

    return np.array(item_similarities)


def get_user_item_score(sim_scores_calculator: Callable[[Callable, np.array, np.array], np.array],
                        inter: np.array,
                        target_user: int,
                        target_item: int,
                        n: int = 2) -> float:
    """
    sim_scores_calculator - Callable, function that calculates similarities, using calculate_sim_scores
                                      from above, already defined in the next cell;
    inter - np.ndarray, interaction matrix;
    target_user - int, target user id;
    target_item - int, target item id;
    n - int, n closest neighbors to consider for the score prediction;

    returns - float, mean of similarity scores = user-item 'fitness' score;
    """

    item_similarities_mean = None
    inter_pred = inter.copy()

    # Get all items which were consumed by the user.
    item_consumed_by_user = inter_pred[target_user, :] == 1
    item_consumed_by_user[target_item] = False

    # get column of the target_item.
    inter_target_item = inter_pred[:, target_item]

    # create a mask to remove the user from the interaction matrix.
    not_user = np.full((inter_pred.shape[0],), True)
    not_user[target_user] = False

    # remove items not interacted with user
    inter_pred = inter_pred[:, item_consumed_by_user]

    # remove user
    inter_pred = inter_pred[not_user]
    inter_target_item = inter_target_item[not_user]

    # get closest items to target_item, which is at the last indices.
    scores = sim_scores_calculator(inter_pred, inter_target_item)

    # get items with the highes scores.
    scores_ids = np.argsort((- scores))
    scores = scores[scores_ids]

    scores = scores[:n]

    if len(scores) > 0:
        # calculate mean of normed scores.
        item_similarities_mean = scores.mean()
    else:
        item_similarities_mean = 0.0

    return item_similarities_mean


def sim_score_calc(inter, target_vec): return calculate_sim_scores(jaccard_score, inter, target_vec)


def user_item_scorer(inter, target_user, target_item, n): return get_user_item_score(sim_score_calc, inter,
                                                                                     target_user, target_item, n)


def _recTopK_base(user_item_scorer: Callable[[Callable, np.array, int, int], float],
                  inter_matr: np.array,
                  user: int,
                  top_k: int,
                  n: int) -> (np.array, np.array):
    '''
    user_item_scorer - Callable, wrapper function that calculates user-item score, using get_user_item_score function
                                 from above, already defined in the next cell;
    inter_matr - np.ndarray, interaction matrix;
    user - int,  user_id;
    top_k - int, expected length of the resulting list;
    n - int, number of neighbors to consider;

    returns - 1D np.ndarray, of recommendations (sorted in the order of descending scores) & 1D np.ndarray, of corresponding scores;
    '''

    top_rec = None
    scores = None

    

    scores = np.zeros((inter_matr.shape[1],))

    for item in range(inter_matr.shape[1]):
        if inter_matr[user, item] == 0:
            score = user_item_scorer(inter_matr, user, item, n)
            scores[item] = score

    top_rec = (- scores).argsort()[:top_k]
    scores = scores[top_rec]

    return np.array(top_rec), np.array(scores)


def recTopK(inter_matr: np.array,
            user: int,
            top_k: int,
            n: int) -> (np.array, np.array):
    return _recTopK_base(user_item_scorer, inter_matr, user, top_k, n)[0]


def reduce_matrix_size(matrix, reduction_factor=10):
    n_rows, n_cols = matrix.shape
    row_indices = np.random.choice(n_rows, n_rows // reduction_factor, replace=False)
    col_indices = np.random.choice(n_cols, n_cols // reduction_factor, replace=False)
    return matrix[np.ix_(row_indices, col_indices)]


def get_recommendations_for_algorithms(config: dict) -> dict:
    """
    config - dict, configuration as defined above;

    returns - dict, already predefined below with name "rec_dict";
    """

    #use this structure to return results
    rec_dict = {"recommenders": {
        "SVD": {
            #Add your predictions here
            "predictions": []
        },
        "ItemKNN": {
            "predictions": []
        },
        "TopPop": {
            "predictions": []
        },
    }}

    # SVD 
    try:
        U_final, V_final = svd_decompose(config['train_inter'], config['recommenders']['SVD']['n_factors'])
        for user_id in tqdm (range(config['train_inter'].shape[0]), desc="SVD..."):
            seen_item_ids = np.where(config['train_inter'][user_id] > 0)[0] 
            recommendations = svd_recommend_to_list(user_id, seen_item_ids, U_final, V_final, config['top_k'])
            rec_dict['recommenders']['SVD']['predictions'].append([recommendations])
        rec_dict['recommenders']['SVD']['predictions'] = np.vstack(rec_dict['recommenders']['SVD']['predictions'])
    except Exception as e:
        print("SVD Decomposition Failed:", str(e))

    # ItemKNN 
    try:
        for user_id in tqdm (range(config['train_inter'].shape[0]), desc="ItemKNN..."):
            recommendations = recTopK(config['train_inter'], user_id, config['top_k'], config['recommenders']['ItemKNN']['n_neighbours'])
            rec_dict['recommenders']['ItemKNN']['predictions'].append(recommendations)
        rec_dict['recommenders']['ItemKNN']['predictions'] = np.vstack(rec_dict['recommenders']['ItemKNN']['predictions'])
    except Exception as e:
        print("ItemKNN Recommendation Failed:", str(e))

    # TopPop 
    try:
        for user_id in tqdm (range(config['train_inter'].shape[0]), desc="TopPop..."):
            recommendations = recTopKPop(config['train_inter'], user_id, config['top_k'])
            rec_dict['recommenders']['TopPop']['predictions'].append(recommendations)
        rec_dict['recommenders']['TopPop']['predictions'] = np.vstack(rec_dict['recommenders']['TopPop']['predictions'])
    except Exception as e:
        print("TopPop Recommendation Failed:", str(e))
    if t_to_save_models: 
        with open(path_to_save_model, 'wb') as f:
            pickle.dump(rec_dict['recommenders'], f)

    return rec_dict

# Loading the models when needed
def load_models(filepath: str) -> dict:
    with open(filepath, 'rb') as f:
        models = pickle.load(f)
    return models


# Load the list of user IDs from the provided file, skipping the header
def load_test_user_ids(filepath):
    with open(filepath, 'r') as file:
        next(file)  # Skip the header line
        test_user_ids = [int(line.strip()) for line in file]
    return test_user_ids

def save_submission_files(config: dict, recommendations: dict, test_user_ids: list, matr_num: str, name: str):
    # Create the recommendations TSV file
    tsv_filename = f"rec_{matr_num}_{name}.tsv"
    with open(tsv_filename, 'w') as tsv_file:
        for user_id in test_user_ids:
            user_recommendations = recommendations['recommenders']['ItemKNN']['predictions'][user_id]
            user_recommendations_str = ','.join(map(str, user_recommendations))
            tsv_file.write(f"{user_id}\t{user_recommendations_str}\n")

    print(f"Submission files created: {tsv_filename}")


# def save_submission_files(config: dict, recommendations: dict, matr_num: str, name: str):
#     # Get the list of test users from the config
#     test_users = np.where(config['test_inter'].sum(axis=1) > 0)[0]

#     # Select the best recommender (ItemKNN in this case)
#     best_recommender = 'ItemKNN'
    
#     # Create the recommendations TSV file
#     tsv_filename = f"rec_{matr_num}_{name}.tsv"
#     with open(tsv_filename, 'w') as tsv_file:
#         for user_id in test_users:
#             user_recommendations = recommendations['recommenders'][best_recommender]['predictions'][user_id]
#             user_recommendations_str = ','.join(map(str, user_recommendations))
#             tsv_file.write(f"{user_id}\t{user_recommendations_str}\n")
#     if t_to_create_txt_file:
#         # Create the report TXT file
#         txt_filename = f"report_{matr_num}_{name}.txt"
#         with open(txt_filename, 'w') as txt_file:
#             txt_file.write(f"Matrix Number: {matr_num}\nName: {name}\n\n")
#             txt_file.write("Approach:\n")
#             txt_file.write(f"{best_recommender}:\n")
#             txt_file.write(f"Hyperparameters: {config['recommenders'][best_recommender]}\n")
#             txt_file.write(f"Description: Description of the approach used for {best_recommender}.\n\n")

#     print(f"Submission files created: {tsv_filename}")

# def read(dataset, file):
#     return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')




def get_recommendations_for_svd(config: dict) -> dict:
    """
    config - dict, configuration as defined above;

    returns - dict, already predefined below with name "rec_dict";
    """

    # Use this structure to return results
    rec_dict = {"recommenders": {
        "SVD": {
            # Add your predictions here
            "predictions": [],
            "n_factors": config['recommenders']['SVD']['n_factors']
        }
    }}

    # SVD 
    try:
        U_final, V_final = svd_decompose(config['train_inter'], config['recommenders']['SVD']['n_factors'])
        for user_id in tqdm(range(config['train_inter'].shape[0]), desc="SVD..."):
            seen_item_ids = np.where(config['train_inter'][user_id] > 0)[0]
            recommendations = svd_recommend_to_list(user_id, seen_item_ids, U_final, V_final, config['top_k'])
            rec_dict['recommenders']['SVD']['predictions'].append(recommendations)
        rec_dict['recommenders']['SVD']['predictions'] = np.array(rec_dict['recommenders']['SVD']['predictions'])
    except Exception as e:
        print("SVD Decomposition Failed:", str(e))

    with open(f"rec_svd_{config['recommenders']['SVD']['n_factors']}.pkl", 'wb') as f:
        pickle.dump(rec_dict['recommenders'], f)

    return rec_dict

def evaluate_svd(config: dict) -> float:
    """
    config - dict, configuration as defined above;

    returns - float, nDCG score for the SVD recommender;
    """

    svd_predictions = config['recommenders']['SVD']['predictions']
    svd_ndcg = get_ndcg_score(svd_predictions, config['test_inter'], config['top_k'])
    return svd_ndcg

def find_best_svd_factors(train_inter, test_inter, factors_list, top_k):
    best_factors = None
    best_ndcg = -1
    results = []

    for n_factors in factors_list:
        print(f"Evaluating SVD with n_factors={n_factors}")
        config_predict_svd = {
            "train_inter": train_inter,
            "top_k": top_k,
            "recommenders": {
                "SVD": {
                    "n_factors": n_factors
                }
            }
        }
        recommendations = get_recommendations_for_svd(config_predict_svd)
        config_test = {
            "top_k": top_k,
            "test_inter": test_inter,
            "recommenders": recommendations['recommenders']
        }
        ndcg = evaluate_svd(config_test)
        results.append((n_factors, ndcg))

        if ndcg > best_ndcg:
            best_ndcg = ndcg
            best_factors = n_factors

    print(f"Best n_factors: {best_factors} with nDCG: {best_ndcg}")
    for n_factors, ndcg in results:
        print(f"n_factors: {n_factors}, nDCG: {ndcg}")

    return best_factors, best_ndcg


def score_ndcg(recs, g_truth):
    predicted_scores = np.zeros(g_truth.shape[1])

    for i, rec in enumerate(recs):
        predicted_scores[rec] = len(recs) - i

    return ndcg_score(g_truth, predicted_scores.reshape(1, -1), k=len(recs))


def evaluate_algorithm(predictions, g_truth, top_k):
    """
    Evaluate a single algorithm using the given predictions and ground truth.

    predictions - np.ndarray, predicted item rankings for each user
    g_truth - np.ndarray, ground truth interaction matrix
    top_k - int, number of top recommendations to evaluate

    returns - float, nDCG score for the given algorithm
    """
    # Calculate the average nDCG score over all users
    ndcg_scores = []
    for user_id in range(g_truth.shape[0]):
        recs = predictions[user_id]
        ground_truth = g_truth[user_id].reshape(1, -1)
        ndcg = score_ndcg(recs, ground_truth)
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)

def evaluate_algorithms(config: dict) -> dict:
    """
    config - dict, configuration as defined above;

    returns - dict, { Recommender Key from input dict: { "ndcg": float - ndcg from evaluation for this recommender} };
    """

    metrics = {
        "SVD": {
        },
        "ItemKNN": {
        },
        "TopPop": {
        },
    }

    # Calculate nDCG for SVD
    svd_predictions = config['recommenders']['SVD']['predictions']
    svd_ndcg = evaluate_algorithm(svd_predictions, config['test_inter'], config['top_k'])
    metrics["SVD"]["ndcg"] = svd_ndcg

    # Calculate nDCG for ItemKNN
    itemknn_predictions = config['recommenders']['ItemKNN']['predictions']
    itemknn_ndcg = evaluate_algorithm(itemknn_predictions, config['test_inter'], config['top_k'])
    metrics["ItemKNN"]["ndcg"] = itemknn_ndcg

    # Calculate nDCG for TopPop
    toppop_predictions = config['recommenders']['TopPop']['predictions']
    toppop_ndcg = evaluate_algorithm(toppop_predictions, config['test_inter'], config['top_k'])
    metrics["TopPop"]["ndcg"] = toppop_ndcg

    return metrics


def read(dataset, file):
    return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')

## Set up and initialization of interaction matrix

In [80]:
users = read('lfm-challenge','user')
items = read('lfm-challenge','item')
train_inters = read('lfm-challenge','inter_train')
test_inters = read('lfm-challenge','inter_test')

train_interaction_matrix = inter_matr_implicit(users=users, items=items, interactions=train_inters,
                                               dataset_name="lfm-challenge")
test_interaction_matrix = inter_matr_implicit(users=users, items=items, interactions=test_inters,
                                              dataset_name="lfm-challenge")

if t_reduce_data_size:
    # Reduce the size of the matrices by 1/10
    train_interaction_matrix = reduce_matrix_size(train_interaction_matrix, reduction_factor=reduce_data_size_n_times)
    test_interaction_matrix = reduce_matrix_size(test_interaction_matrix, reduction_factor=reduce_data_size_n_times)

    # Output the shapes of the reduced matrices to verify
    print(f"Reduced train interaction matrix shape: {train_interaction_matrix.shape}")
    print(f"Reduced test interaction matrix shape: {test_interaction_matrix.shape}")

users.index: RangeIndex(start=0, stop=2795, step=1)
items.index: RangeIndex(start=0, stop=4178, step=1)
res shape: (2795, 4178)
dataset name: count
user ids pd:0    100
1    100
2    100
3    100
4    100
5    100
6    100
7    100
8    100
9    100
Name: user_id, dtype: int64
item ids pd:0     341
1     393
2     331
3     343
4     326
5     345
6     342
7     332
8     340
9    2622
Name: item_id, dtype: int64
listening_events pd:0    7
1    2
2    2
3    6
4    2
5    5
6    5
7    3
8    8
9    2
Name: count, dtype: int64
row user ids np:[100 100 100 100 100 100 100 100 100 100] shape: (75860,)
col item ids np:[ 341  393  331  343  326  345  342  332  340 2622] shape: (75860,)
listening_events np:[7 2 2 6 2 5 5 3 8 2]
users.index: RangeIndex(start=0, stop=2795, step=1)
items.index: RangeIndex(start=0, stop=4178, step=1)
res shape: (2795, 4178)
dataset name: count
user ids pd:0    100
1    100
2    100
3    100
4    100
5    100
6    100
7    100
8    100
9    100
Name: user_id, d

## Configs
# config_predict responsible for hyperparameters tweaking 
# config_predict_svd for testing particularly svd
# config_test includes predictions of svd, itemknn and topPop recomenders

In [81]:
config_predict = {
    #interaction matrix
    "train_inter": train_interaction_matrix,
    #topK parameter used for all algorithms
    "top_k": 10,
    #specific parameters for all algorithms
    "recommenders": {
        "SVD": {
            "n_factors": 300
        },
        "ItemKNN": {
            "n_neighbours": 5
        },
        "TopPop": {
        }
    }
}

config_predict_svd = {
    #interaction matrix
    "train_inter": train_interaction_matrix,
    #topK parameter used for all algorithms
    "top_k": 10,
    #specific parameters for all algorithms
    "recommenders": {
        "SVD": {
            "n_factors": 300
        }
    }
}

config_test = {
    "top_k": 10,
    "test_inter": test_interaction_matrix,
    "recommenders": {}  # here you can access the recommendations from get_recommendations_for_algorithms

}



## Training SVD, ItemKNN and TopPop

In [82]:

recommendations = get_recommendations_for_algorithms(config_predict)

assert "SVD" in recommendations["recommenders"] and "predictions" in recommendations["recommenders"]["SVD"]
assert isinstance(recommendations["recommenders"]["SVD"]["predictions"], np.ndarray)
assert "ItemKNN" in recommendations["recommenders"] and "predictions" in recommendations["recommenders"]["ItemKNN"]
assert isinstance(recommendations["recommenders"]["ItemKNN"]["predictions"], np.ndarray)
assert "TopPop" in recommendations["recommenders"] and "predictions" in recommendations["recommenders"]["TopPop"]
assert isinstance(recommendations["recommenders"]["TopPop"]["predictions"], np.ndarray)


SVD...: 100%|██████████| 279/279 [00:00<00:00, 1555.08it/s]
  score = np.sum(intersection) / np.sum(union)
ItemKNN...: 100%|██████████| 279/279 [00:05<00:00, 55.31it/s]
TopPop...: 100%|██████████| 279/279 [00:00<00:00, 12759.63it/s]


## Loading all datasets

In [83]:
users = pd.read_csv('lfm-challenge/lfm-challenge.user', sep='\t')

print(users.info())
print(users.head())

items = pd.read_csv('lfm-challenge/lfm-challenge.item', sep='\t')

print(items.info())
print(items.head())

train_inters = pd.read_csv('lfm-challenge/lfm-challenge.inter_train', sep='\t')

print(train_inters.info())
print(train_inters.head())

test_inters = pd.read_csv('lfm-challenge/lfm-challenge.inter_test', sep='\t')

print(test_inters.info())
print(test_inters.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2795 entries, 0 to 2794
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   user_id              2795 non-null   int64 
 1   country              2264 non-null   object
 2   age_at_registration  2795 non-null   int64 
 3   gender               2789 non-null   object
 4   registration_date    2795 non-null   object
dtypes: int64(2), object(3)
memory usage: 109.3+ KB
None
   user_id country  age_at_registration gender    registration_date
0        0     NaN                   -1      n  2012-01-17 18:42:44
1        1     NaN                   -1      n  2011-03-24 13:27:26
2        2      US                   -1      m  2011-12-29 06:46:36
3        3     NaN                   -1      n  2012-04-16 11:21:04
4        4     NaN                   -1      n  2012-01-18 19:01:26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4178 entries, 0 to 4177
Data columns

## Add predictions to config_test

In [84]:

# config_test = {
#     "top_k": 10,
#     "test_inter": test_interaction_matrix,
#     "recommenders": recommendations['recommenders']  # Access the recommendations from get_recommendations_for_algorithms
# }
# add dictionary with recommendations to config dictionary
config_test.update(recommendations)

## Evaluate based on the sklearn.metrics

In [85]:
evaluations = evaluate_algorithms(config_test)


if t_to_load_models:
    loaded_models = load_models(path_to_load_model)
    for recommender in loaded_models.keys():
        print(f"Loaded {recommender} model.")

## Check nDCG

In [86]:
# Output the evaluations
for recommender in evaluations.keys():
    print(f"{recommender} ndcg: {evaluations[recommender]['ndcg']}")

SVD ndcg: 0.005902152418822916
ItemKNN ndcg: 0.003962473332489676
TopPop ndcg: 0.011413092917180625


## Save .tsv submissions 

In [87]:


matr_num = "k00000000"  # Your matriculation number
name = "Joe_Biden"  # Your name

# Load test user IDs from the provided file
test_user_ids = load_test_user_ids(path_to_user_ids)

# Save the submission files
save_submission_files(config_test, recommendations, test_user_ids, matr_num, name)


IndexError: index 288 is out of bounds for axis 0 with size 279

## THE END

## Finding best hyperparameter for SVD
## IMPORTANT RUN ONLY IF YOU WANNA FIND BEST SVD

In [None]:

# Example usage
factors_list = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
top_k = 10

best_factors, best_ndcg = find_best_svd_factors(train_interaction_matrix, test_interaction_matrix, factors_list, top_k)



Evaluating SVD with n_factors=50


SVD...: 100%|██████████| 2795/2795 [01:37<00:00, 28.59it/s]


Evaluating SVD with n_factors=100


SVD...: 100%|██████████| 2795/2795 [01:59<00:00, 23.32it/s]


Evaluating SVD with n_factors=150


SVD...: 100%|██████████| 2795/2795 [02:22<00:00, 19.56it/s]


Evaluating SVD with n_factors=200


SVD...: 100%|██████████| 2795/2795 [02:41<00:00, 17.27it/s]


Evaluating SVD with n_factors=250


SVD...: 100%|██████████| 2795/2795 [03:03<00:00, 15.21it/s]


Evaluating SVD with n_factors=300


SVD...: 100%|██████████| 2795/2795 [03:25<00:00, 13.59it/s]


Evaluating SVD with n_factors=350


SVD...: 100%|██████████| 2795/2795 [03:47<00:00, 12.27it/s]


Evaluating SVD with n_factors=400


SVD...: 100%|██████████| 2795/2795 [04:10<00:00, 11.15it/s]


Evaluating SVD with n_factors=450


SVD...: 100%|██████████| 2795/2795 [04:32<00:00, 10.27it/s]


Evaluating SVD with n_factors=500


SVD...: 100%|██████████| 2795/2795 [04:52<00:00,  9.55it/s]

Best n_factors: 300 with nDCG: 0.168901641627851
n_factors: 50, nDCG: 0.1228218437052105
n_factors: 100, nDCG: 0.1437951333498403
n_factors: 150, nDCG: 0.15622232866597494
n_factors: 200, nDCG: 0.16168468844518286
n_factors: 250, nDCG: 0.16541059707291825
n_factors: 300, nDCG: 0.168901641627851
n_factors: 350, nDCG: 0.1681596852087657
n_factors: 400, nDCG: 0.1688111431433744
n_factors: 450, nDCG: 0.16792947005182346
n_factors: 500, nDCG: 0.1641889306768001
Best SVD factors: 300 with nDCG: 0.168901641627851





In [None]:
try:
    U_final, V_final = svd_decompose(config_predict['train_inter'], config_predict['recommenders']['SVD']['n_factors'])
    for user_id in tqdm (range(config_predict['train_inter'].shape[0]), desc="SVD..."):
        seen_item_ids = np.where(config_predict['train_inter'][user_id] > 0)[0] 
        get_recommendations_for_svd = svd_recommend_to_list(user_id, seen_item_ids, U_final, V_final, config_predict['top_k'])
        recommendations['recommenders']['SVD']['predictions'].append([get_recommendations_for_svd])
    recommendations['recommenders']['SVD']['predictions'] = np.vstack(recommendations['recommenders']['SVD']['predictions'])
except Exception as e:
    print("SVD Decomposition Failed:", str(e))

{'recommenders': {'SVD': {'predictions': array([[  95,  932, 2698, ..., 2243, 3215,  247],
          [ 193, 1553,  233, ..., 1318,   11, 2786],
          [ 473, 1171,  599, ..., 1088,  957, 2053],
          ...,
          [1978, 2315,  187, ..., 1435, 2279, 2697],
          [2170, 2662, 3305, ..., 1975,   75,  907],
          [2052,  204, 1192, ..., 1590, 2298, 1260]], dtype=int64)},
  'ItemKNN': {'predictions': array([[3215, 3505, 2245, ..., 1918, 2243, 1739],
          [2786, 1992, 3489, ...,  233, 2327, 1968],
          [ 473, 1731, 2191, ..., 1171, 1839,   77],
          ...,
          [1978, 2315, 3111, ..., 2310,  445, 2594],
          [4027, 2170, 2662, ..., 2344, 3087, 1665],
          [2530, 1757,   51, ..., 2259, 2254, 3620]], dtype=int64)},
  'TopPop': {'predictions': array([[2108, 1667, 1376, ...,  323, 1059,  803],
          [2108, 1667, 1376, ...,  323,  324, 1616],
          [2108, 1667,  892, ...,  301, 1616, 1059],
          ...,
          [2108, 1667,  892, ...,  301,