In [2]:
import pandas as pd
from src.utils import TrainTestSplitter, read_pickles
from src.models import ItemItemModel, BaseModelAverage
from src.metrics import ml_metrics, predictive_metrics, rank_metrics
from tqdm import tqdm
tqdm.pandas()

In [3]:
# Read the three objects that read_pickles returns for the ml-1m-after_eda
# directory; they are unpacked as the movies, users and ratings frames.
EDA_PICKLES_DIR = "../../data/ml-1m-after_eda/"
df_movies, df_users, df_ratings = read_pickles(EDA_PICKLES_DIR)

In [4]:
# Split the ratings into a train and a test part; 0.8 is the fraction argument
# passed to TrainTestSplitter.split_by_percent.
train, test = TrainTestSplitter.split_by_percent(df_ratings, 0.8)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Ground-truth ratings of the held-out rows.
true_scores = test["Rating"]

# Baseline: fit BaseModelAverage on the training part only, then predict one
# score per MovieID entry of the test part.
base_model = BaseModelAverage()
base_model.fit(train)
predicted_scores = base_model.predict(test["MovieID"])

# evaluate
# NOTE(review): rank_metrics is called with k=1, threshold=5 here, while the
# item-item evaluation cell further down uses k=4, threshold=3 — confirm which
# setting is intended before comparing the two models.
print(f"ML metrics: {ml_metrics(true_scores, predicted_scores)}")
print(f"Predictive metrics: {predictive_metrics(test, predicted_scores, k=5)}")
print(f"Rank metrics: {rank_metrics(test, predicted_scores, k=1, threshold=5)}")

# Item-Item model

In [5]:
# Item-item model that uses 300 similar items (num_of_similar_items).
item_item_model = ItemItemModel(num_of_similar_items=300)

# Fit on the training split only. This model is scored on rows of `test`
# later in the notebook, so fitting on the full df_ratings would let the
# held-out ratings leak into the model and inflate its metrics.
# NOTE(review): `rading_df` is kept exactly as the notebook already spells the
# keyword — presumably it matches ItemItemModel.fit's signature; verify.
item_item_model.fit(rading_df=train)

# Sample prediction for one (user, film) pair; predict returns a pair that is
# unpacked into the prediction and the indexes of the similar items.
predicted_scores_for_each_film, indexes_of_similar_items = item_item_model.predict(user_id=20, film_id=100)

In [6]:
# Display the first value returned by item_item_model.predict(user_id=20, film_id=100).
predicted_scores_for_each_film

## We take only the first 10,000 test rows to make predictions faster

In [18]:
def predict_wrapper(row):
    """Return the item-item prediction for one row of ``X_test``.

    ``row`` must provide the "UserID" and "MovieID" entries. The call to
    ``item_item_model.predict`` yields a pair; only its first element is
    returned, the second one (indexes of similar items) is discarded.
    """
    score, _ = item_item_model.predict(row["UserID"], row["MovieID"])
    return score


# Restrict the evaluation to the first 10 000 rows of the test split.
X_test = test[["UserID", "MovieID"]].iloc[:10_000]
y_test = test["Rating"].iloc[:10_000]

# Row-wise prediction with a tqdm progress bar; the result is a plain list.
predict = X_test.progress_apply(predict_wrapper, axis=1).tolist()

In [23]:
# Wrap the list of predictions in a one-column frame named "Rating" and
# replace missing predictions (NaN) with 0.
y_pred = pd.DataFrame(predict, columns=["Rating"]).fillna(0)

In [26]:
# y_pred holds predictions only for the leading rows of `test` (the slice the
# predictions were generated from), so the metrics that take the test frame
# must receive the matching slice rather than the whole test set.
test_subset = test.iloc[: len(y_pred)]

# NOTE(review): rank_metrics uses k=4, threshold=3 here, while the baseline
# cell uses k=1, threshold=5 — confirm which setting is intended before
# comparing the two models.
print(f"ML metrics: {ml_metrics(y_test, y_pred)}")
print(f"Predictive metrics: {predictive_metrics(test_subset, y_pred, k=5)}")
print(f"Rank metrics: {rank_metrics(test_subset, y_pred, k=4, threshold=3)}")