In [21]:
import pandas as pd
from src.utils import TrainTestSplitter, read_pickles
from src.models import ItemItemModel, BaseModelAverage
from src.metrics import ml_metrics, predictive_metrics, rank_metrics
from tqdm import tqdm
tqdm.pandas()

In [4]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")

In [5]:
train, test = TrainTestSplitter.split_by_percent(df_ratings, 0.8)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

true_scores = test["Rating"]

base_model = BaseModelAverage()
base_model.fit(train)
predicted_scores = base_model.predict(test["MovieID"])

# evaluate
print(f"ML merics: {ml_metrics(true_scores, predicted_scores)}")
print(f"Predictive merics: {predictive_metrics(test, predicted_scores, k=5)}")
print(f"Rank merics: {rank_metrics(test, predicted_scores, k=1, threshold=5)}")

Train shape: (800167, 6)
Test shape: (200042, 6)
ML merics: {'mae': 0.785, 'rmse': 0.985, 'precision': 0.805, 'recall': 0.361, 'f1': 0.498, 'roc_auc': 0.617}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.869, 'recall_at_k': 0.101, 'avrg_prec_at_k': 0.869, 'n_users_with_k': 1246}
Rank merics: {'mean_reciprocal_rank': 2.034, 'hit_rate': 0.903}


# Item - Item 

In [7]:
item_item_model = ItemItemModel(num_of_similar_users=300)
item_item_model.fit(rading_df=df_ratings)
predicted_scores_for_each_film = item_item_model.predict(user_id=20, film_id=100)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## We take only first 10 000 items to make predictions faster

In [17]:
X_test = test[["UserID", "MovieID"]][:10000]
y_test = test["Rating"][:10000]

def predict_wrapper(row):
    user_id = row["UserID"]
    movie_id = row["MovieID"]
    return item_item_model.predict(user_id, movie_id)

predict = X_test.progress_apply(predict_wrapper, axis=1).tolist()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 10000/10000 [00:20<00:00, 492.55it/s]


In [22]:
y_pred = pd.DataFrame(predict, columns=["Rating"])

In [27]:
print(f"ML merics: {ml_metrics(y_test, y_pred)}")

ML merics: {'mae': 1.057, 'rmse': 1.44, 'precision': 0.624, 'recall': 0.948, 'f1': 0.752, 'roc_auc': 0.531}
