In [1]:
import numpy as np
import pandas as pd

from src import utils, models, metrics

In [2]:
# Load the movies, users and ratings tables from the post-EDA data folder.
# NOTE(review): the helper name suggests the files are pickles — only load
# them from a trusted location.
df_movies, df_users, df_ratings = utils.read_pickles(
    "../../data/ml-1m-after_eda/"
)

#### 1. Ordered split: 80% of the data in train

In [3]:
# Ordered (non-random) hold-out: 0.8 is the fraction passed to the splitter.
train, test = utils.TrainTestSplitter.split_by_percent(df_ratings, 0.8)
for part_name, part in (("Train", train), ("Test", test)):
    print(f"{part_name} shape: {part.shape}")

# Ground truth: the ratings recorded in the held-out rows.
true_scores = test["Rating"]

# Baseline model: fit on the train part, then score the test rows.
# Only the MovieID column is handed to predict().
base_model = models.BaseModelAverage()
base_model.fit(train)
test_movie_ids = test["MovieID"]
predicted_scores = base_model.predict(test_movie_ids)

# Report the three metric families; the two ranking-style calls share k/threshold.
# NOTE(review): "merics" in the labels is a typo for "metrics"; kept verbatim so
# the labels still match the output saved with this cell.
# NOTE(review): the saved output shows mean_reciprocal_rank above 1, which a mean
# of reciprocal ranks normally cannot exceed — check metrics.rank_metrics.
at_k = {"k": 5, "threshold": 4}
ml_report = metrics.ml_metrics(true_scores, predicted_scores)
print(f"ML merics: {ml_report}")
predictive_report = metrics.predictive_metrics(test, predicted_scores, **at_k)
print(f"Predictive merics: {predictive_report}")
rank_report = metrics.rank_metrics(test, predicted_scores, **at_k)
print(f"Rank merics: {rank_report}")

Train shape: (800167, 6)
Test shape: (200042, 6)
ML merics: {'mae': 0.785, 'rmse': 0.985, 'precision': 0.805, 'recall': 0.361, 'f1': 0.498, 'roc_auc': 0.617}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.869, 'recall_at_k': 0.101, 'avrg_prec_at_k': 0.869, 'n_users_with_k': 1246}
Rank merics: {'mean_reciprocal_rank': 1.182, 'hit_rate': 0.997}


#### 2. Random split (for experimentation only): 80% of the data in train

In [4]:
# Random hold-out: same 0.8 fraction as the ordered split, with random_split=True.
# NOTE(review): no seed is passed here; whether this split is reproducible
# depends on how split_by_percent draws its randomness — TODO confirm.
train, test = utils.TrainTestSplitter.split_by_percent(
    df_ratings, 0.8, random_split=True
)
for part_name, part in (("Train", train), ("Test", test)):
    print(f"{part_name} shape: {part.shape}")

# Ground truth: the ratings recorded in the held-out rows.
true_scores = test["Rating"]

# Baseline model: fit on the train part, then score the test rows.
# Only the MovieID column is handed to predict().
base_model = models.BaseModelAverage()
base_model.fit(train)
test_movie_ids = test["MovieID"]
predicted_scores = base_model.predict(test_movie_ids)

# Report the three metric families; the two ranking-style calls share k/threshold.
# NOTE(review): "merics" in the labels is a typo for "metrics"; kept verbatim so
# the labels still match the output saved with this cell.
# NOTE(review): the saved output shows mean_reciprocal_rank above 1, which a mean
# of reciprocal ranks normally cannot exceed — check metrics.rank_metrics.
at_k = {"k": 5, "threshold": 4}
ml_report = metrics.ml_metrics(true_scores, predicted_scores)
print(f"ML merics: {ml_report}")
predictive_report = metrics.predictive_metrics(test, predicted_scores, **at_k)
print(f"Predictive merics: {predictive_report}")
rank_report = metrics.rank_metrics(test, predicted_scores, **at_k)
print(f"Rank merics: {rank_report}")

Train shape: (800167, 6)
Test shape: (200042, 6)
ML merics: {'mae': 0.782, 'rmse': 0.98, 'precision': 0.811, 'recall': 0.334, 'f1': 0.473, 'roc_auc': 0.614}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.795, 'recall_at_k': 0.393, 'avrg_prec_at_k': 0.851, 'n_users_with_k': 5584}
Rank merics: {'mean_reciprocal_rank': 1.206, 'hit_rate': 0.99}


#### 3. User-based split: 10 reviews per user in test

In [5]:
# Per-user hold-out: the splitter is asked to put 10 reviews of each user in test.
train, test = utils.TrainTestSplitter.split_by_users(
    df_ratings, n_reviews_in_test=10
)
for part_name, part in (("Train", train), ("Test", test)):
    print(f"{part_name} shape: {part.shape}")

# Ground truth: the ratings recorded in the held-out rows.
true_scores = test["Rating"]

# Baseline model: fit on the train part, then score the test rows.
# Only the MovieID column is handed to predict().
base_model = models.BaseModelAverage()
base_model.fit(train)
test_movie_ids = test["MovieID"]
predicted_scores = base_model.predict(test_movie_ids)

# Report the three metric families; the two ranking-style calls share k/threshold.
# NOTE(review): "merics" in the labels is a typo for "metrics"; kept verbatim so
# the labels still match the output saved with this cell.
# NOTE(review): the saved output shows mean_reciprocal_rank above 1, which a mean
# of reciprocal ranks normally cannot exceed — check metrics.rank_metrics.
at_k = {"k": 5, "threshold": 4}
ml_report = metrics.ml_metrics(true_scores, predicted_scores)
print(f"ML merics: {ml_report}")
predictive_report = metrics.predictive_metrics(test, predicted_scores, **at_k)
print(f"Predictive merics: {predictive_report}")
rank_report = metrics.rank_metrics(test, predicted_scores, **at_k)
print(f"Rank merics: {rank_report}")

Train shape: (939809, 6)
Test shape: (60400, 6)
ML merics: {'mae': 0.793, 'rmse': 0.999, 'precision': 0.814, 'recall': 0.438, 'f1': 0.569, 'roc_auc': 0.628}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.757, 'recall_at_k': 0.61, 'avrg_prec_at_k': 0.806, 'n_users_with_k': 6040}
Rank merics: {'mean_reciprocal_rank': 1.253, 'hit_rate': 0.993}
