In [197]:
from src.utils import TrainTestSplitter
from src.models import  BaseModelAverage
from src.metrics import ml_metrics, predictive_metrics, rank_metrics

from src.utils import read_pickles

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [198]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")

In [199]:
train, test = TrainTestSplitter.split_by_percent(df_ratings, 0.8)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

true_scores = test["Rating"]

base_model = BaseModelAverage()
base_model.fit(train)
predicted_scores = base_model.predict(test["MovieID"])

# evaluate
print(f"ML merics: {ml_metrics(true_scores, predicted_scores)}")
print(f"Predictive merics: {predictive_metrics(test, predicted_scores, k=5)}")
print(f"Rank merics: {rank_metrics(test, predicted_scores, k=1, threshold=5)}")

Train shape: (800167, 6)
Test shape: (200042, 6)
ML merics: {'mae': 0.785, 'rmse': 0.985, 'precision': 0.805, 'recall': 0.361, 'f1': 0.498, 'roc_auc': 0.617}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.869, 'recall_at_k': 0.101, 'avrg_prec_at_k': 0.869, 'n_users_with_k': 1246}
Rank merics: {'mean_reciprocal_rank': 2.034, 'hit_rate': 0.903}


In [200]:
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Date
0,1,1193,5,978300760,2000-12-31 22:12:40,2000-12-31
1,1,661,3,978302109,2000-12-31 22:35:09,2000-12-31
2,1,914,3,978301968,2000-12-31 22:32:48,2000-12-31
3,1,3408,4,978300275,2000-12-31 22:04:35,2000-12-31
4,1,2355,5,978824291,2001-01-06 23:38:11,2001-01-06


In [201]:
# Encode user and movie ids
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_ratings['UserID'] = user_encoder.fit_transform(df_ratings['UserID'])
df_ratings['MovieID'] = movie_encoder.fit_transform(df_ratings['MovieID'])

rating_matrix = train.pivot_table(index='UserID', columns='MovieID', \
                                  values='Rating', fill_value=0)
rating_matrix.head(3)

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:


class AlternatingLeastSquares:
    def __init__(self, num_factors=10, regularization=0.1, iterations=10):
        self.num_factors = num_factors
        self.regularization = regularization
        self.iterations = iterations

    def fit(self, interaction_matrix):
        self.num_users, self.num_items = interaction_matrix.shape
        self.user_factors = np.random.random((self.num_users, self.num_factors))
        self.item_factors = np.random.random((self.num_items, self.num_factors))

        for iteration in range(self.iterations):
            self.user_factors = self._als_step(interaction_matrix, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(interaction_matrix.T, self.item_factors, self.user_factors)


    def _als_step(self, interaction_matrix, update_vecs, fixed_vecs):
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.num_factors) * self.regularization
        b = interaction_matrix.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        update_vecs = b.dot(A_inv)
        return update_vecs


    def predict(self, user_id):      
        predictions =  self.user_factors.dot(self.item_factors.T)
        print(predictions.shape)
        return predictions[user_id]


als = AlternatingLeastSquares(num_factors=10, regularization=0.1, iterations=10)

als.fit(train)

TypeError: unsupported operand type(s) for *: 'Timestamp' and 'float'

In [203]:
def find_recommendations(user_id, number_of_recommendations=15):
    predictions = als.predict(user_id)
    high_score_movie_ids = np.argsort(predictions)[::-1][:number_of_recommendations]
    items_to_recommend = df_movies.where(df_movies['MovieID'].isin(high_score_movie_ids)).dropna()
    print(items_to_recommend[['Title', 'Genres']])

## We would recommend following items to user with id 20

In [204]:
find_recommendations(20, 15)

(4795, 3685)
                                     Title               Genres
538                             Son in Law             [Comedy]
1633                 Chairman of the Board             [Comedy]
1871                 Gentleman's Agreement              [Drama]
2046  Indiana Jones and the Temple of Doom  [Action, Adventure]
2234                             Nashville     [Drama, Musical]
2787                    I Saw What You Did           [Thriller]
2839                        Boys Don't Cry              [Drama]
2962                           Repossessed             [Comedy]
3548                             Road Trip             [Comedy]
3556       Better Living Through Circuitry        [Documentary]


In [206]:
train.shape

(800167, 6)

In [208]:
test[test["UserID"] == 20]

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Date


In [209]:
find_recommendations(20, 15)


(4795, 3685)
                                     Title               Genres
538                             Son in Law             [Comedy]
1633                 Chairman of the Board             [Comedy]
1871                 Gentleman's Agreement              [Drama]
2046  Indiana Jones and the Temple of Doom  [Action, Adventure]
2234                             Nashville     [Drama, Musical]
2787                    I Saw What You Did           [Thriller]
2839                        Boys Don't Cry              [Drama]
2962                           Repossessed             [Comedy]
3548                             Road Trip             [Comedy]
3556       Better Living Through Circuitry        [Documentary]
