In [187]:
import pandas as pd

from src.utils import read_pickles
from sklearn.preprocessing import LabelEncoder



In [188]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")

In [189]:
def train_test_split(df, split_date):
    train = df[df["Date"] < split_date][["UserID", "MovieID", "Rating"]]
    test = df[df["Date"] >= split_date][["UserID", "MovieID", "Rating"]]
    return train, test

split_date = pd.to_datetime("2000-12-02").date()
train, test = train_test_split(df_ratings, split_date)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (797116, 3)
Test shape: (203093, 3)


In [190]:
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Date
0,1,1193,5,978300760,2000-12-31 22:12:40,2000-12-31
1,1,661,3,978302109,2000-12-31 22:35:09,2000-12-31
2,1,914,3,978301968,2000-12-31 22:32:48,2000-12-31
3,1,3408,4,978300275,2000-12-31 22:04:35,2000-12-31
4,1,2355,5,978824291,2001-01-06 23:38:11,2001-01-06


In [191]:
# Encode user and movie ids
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_ratings['UserID'] = user_encoder.fit_transform(df_ratings['UserID'])
df_ratings['MovieID'] = movie_encoder.fit_transform(df_ratings['MovieID'])

# Create user-item interaction matrix
# interaction_matrix = csr_matrix(
#     (df_ratings['UserID'], df_ratings['MovieID'])
# )

rating_matrix = train.pivot_table(index='UserID', columns='MovieID', \
                                  values='Rating', fill_value=0)
rating_matrix.head(3)

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import mean_squared_error

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_ratings['UserID'] = user_encoder.fit_transform(df_ratings['UserID'])
df_ratings['MovieID'] = movie_encoder.fit_transform(df_ratings['MovieID'])

interaction_matrix = csr_matrix(
    (df_ratings['Rating'], (df_ratings['UserID'], df_ratings['MovieID']))
)

class AlternatingLeastSquares:
    def __init__(self, num_factors=10, regularization=0.1, iterations=10):
        self.num_factors = num_factors
        self.regularization = regularization
        self.iterations = iterations

    def fit(self, interaction_matrix):
        self.num_users, self.num_items = interaction_matrix.shape
        self.user_factors = np.random.random((self.num_users, self.num_factors))
        self.item_factors = np.random.random((self.num_items, self.num_factors))

        for iteration in range(self.iterations):
            self.user_factors = self._als_step(interaction_matrix, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(interaction_matrix.T, self.item_factors, self.user_factors)


    def _als_step(self, interaction_matrix,  update_vecs, fixed_vecs):
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.num_factors) * self.regularization
        b = interaction_matrix.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        update_vecs = b.dot(A_inv)
        return update_vecs


    def predict(self, user_id):      
        predictions =  self.user_factors.dot(self.item_factors.T)
        print(predictions.shape)
        return predictions[user_id]


als = AlternatingLeastSquares(num_factors=10, regularization=0.1, iterations=10)

als.fit(interaction_matrix)


In [193]:
def find_recommendations(user_id, number_of_recommendations=15):
    predictions = als.predict(user_id)
    high_score_movie_ids = np.argsort(predictions)[::-1][:number_of_recommendations]
    items_to_recommend = df_movies.where(df_movies['MovieID'].isin(high_score_movie_ids)).dropna()
    print(items_to_recommend[['Title', 'Genres']])

## We would recommend following items to user with id 20

In [195]:
find_recommendations(20, 15)

(6040, 3706)
                                            Title  \
250                    Interview with the Vampire   
570                           Spanking the Monkey   
577                         Celluloid Closet, The   
1090                             Leopard Son, The   
1091                                        Loser   
1104                  People vs. Larry Flynt, The   
2093  NeverEnding Story II: The Next Chapter, The   
2305                                      Gung Ho   
2582              Frankenstein Meets the Wolf Man   
2706                                      Head On   
2716                              Tales of Terror   
2829                               Dark Half, The   
2889                             Naturally Native   
3481                                  Hunger, The   

                                Genres  
250                    [Drama, Horror]  
570                    [Comedy, Drama]  
577                      [Documentary]  
1090                     [D