In [1]:
import pandas as pd
import numpy as np
from surprise import dump
import os
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
from datetime import datetime
from collections import defaultdict

# load matrix and data

In [15]:
MODEL_DIR = os.path.abspath('25k-user-based-algo.dump')
_, CF_model = dump.load(MODEL_DIR)


SVD_MODEL_DIR = os.path.abspath('SVD_model_subset.dump')
_, SVD_model = dump.load(SVD_MODEL_DIR)

train_data = pd.read_csv("rating_dict.csv")

df = pd.read_csv("../ratings.csv",index_col=False).reindex(columns=["movie", "user_id", "rating"]).rename(columns={"user_id":"user"})
print(df.shape)
reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user", "movie", "rating"]], reader)
print(train_data.head(3))
for i in range(3):
    print(data.raw_ratings[i])

(907085, 3)
                                           movie    user  rating
0                        3+ninjas+kick+back+1994   10000       3
1                            monsters_+inc.+2001  100002       4
2  more+about+the+children+of+noisy+village+1987  100002       4
(523475, 'prison+break+the+final+break+2009', 4.0, None)
(906340, 'the+warlords+2007', 3.0, None)
(474539, 'the+unknown+woman+2011', 4.0, None)


# calculate RMSE 


In [3]:
train, test = train_test_split(data, test_size=0.2)
# test_df = test_df.drop(columns=["rating"]).reindex(columns=[ "user", "movie"])

CF_pred = CF_model.test(test)
SVD_pred = SVD_model.test(test)
CF_rmse = accuracy.mae(CF_pred,verbose=False)
SVD_rmse = accuracy.mae(SVD_pred,verbose=False)
print("CF's rmse: ", CF_rmse)
print("SVD's rmse: ", SVD_rmse)



CF's rmse:  0.601897537586233
SVD's rmse:  0.5814020326287207


# calculate accuracy

In [4]:
tolerance = 0.25
cf_correct = sum(abs(prediction.est - prediction.r_ui) <= tolerance for prediction in CF_pred)
cf_acc = cf_correct / len(CF_pred)
print(f'CF Model accuracy: {cf_acc * 100:.2f}%')
svd_correct = sum(abs(prediction.est - prediction.r_ui) <= tolerance for prediction in SVD_pred)
svd_acc = svd_correct / len(CF_pred)
print(f'SVD Model Accuracy: {svd_acc * 100:.2f}%')

CF Model accuracy: 3.32%
SVD Model Accuracy: 19.58%


# test training


In [5]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_data[["user", "movie", "rating"]], reader)
###### from model teams' code ######
trainingSet = data.build_full_trainset()
sim_options = {
    "name": "cosine",
    "user_based": True,
}
CF_algo = KNNWithMeans(sim_options=sim_options)

start_time = datetime.now()

CF_algo.fit(trainingSet) 

end_time = datetime.now()

print('CF Model trainning time: ', end_time - start_time)

Computing the cosine similarity matrix...
Done computing similarity matrix.
CF Model trainning time:  0:00:06.107910


In [6]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_data[["user", "movie", "rating"]], reader)
###### from model teams' code ######
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

start_time = datetime.now()

gs.fit(data)

end_time = datetime.now()

print('SVD Model trainning time: ', end_time - start_time)

SVD Model trainning time:  0:00:08.056400


# inference cost


In [21]:
def CF_model_rec(allset, user_id_int):
    
    inner_user_id = allset.to_inner_uid(user_id_int)
    print(inner_user_id)
    neighbors = CF_model.get_neighbors(inner_user_id, k=10)

    all_ratings = list(allset.all_ratings())
    
    target_user_ratings = [all_ratings[inner_user_id]]
    target_user_items = [iid for (uid, iid, r) in target_user_ratings]
    
    neighbor_items = []
    for neighbor in neighbors:
        neighbor_ratings = [all_ratings[neighbor]]
        for (uid, iid, r) in neighbor_ratings:
            neighbor_items.append(iid)

    unwatched_items = set(neighbor_items) - set(target_user_items)
    # Predict the ratings for all unwatched items for the given user
    predictions = [CF_model.predict(user_id_int, allset.to_raw_iid(iid)).est for iid in unwatched_items]

    predictions_with_item_id = list(zip(unwatched_items, predictions))
    predictions_with_item_id.sort(key=lambda x: x[1], reverse=True)
    recommended_items = [(allset.to_raw_iid(item_id), score) for (item_id, score) in predictions_with_item_id]
    return recommended_items

In [31]:
train_data = pd.read_csv("rating_dict.csv")
all_df = train_data
reader = Reader(rating_scale=(1, 5))
all_data = Dataset.load_from_df(all_df[["user", "movie", "rating"]], reader)
allset = all_data.build_full_trainset()

start_time = datetime.now()

rec = CF_model_rec(allset, 100002)

end_time = datetime.now()
print(rec)
print('CF Model trainning time: ', end_time - start_time)

1
[('the+hobbit+an+unexpected+journey+2012', 4.4), ('for+the+bible+tells+me+so+2007', 4.0), ('look_+up+in+the+sky+the+amazing+story+of+superman+2006', 4.0), ('110901+-+september+11+2002', 4.0), ('papadopoulos++sons+2012', 4.0), ('paperman+2012', 4.0), ('dead+ahead+the+exxon+valdez+disaster+', 4.0), ('hit+man+1972', 4.0), ('the+goonies+1985', 4.0), ('harry+potter+and+the+deathly+hallows+part+2+2011', 3.8333333333333335)]
CF Model trainning time:  0:00:00.042038


In [25]:
def SVD_model_rec(all_df, user_id_int):
        ####### SVD model ##########
    test_user = user_id_int
    df = all_df
    user_ids = df["user"].unique().tolist()
    user2user_encoded = {x: i for i, x in enumerate(user_ids)}
    userencoded2user = {i: x for i, x in enumerate(user_ids)}

    movie_ids = df["movie"].unique().tolist()
    movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
    movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

    df["user"] = df["user"].map(user2user_encoded)
    df["movie_id"] = df["movie"].map(movie2movie_encoded)

    df["rating"] = df["rating"].values.astype(np.float32)
    movies_watched_by_user = df[df.user == test_user]

    # Get movie_id of movies_not_watched
    movies_not_watched = df[~df["movie"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
    movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.values())))
    # Get user_id of the sample user
    user_encoder = user2user_encoded.get(test_user)
    # Let the model predict the rating of the movies in movies_not_watched
    ratings = np.array([SVD_model.predict(test_user,movie_encoded2movie[i]).est for i in movies_not_watched])
    # Get the top 20 movies
    top_ratings_indices = ratings.argsort()[-20:][::-1]
    recommended_movies = [(movie_encoded2movie.get(movies_not_watched[x]),ratings[x]) for x in top_ratings_indices]

    recommend = []
    for movie, score in recommended_movies:
        recommend.append([movie, float(round(score))])

    return recommend

In [30]:
train_data = pd.read_csv("../ratings.csv",index_col=False).reindex(columns=["movie", "user_id", "rating"]).rename(columns={"user_id":"user"})
all_df = train_data

start_time = datetime.now()

rec = SVD_model_rec(all_df, 100002)

end_time = datetime.now()
print(rec)
print('SVD Model trainning time: ', end_time - start_time)

[['the+shawshank+redemption+1994', 4.0], ['the+godfather+1972', 4.0], ['the+lord+of+the+rings+the+return+of+the+king+2003', 4.0], ['one+flew+over+the+cuckoos+nest+1975', 4.0], ['fight+club+1999', 4.0], ['pulp+fiction+1994', 4.0], ['goodfellas+1990', 4.0], ['the+lord+of+the+rings+the+fellowship+of+the+ring+2001', 4.0], ['the+lord+of+the+rings+the+two+towers+2002', 4.0], ['alien+1979', 4.0], ['monty+python+and+the+holy+grail+1975', 4.0], ['aliens+1986', 4.0], ['the+dark+knight+2008', 4.0], ['spirited+away+2001', 4.0], ['senna+2010', 4.0], ['seven+samurai+1954', 4.0], ['saving+private+ryan+1998', 4.0], ['wild+china+2008', 4.0], ['the+ascent+1977', 4.0], ['the+good_+the+bad+and+the+ugly+1966', 4.0]]
SVD Model trainning time:  0:00:00.971410


# file size


In [32]:
CF_size = os.path.getsize('25k-user-based-algo.dump')

SVD_size = os.path.getsize('SVD_model_subset.dump')

print("CF model size: {} bytes".format(CF_size))
print("SVD model size: {} bytes".format(SVD_size))

CF model size: 5002059951 bytes
SVD model size: 35381999 bytes
