In [60]:
import random
import numpy as np
import pandas as pd
from typing import *
from IPython.display import display, HTML, Markdown
import warnings
from scipy.linalg import svd
import os
from scipy.linalg import sqrtm

warnings.filterwarnings('ignore')


In [2]:
os.path.join(os.getcwd(),'ml-100k')

'/Users/prashant.singh/Documents/Personal Projects/Recommendation Engine/3. SVD/ml-100k'

## Data Loading

In [17]:
ratings_data = pd.read_csv(
    'ml-100k/u.data', sep='\t', encoding="ISO-8859-1",
    names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings_data = ratings_data[['user_id', 'movie_id', 'rating']]
movie_data = pd.read_csv(
    'ml-100k/u.item', sep='|', encoding="ISO-8859-1",
    names= [
        'movie_id', 'title', 'release_date', 'video_release_date', 'url', 
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)
movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
ratings_data['user_id'] = ratings_data['user_id'].map(lambda k: f"User {k}")
ratings_and_movies = ratings_data.set_index('movie_id').join(movie_data['title']).reset_index(drop=True)
ratings_and_movies['movie_title'] = ratings_and_movies['title']
movielens_df = ratings_and_movies[['user_id', 'movie_title', 'rating']].sample(frac=1) 

In [18]:
# Remove movies with few ratings
movielens_df = movielens_df[movielens_df.groupby('movie_title')['user_id'].transform('count') > 50]
movielens_df.head()

Unnamed: 0,user_id,movie_title,rating
19668,User 492,D3: The Mighty Ducks (1996),4
28275,User 465,Amadeus (1984),4
53657,User 497,"Assignment, The (1997)",4
83615,User 621,"Replacement Killers, The (1998)",4
56032,User 746,"Little Rascals, The (1994)",3


In [20]:
movielens_df

Unnamed: 0,user_id,movie_title,rating
19668,User 492,D3: The Mighty Ducks (1996),4
28275,User 465,Amadeus (1984),4
53657,User 497,"Assignment, The (1997)",4
83615,User 621,"Replacement Killers, The (1998)",4
56032,User 746,"Little Rascals, The (1994)",3
...,...,...,...
8991,User 405,Ace Ventura: Pet Detective (1994),5
1640,User 316,Richard III (1995),4
73569,User 890,It Happened One Night (1934),5
21174,User 532,Sleeper (1973),5


## SVD Training with Surprise

In [25]:
matrix_df = movielens_df.groupby(['user_id', 'movie_title'])['rating'].mean().unstack()
matrix_df.head()

movie_title,12 Angry Men (1957),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Across the Sea of Time (1995),Addams Family Values (1993),...,Wings of Desire (1987),"Wings of the Dove, The (1997)","Winter Guest, The (1997)","Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),"Young Poisoner's Handbook, The (1995)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User 1,5.0,1.0,4.0,,,,4.0,4.0,,,...,,,,1.0,,5.0,,5.0,1.0,
User 10,,,5.0,,,,,,,4.0,...,4.0,,,,,4.0,,,,
User 100,,,,5.0,,,,,,,...,,3.0,4.0,,,,,,,
User 101,,,,,,,,,3.0,,...,,,,,,,,,,4.0
User 102,,,,3.0,,,2.0,3.0,,3.0,...,3.0,,,,2.0,3.0,,,2.0,3.0


In [34]:
data = Dataset.load_from_df(movielens_df[['user_id', 'movie_title', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

### Training 

In [36]:
# define model
# model = NMF(n_factors=100)
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x147c9d910>

In [37]:
print('Train Size: ', trainset.n_items)
print('Model Output Size Size: ', model.qi.shape)

# Normalization
print(pd.DataFrame(model.qi).iloc[0].pow(2).sum())
model.qi /= np.linalg.norm(model.qi, ord=2, axis=1).reshape(-1, 1)
print(pd.DataFrame(model.qi).iloc[0].pow(2).sum())

Train Size:  588
Model Output Size Size:  (588, 100)
3.665076071419438
0.9999999999999998


### Recommendation via product

In [56]:
def get_vector_by_movie_title(movie_title: str, trained_model: SVD) -> np.array:
    """Returns the latent features of a movie in the form of a numpy array"""
    movie_row_idx = trained_model.trainset._raw2inner_id_items[movie_title]
    return trained_model.qi[movie_row_idx]

def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

# fetch vector (100,) of each movie
toy_story_vec = get_vector_by_movie_title('Wolf (1994)', model)
wizard_of_oz_vec = get_vector_by_movie_title('Wizard of Oz, The (1939)', model)
similarity_score = cosine_distance(toy_story_vec, wizard_of_oz_vec)
similarity_score

0.9691716604226916

In [57]:
# Predict a score between any combination of user and a product
a_user = "User 196"
a_product = "Toy Story (1995)"
model.predict(a_user, a_product)

Prediction(uid='User 196', iid='Toy Story (1995)', r_ui=None, est=3.653245527140773, details={'was_impossible': False})

## SVD From Scratch

In [87]:
def create_utility_matrix(data):

    userList = data.iloc[:, 0].tolist()
    itemList = data.iloc[:, 1].tolist()
    valueList = data.iloc[:, 2].tolist()

    users = list(set(data.iloc[:, 0]))
    items = list(set(data.iloc[:, 1]))
    users_index = {users[i]: i for i in range(len(users))}
    X = data.groupby(['user_id', 'movie_title'])['rating'].mean().unstack()
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items
    return X, users_index, items_index

matrix, users_index, items_index = create_utility_matrix(
    movielens_df[['user_id', 'movie_title', 'rating']]
)
print({k: users_index[k] for k in list(users_index)[:5]})
print({k: items_index[k] for k in list(items_index)[:5]})
matrix.head()

{'User 156': 0, 'User 925': 1, 'User 751': 2, 'User 479': 3, 'User 912': 4}
{'12 Angry Men (1957)': 0, '20,000 Leagues Under the Sea (1954)': 1, '2001: A Space Odyssey (1968)': 2, '3 Ninjas: High Noon At Mega Mountain (1998)': 3, '39 Steps, The (1935)': 4}


movie_title,12 Angry Men (1957),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Across the Sea of Time (1995),Addams Family Values (1993),...,Wings of Desire (1987),"Wings of the Dove, The (1997)","Winter Guest, The (1997)","Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),"Young Poisoner's Handbook, The (1995)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User 1,5.0,1.0,4.0,,,,4.0,4.0,,,...,,,,1.0,,5.0,,5.0,1.0,
User 10,,,5.0,,,,,,,4.0,...,4.0,,,,,4.0,,,,
User 100,,,,5.0,,,,,,,...,,3.0,4.0,,,,,,,
User 101,,,,,,,,,3.0,,...,,,,,,,,,,4.0
User 102,,,,3.0,,,2.0,3.0,,3.0,...,3.0,,,,2.0,3.0,,,2.0,3.0


In [88]:
def svd(train, k=10):

    # mask nan
    # (user_len, item_len)
    utilMat = np.array(train)
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)

    # (item_len, )
    item_means = np.mean(masked_arr, axis=0)

    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)

    # repeat n times
    # (user_len, item_len)
    x = np.tile(item_means, (utilMat.shape[0], 1))

    # remove the per item average from all entries.
    # nan entries will be essentially zero now
    utilMat = utilMat - x

    # U and V are user and item features
    # (user_len, item_len), (item_len, ), (item_len, item_len)
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)
    # (item_len, item_len)
    s = np.diag(s)

    # we take only the k most significant features
    # (user_len, k), (k, k),  (k, item_len)
    U = U[:, 0:k]
    s = s[0:k, 0:k]
    V = V[0:k, :]
    s_root = sqrtm(s)

    # (user_len, k), (k, item_len),  (user_len, item_len)
    Usk = np.dot(U, s_root)
    skV = np.dot(s_root, V)
    UsV = np.dot(Usk, skV)

    # (user_len, item_len)
    UsV = UsV + x
    return UsV

svdout = svd(matrix, k=8)
svdout.shape
pd.DataFrame(svdout)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,578,579,580,581,582,583,584,585,586,587
0,4.199733,2.929045,4.468040,4.366316,3.858861,3.214125,3.611128,3.270183,2.576072,3.614184,...,4.604247,3.752501,3.126952,3.662570,3.311182,4.767485,2.669598,3.990083,2.529174,2.964891
1,4.026639,3.266965,4.554342,4.245449,3.823679,3.185469,3.623345,3.532759,2.889405,3.562327,...,4.468479,3.577183,3.187155,3.917031,3.820195,4.326858,2.734663,3.851587,2.803854,3.159952
2,3.903553,3.221726,4.276884,4.220665,3.745596,3.118609,3.555333,3.546616,2.918380,3.580010,...,4.264758,3.585284,3.201162,3.703869,3.646839,4.091474,2.694848,3.829535,2.666061,3.042608
3,3.858623,3.213532,4.260816,4.231564,3.738413,3.089159,3.542118,3.569792,2.876407,3.599563,...,4.211298,3.592918,3.208329,3.731005,3.733454,3.947064,2.701265,3.806736,2.695573,3.062678
4,3.561750,3.077233,4.198610,3.924273,3.635111,2.987294,3.292013,3.265913,2.735502,2.669129,...,4.104190,3.599319,3.228967,3.792095,3.001013,3.372608,2.319125,3.685881,2.282373,2.478176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,3.649053,3.136354,4.156348,4.428685,3.779579,3.149519,3.548811,3.783714,2.884154,3.612817,...,4.144141,3.768066,3.288147,3.687041,3.696049,3.845026,2.595790,3.753150,2.649559,2.962217
939,3.894031,3.203647,4.402085,4.257788,3.777984,3.155898,3.583413,3.544050,2.873601,3.550735,...,4.288641,3.603784,3.175904,3.765709,3.727341,4.140450,2.679524,3.753377,2.735549,3.154864
940,3.975906,3.159790,4.319091,4.294317,3.781502,3.141206,3.587373,3.544522,2.829524,3.714841,...,4.316444,3.631978,3.154540,3.693199,3.759035,4.174863,2.713421,3.819180,2.709881,3.184018
941,3.877963,3.236000,4.317508,4.217914,3.738214,3.111209,3.532754,3.521595,2.920837,3.507734,...,4.253917,3.599638,3.216939,3.772984,3.664391,4.000890,2.687154,3.815453,2.678871,3.087809


## Inference

In [89]:
# predictions
pred = []
for _,row in movielens_df.iterrows():
    user = row['user_id']
    item = row['movie_title']
    u_index = users_index[user]

    if item in items_index:
        i_index = items_index[item]
        pred_rating = svdout[u_index, i_index]
    else:
        pred_rating = np.mean(svdout[u_index, :])
    pred.append(pred_rating)

# evaluation
def rmse(true, pred):
    # this will be used towards the end
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

print(rmse(movielens_df['rating'], pred))
movielens_df['pred'] = pred
movielens_df

1.0328812399892506


Unnamed: 0,user_id,movie_title,rating,pred
19668,User 492,D3: The Mighty Ducks (1996),4,3.841106
28275,User 465,Amadeus (1984),4,4.123113
53657,User 497,"Assignment, The (1997)",4,2.294833
83615,User 621,"Replacement Killers, The (1998)",4,3.424687
56032,User 746,"Little Rascals, The (1994)",3,3.303762
...,...,...,...,...
8991,User 405,Ace Ventura: Pet Detective (1994),5,3.647040
1640,User 316,Richard III (1995),4,3.678893
73569,User 890,It Happened One Night (1934),5,4.303121
21174,User 532,Sleeper (1973),5,3.510022
