# Loading the data given for the assignment

In [3]:
!wget "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

--2023-12-03 14:12:35--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-12-03 14:12:35 (11.2 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [4]:
!unzip "ml-100k.zip"

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


# Imports needed for the notebook

In [313]:
# Operational
import os 

# Data manipulation
import numpy as np
import pandas as pd

# Surprise lib
## Model
from surprise import SVD
## Loading the data
from surprise import Reader, Dataset
## Splitting the data
from surprise.model_selection import train_test_split
## Calculating the metrics
from surprise import accuracy
## Saving the model
from surprise import dump

# Visualizations
import matplotlib.pyplot as plt

# Manual Seeding

In [314]:
# Fix NumPy's global random generator so stochastic steps are repeatable
np.random.seed(seed=42)

# Data reading and preparing

In [216]:
# Raw ratings file: tab-separated with no header row, so column names are supplied here
data = pd.read_csv(
    "ml-100k/u.data",
    sep='\t',
    header=None,
    names=["user_id", "movie_id", "rating", "time"],
)
data

Unnamed: 0,user_id,movie_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [217]:
# Movie metadata: pipe-separated, latin-1 encoded, no header row.
# Only the first five fields of each row are read and named.
mov = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    header=None,
    encoding='latin-1',
    usecols=range(5),
    names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL'],
)

In [218]:
# Attach the movie title to every rating row (left join keeps all ratings).
# Guarded so that re-running the cell is a no-op: merging a second time would
# produce suffixed duplicates (movie_title_x / movie_title_y) and break the
# later data[['user_id', 'movie_title', 'rating']] selections.
if 'movie_title' not in data.columns:
    data = data.merge(mov[['movie_id', 'movie_title']], on='movie_id', how='left')

In [219]:
# Display the ratings frame after the title merge as the cell output
data

Unnamed: 0,user_id,movie_id,rating,time,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)
...,...,...,...,...,...
99995,880,476,3,880175444,"First Wives Club, The (1996)"
99996,716,204,5,879795543,Back to the Future (1985)
99997,276,1090,1,874795795,Sliver (1993)
99998,13,225,2,882399156,101 Dalmatians (1996)


In [315]:
# Human-readable timestamps for plotting: 'time' is interpreted as seconds since the Unix epoch.
# NOTE(review): the new column is named 'data' — possibly a typo for 'date'; confirm before renaming.
data['data'] = pd.to_datetime(arg=data['time'], unit='s')

# Saving the parts of the data

In [257]:
# Write the (user, title, rating) triples to a tab-separated file, without the index column
data.loc[:, ['user_id', 'movie_title', 'rating']].to_csv(
    path_or_buf='full_data.csv',
    index=False,
    sep='\t',
)


In [261]:
# Positional hold-out split: the first `train_set_size` fraction of the rows
# (in their current order, no shuffling) is the train part, the rest is the test part.
# NOTE(review): this split is independent of the random Surprise
# train_test_split used to fit/evaluate the model, so the exported CSVs do not
# match the model's train/test partition — confirm this is intended.
train_set_size = 0.8
split_at = int(train_set_size * len(data))  # single source of truth for the boundary
ratings = data[['user_id', 'movie_title', 'rating']]
train_set = ratings.iloc[:split_at, :]
test_set = ratings.iloc[split_at:, :]

In [280]:
# Export both parts of the positional split as tab-separated files without the index column
for frame, path in ((train_set, 'train_data.csv'), (test_set, 'test_data.csv')):
    frame.to_csv(path, sep='\t', index=False)

In [286]:
# Display the held-out part of the positional split as the cell output
test_set

Unnamed: 0,user_id,movie_title,rating
80000,863,Legal Deceit (1997),4
80001,761,Ed (1996),1
80002,863,Murder at 1600 (1997),1
80003,828,Persuasion (1995),2
80004,889,Cool Hand Luke (1967),4
...,...,...,...
99995,880,"First Wives Club, The (1996)",3
99996,716,Back to the Future (1985),5
99997,276,Sliver (1993),1
99998,13,101 Dalmatians (1996),2


# Prepare data for Surprise models

In [133]:
# Ratings are on a 1-5 scale; only the (user, item, rating) columns are handed
# to Surprise, with the movie title acting as the item id.
reader = Reader(rating_scale=(1, 5))
sup_data = Dataset.load_from_df(
    df=data.loc[:, ['user_id', 'movie_title', 'rating']],
    reader=reader,
)
# Random 80/20 split with a fixed seed
train_data, test_data = train_test_split(data=sup_data, test_size=0.2, random_state=42)

In [318]:
# SVD recommender with explicit hyper-parameters and a fixed random_state
model = SVD(
    n_factors=200,
    n_epochs=40,
    lr_all=0.005,
    reg_all=0.02,
    init_std_dev=0.05,
    random_state=42,
)

In [319]:
# Train the model on the Surprise training split
model.fit(trainset=train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c4e8fa5a110>

In [320]:
# Predict every (user, item) pair of the held-out split, without per-prediction printing
preds = model.test(testset=test_data, verbose=False)

In [321]:
# Root mean squared error of the held-out predictions
accuracy.rmse(predictions=preds)

RMSE: 0.9229


0.9229315278794352

In [322]:
# Mean absolute error of the held-out predictions
accuracy.mae(predictions=preds)

MAE:  0.7243


0.7242781070399784

In [323]:
# Mean squared error of the held-out predictions
accuracy.mse(predictions=preds)

MSE: 0.8518


0.8518026051538687

# Prediction function

In [290]:
def prediction_algo(uid, model, num=5, trainset=None, verbose=True, exclude_rated=False):
    """Return the raw ids of the `num` items with the highest estimated rating for a user.

    Every item known to `trainset` is scored with `model.predict` and the items
    are ranked by the prediction's `est` attribute, highest first.

    Args:
        uid: Raw user id passed straight to `model.predict`.
        model: Object exposing `predict(uid, iid, verbose=...)` whose result has
            `iid` and `est` attributes.
        num: Number of item ids to return.
        trainset: Object exposing `all_items()` and `to_raw_iid()` (plus
            `to_inner_uid()` and `ur` when `exclude_rated` is set). When None,
            the notebook-level `train_data` variable is used, which is what the
            function relied on before this parameter existed.
        verbose: When True (the previous behaviour), print the first `num`
            predictions in iteration order (not the top-ranked ones).
        exclude_rated: When True, skip items the user already rated in
            `trainset`. Defaults to False, so already-rated items can appear in
            the result exactly as before.

    Returns:
        A list of at most `num` raw item ids, best estimate first.
    """
    if trainset is None:
        # Backward-compatible fallback to the notebook global.
        trainset = train_data

    rated = set()
    if exclude_rated:
        try:
            inner_uid = trainset.to_inner_uid(uid)
        except ValueError:
            # The user is not part of the trainset: nothing to exclude.
            pass
        else:
            rated = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}

    predictions = [
        model.predict(uid, trainset.to_raw_iid(inner_iid), verbose=False)
        for inner_iid in trainset.all_items()
        if inner_iid not in rated
    ]
    if verbose:
        print(predictions[:num])

    ranked = sorted(predictions, key=lambda pred: pred.est, reverse=True)
    return [pred.iid for pred in ranked[:num]]

In [324]:
# Top recommendations (default count) for raw user id 900
prediction_algo(uid=900, model=model)

[Prediction(uid=900, iid="Muriel's Wedding (1994)", r_ui=None, est=2.4964039137549716, details={'was_impossible': False}), Prediction(uid=900, iid='American in Paris, An (1951)', r_ui=None, est=2.048820590551575, details={'was_impossible': False}), Prediction(uid=900, iid='Highlander (1986)', r_ui=None, est=2.509882638001912, details={'was_impossible': False}), Prediction(uid=900, iid="She's So Lovely (1997)", r_ui=None, est=1.735422189917371, details={'was_impossible': False}), Prediction(uid=900, iid='GoodFellas (1990)', r_ui=None, est=2.9025719669474452, details={'was_impossible': False})]


['Casablanca (1942)',
 'Usual Suspects, The (1995)',
 "Schindler's List (1993)",
 'Fargo (1996)',
 'Wild Bunch, The (1969)']

# Saving the model

In [300]:
# Persist the fitted algorithm; no predictions are passed, only the model.
# expanduser leaves this path unchanged because it does not start with '~'.
file_name = os.path.expanduser('./dump_file')
dump.dump(file_name=file_name, algo=model)