In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt  
from pathlib import Path
import re
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from pathlib import Path
from surprise.model_selection import cross_validate

In [2]:
# Relative path two directory levels above the current working directory.
PATH = Path('..', '..')

In [3]:
# Show what sits directly under PATH.
[entry for entry in PATH.iterdir()]

[WindowsPath('../../.git'),
 WindowsPath('../../.vscode'),
 WindowsPath('../../code'),
 WindowsPath('../../data'),
 WindowsPath('../../figures'),
 WindowsPath('../../LICENSE'),
 WindowsPath('../../pip_requirements.txt'),
 WindowsPath('../../products'),
 WindowsPath('../../README.md'),
 WindowsPath('../../requirements.txt'),
 WindowsPath('../../wandb')]

In [4]:
# Project sub-directories, all built relative to PATH.
DATA = PATH.joinpath('data')
CODE = PATH.joinpath('code')
FIGURES = PATH.joinpath('figures')
PRODUCTS = PATH.joinpath('products')
# Folder holding the CSV files this notebook reads and writes.
WORKING_DATA = DATA.joinpath('working_data')

In [5]:
# Load the three input tables, in this order: ratings, links, metadata.
ratings, links, metadata = (
    pd.read_csv(WORKING_DATA / csv_name)
    for csv_name in ('ratings_small.csv', 'links_small.csv', 'movies_metadata.csv')
)

In [6]:
# Preview the first five rating rows.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Print the column dtypes and non-null counts of the links table.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9125 non-null   int64  
 1   imdbId   9125 non-null   int64  
 2   tmdbId   9112 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 214.0 KB


In [8]:
# Preview the first five rows of the id mapping table.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
# Preview the first five rows of the movie metadata.
metadata.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [10]:
# Give the metadata id column the same name as the `tmdbId` column in `links`,
# so the two tables share a join key.
metadata = metadata.rename(columns={'id': 'tmdbId'})

In [11]:
# Percentage of missing values in each column of `ratings`.
ratings.isna().sum() * 100 / len(ratings)

userId       0.0
movieId      0.0
rating       0.0
timestamp    0.0
dtype: float64

In [12]:
# Percentage of missing values in each column of `links`.
links.isna().sum() * 100 / len(links)

movieId    0.000000
imdbId     0.000000
tmdbId     0.142466
dtype: float64

In [13]:
# Percentage of missing values in each column of `metadata`.
metadata.isna().sum() * 100 / len(metadata)

adult                     0.000000
belongs_to_collection    90.115691
budget                    0.000000
genres                    0.000000
homepage                 82.883913
tmdbId                    0.000000
imdb_id                   0.037391
original_language         0.024194
original_title            0.000000
overview                  2.098271
popularity                0.010997
poster_path               0.848986
production_companies      0.006598
production_countries      0.006598
release_date              0.191352
revenue                   0.013197
runtime                   0.578454
spoken_languages          0.013197
status                    0.191352
tagline                  55.104914
title                     0.013197
video                     0.013197
vote_average              0.013197
vote_count                0.013197
dtype: float64

In [14]:
# Build a tmdbId -> original_title lookup table from the movie metadata.
# Some rows carry a hyphenated, date-like string (YYYY-MM-DD) in the id column;
# those values cannot be cast to a number and interfere with the merge against
# `links`, so the affected rows are dropped.
tmdb_id_text = metadata['tmdbId'].astype(str)
# Flag every id containing a hyphen-joined run of word characters. The check is
# applied to the whole column at once; casting to str first keeps it from
# failing on a non-string id.
is_hyphenated = tmdb_id_text.str.contains(r'\w+(?:-\w+)+', regex=True)

# The rejected ids, kept so they can be inspected.
drop_ids = metadata.loc[is_hyphenated, 'tmdbId'].tolist()

# Select rows and columns in one .loc call and create the float column with
# .assign, so the result is an independent frame rather than a slice of
# `metadata` (avoids pandas' SettingWithCopyWarning). The float dtype matches
# the float64 `tmdbId` column of `links`.
id_with_title = (
    metadata.loc[~is_hyphenated, ['tmdbId', 'original_title']]
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(float))
)


In [15]:
# Preview the cleaned id/title lookup table.
id_with_title.head(n=5)

Unnamed: 0,tmdbId,original_title
0,862.0,Toy Story
1,8844.0,Jumanji
2,15602.0,Grumpier Old Men
3,31357.0,Waiting to Exhale
4,11862.0,Father of the Bride Part II


In [16]:
# Attach the movie title to every link row. With no explicit key, pandas joins
# on the columns the two frames have in common; rows without a match are
# dropped (inner join).
links = pd.merge(links, id_with_title, how='inner')
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId,original_title
0,1,114709,862.0,Toy Story
1,2,113497,8844.0,Jumanji
2,3,113228,15602.0,Grumpier Old Men
3,4,114885,31357.0,Waiting to Exhale
4,5,113041,11862.0,Father of the Bride Part II


In [17]:
# Enrich every rating with the external ids and title of its movie. The join
# uses the columns shared by both frames and keeps matching rows only.
ratings_with_links = pd.merge(ratings, links, how='inner')
ratings_with_links.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,original_title
0,1,31,2.5,1260759144,112792,9909.0,Dangerous Minds
1,7,31,3.0,851868750,112792,9909.0,Dangerous Minds
2,31,31,4.0,1273541953,112792,9909.0,Dangerous Minds
3,32,31,4.0,834828440,112792,9909.0,Dangerous Minds
4,36,31,3.0,847057202,112792,9909.0,Dangerous Minds


In [18]:
# Order the merged ratings by user id.
ratings_with_links = ratings_with_links.sort_values(by=['userId'])
ratings_with_links.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,original_title
0,1,31,2.5,1260759144,112792,9909.0,Dangerous Minds
849,1,3671,3.0,1260759117,71230,11072.0,Blazing Saddles
806,1,2968,1.0,1260759200,81633,36819.0,Time Bandits
759,1,2455,2.5,1260759113,91064,9426.0,The Fly
706,1,2294,2.0,1260759108,120587,8916.0,Antz


In [19]:
# Number of rows in the raw ratings table.
ratings.shape[0]

100004

In [20]:
# Number of rows left after merging the ratings with the links.
ratings_with_links.shape[0]

99850

In [21]:
# Net number of rating rows lost between the raw table and the merged table.
n_raw = ratings.shape[0]
n_merged = ratings_with_links.shape[0]
loss = n_raw - n_merged
print(f'lost {loss} ratings')

lost 154 ratings


In [22]:
# The same loss expressed as a percentage of the raw ratings.
loss * 100 / len(ratings)

0.15399384024639015

In [23]:
# Write the merged ratings to the working-data folder without the index column.
ratings_with_links.to_csv(
    path_or_buf=WORKING_DATA / 'ratings_with_links.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)

In [24]:
# Write the title-enriched links to the working-data folder without the index column.
links.to_csv(
    path_or_buf=WORKING_DATA / 'links_with_title.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)

# Sanity-check the ratings data with recommender algorithms (SVD and SVD++)

In [28]:
# Take the rating bounds from the data instead of hard-coding (1, 5). Surprise
# clips its predictions to `rating_scale`, so the scale has to span every
# observed rating; the table holds half-star values such as 2.5.
rating_bounds = (float(ratings['rating'].min()), float(ratings['rating'].max()))
reader = Reader(rating_scale=rating_bounds)
# load_from_df expects exactly three columns in the order user, item, rating.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)

In [31]:
# 5-fold cross-validation of SVD on the ratings, reporting RMSE and MAE per fold.
algo = SVD()
cross_validate(algo=algo, data=data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8970  0.8999  0.8923  0.8967  0.9006  0.8973  0.0029  
MAE (testset)     0.6883  0.6897  0.6877  0.6916  0.6944  0.6903  0.0024  
Fit time          5.93    5.68    5.40    5.65    6.06    5.74    0.23    
Test time         0.16    0.22    0.25    0.16    0.20    0.20    0.04    


{'test_rmse': array([0.89698602, 0.89987477, 0.89227044, 0.89667965, 0.9005802 ]),
 'test_mae': array([0.68825351, 0.68970028, 0.68765273, 0.69156278, 0.6943923 ]),
 'fit_time': (5.928911209106445,
  5.67699408531189,
  5.396001815795898,
  5.647959470748901,
  6.060058116912842),
 'test_time': (0.15700149536132812,
  0.2219679355621338,
  0.25299715995788574,
  0.1560344696044922,
  0.20095014572143555)}

In [30]:
# 5-fold cross-validation of SVD++ on the ratings, reporting RMSE and MAE per fold.
# NOTE(review): the recorded TypeError output looks like what would be raised if
# the SVDpp class itself (no parentheses) were passed instead of an instance,
# and this cell's execution count is out of order; re-run on a fresh kernel to
# refresh the output and confirm.
algo = SVDpp()
cross_validate(algo=algo, data=data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

TypeError: fit() takes exactly 2 positional arguments (1 given)