# Unsupervised Learning Predict:
## Movie Recommendation Challenge

Load packages

In [37]:
import pandas as pd
import numpy as np

In [38]:
# Modeling packages
from surprise import Reader
from surprise import Dataset
from surprise import KNNWithMeans 
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
import heapq

### Load and explore datasets

In [39]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [40]:
train.shape

(10000038, 4)

In [41]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [42]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,Id,rating
0,1_2011,1.0
1,1_4144,1.0
2,1_5767,1.0
3,1_6711,1.0
4,1_7318,1.0


In [80]:
sample_submission.tail()

Unnamed: 0,Id,rating
5000014,162541_4079,1.0
5000015,162541_4467,1.0
5000016,162541_4980,1.0
5000017,162541_5689,1.0
5000018,162541_7153,1.0


In [43]:
tags = pd.read_csv('tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [44]:
genome_scores = pd.read_csv('genome_scores.csv')
genome_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [45]:
genome_tags = pd.read_csv('genome_tags.csv')
genome_tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [46]:
imdb_data = pd.read_csv('imdb_data.csv')
imdb_data.head()

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [47]:
links = pd.read_csv('links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [48]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [49]:
print(tags.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1093360 non-null  int64 
 1   movieId    1093360 non-null  int64 
 2   tag        1093344 non-null  object
 3   timestamp  1093360 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB
None


In [50]:
print(genome_scores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15584448 entries, 0 to 15584447
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 356.7 MB
None


In [51]:
print(genome_tags.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB
None


In [52]:
print(imdb_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        27278 non-null  int64  
 1   title_cast     17210 non-null  object 
 2   director       17404 non-null  object 
 3   runtime        15189 non-null  float64
 4   budget         7906 non-null   object 
 5   plot_keywords  16200 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.2+ MB
None


In [53]:
print(links.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62423 non-null  int64  
 1   imdbId   62423 non-null  int64  
 2   tmdbId   62316 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB
None


In [54]:
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None


### Build utility matrix, merging relevant dataframes

In [55]:
df_1 = train.merge(movies, on='movieId')
df_1.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,87388,57669,3.5,1237455297,In Bruges (2008),Comedy|Crime|Drama|Thriller
2,137050,57669,4.0,1425631854,In Bruges (2008),Comedy|Crime|Drama|Thriller


In [56]:
df_2 = df_1.merge(imdb_data, on='movieId')

In [57]:
df_2.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_cast,director,runtime,budget,plot_keywords
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,107.0,"$15,000,000",dwarf|bruges|irish|hitman
1,87388,57669,3.5,1237455297,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,107.0,"$15,000,000",dwarf|bruges|irish|hitman
2,137050,57669,4.0,1425631854,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,107.0,"$15,000,000",dwarf|bruges|irish|hitman


In [58]:
df_1.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [59]:
df_2.plot_keywords

0                              dwarf|bruges|irish|hitman
1                              dwarf|bruges|irish|hitman
2                              dwarf|bruges|irish|hitman
3                              dwarf|bruges|irish|hitman
4                              dwarf|bruges|irish|hitman
                               ...                      
9633026                                              NaN
9633027    janitor|income|housing problem|social problem
9633028                                              NaN
9633029                   soldier|italy|partisan|fascist
9633030           genius|artist|greek american|direction
Name: plot_keywords, Length: 9633031, dtype: object

In [60]:
train['rating'].describe()

count    1.000004e+07
mean     3.533395e+00
std      1.061124e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [61]:
df_2.isnull().sum()

userId                 0
movieId                0
rating                 0
timestamp              0
title                  0
genres                 0
title_cast       2604407
director         2602688
runtime          2653058
budget           3152276
plot_keywords    2610043
dtype: int64

In [62]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


### Build movie recommender function, rating independent movie_id by user_id

In [63]:
# Create small test dataframe to evaluate models
tests = train.copy()
tests.drop(['timestamp'], axis=1, inplace=True)
tests = tests.head(20000)

# Create training data
reader = Reader(rating_scale=(0.5, 5))
test_data = Dataset.load_from_df(tests[['userId', 'movieId', 'rating']], reader)

# Compute similarities between users using cosine distance
sim_options = {'name': 'cosine',
              'user_based': True}

# Evaluate model
user = KNNWithMeans(sim_options=sim_options)
cv = cross_validate(user, test_data, cv=5, measures=['RMSE'], verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1331  1.1285  1.1267  1.1143  1.1113  1.1228  0.0085  
Fit time          4.73    3.57    3.56    2.58    2.97    3.48    0.73    
Test time         0.05    0.04    0.04    0.03    0.04    0.04    0.01    


In [64]:
# Compute similarities between items using cosine distance
sim_options = {'name': 'cosine',
              'user_based': False}

# Fit KNNwithmeans algorithm to training set
item_based = KNNWithMeans(sim_options=sim_options)

# Evaluate model
cv = cross_validate(item_based, test_data, cv=5, measures=['RMSE'], verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0848  1.0837  1.0876  1.0892  1.0801  1.0851  0.0032  
Fit time          0.21    0.21    0.21    0.20    0.22    0.21    0.01    
Test time         0.03    0.03    0.02    0.03    0.02    0.03    0.00    


In [65]:
# Load Surprise dataframe
train_df = train.copy()
reader = Reader()

#Select data for model training
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)

In [66]:
# Split data
trainset, testset = train_test_split(data, test_size=0.2)

In [67]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000038 entries, 0 to 10000037
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 305.2 MB


In [68]:
# create instance for algorithm
svd = SVD()

# Fit to trainset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17bcc9e0160>

In [69]:
pred = svd.test(testset)
print(pred)

test_df = pd.DataFrame(pred)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [70]:
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [71]:
ratings_predictions = [svd.predict(row.userId, row.movieId) for _, row in test.iterrows()]
ratings_predictions

[Prediction(uid=1, iid=2011, r_ui=None, est=3.3336432227249486, details={'was_impossible': False}),
 Prediction(uid=1, iid=4144, r_ui=None, est=4.0439755930557775, details={'was_impossible': False}),
 Prediction(uid=1, iid=5767, r_ui=None, est=3.7849049130684884, details={'was_impossible': False}),
 Prediction(uid=1, iid=6711, r_ui=None, est=3.7392975065996343, details={'was_impossible': False}),
 Prediction(uid=1, iid=7318, r_ui=None, est=3.4587777683779644, details={'was_impossible': False}),
 Prediction(uid=1, iid=8405, r_ui=None, est=3.9892293089623756, details={'was_impossible': False}),
 Prediction(uid=1, iid=8786, r_ui=None, est=4.2787694014716005, details={'was_impossible': False}),
 Prediction(uid=2, iid=150, r_ui=None, est=3.5450845613072675, details={'was_impossible': False}),
 Prediction(uid=2, iid=356, r_ui=None, est=3.6806222096162178, details={'was_impossible': False}),
 Prediction(uid=2, iid=497, r_ui=None, est=3.4025332173570386, details={'was_impossible': False}),
 Pr

In [72]:
pred_df = pd.DataFrame(ratings_predictions)
pred_df

Unnamed: 0,uid,iid,r_ui,est,details
0,1,2011,,3.333643,{'was_impossible': False}
1,1,4144,,4.043976,{'was_impossible': False}
2,1,5767,,3.784905,{'was_impossible': False}
3,1,6711,,3.739298,{'was_impossible': False}
4,1,7318,,3.458778,{'was_impossible': False}
...,...,...,...,...,...
5000014,162541,4079,,3.381690,{'was_impossible': False}
5000015,162541,4467,,3.907179,{'was_impossible': False}
5000016,162541,4980,,2.554581,{'was_impossible': False}
5000017,162541,5689,,2.785085,{'was_impossible': False}


### Get a rating for every user-movie pair in test dataframe and generate csv submission

In [73]:
# Rename predictions to original names
pred_df = pred_df.rename(columns={'uid':'userId', 'iid':'movieId','est':'rating'})
pred_df.drop(['r_ui','details'], axis=1, inplace=True)

In [74]:
pred_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2011,3.333643
1,1,4144,4.043976
2,1,5767,3.784905
3,1,6711,3.739298
4,1,7318,3.458778


In [75]:
# Join userId and movieId into single Id column
pred_df['Id']=pred_df.apply(lambda x:'%s_%s' % (x['userId'], x['movieId']), axis=1)
pred_df['Id']=pred_df.apply(lambda x:'%s_%s' % (x['userId'], x['movieId']), axis=1)

In [76]:
pred_df.head()

Unnamed: 0,userId,movieId,rating,Id
0,1,2011,3.333643,1_2011
1,1,4144,4.043976,1_4144
2,1,5767,3.784905,1_5767
3,1,6711,3.739298,1_6711
4,1,7318,3.458778,1_7318


In [77]:
pred_df.drop(['userId', 'movieId'], inplace=True, axis=1)

In [78]:
pred_df = pred_df[['Id', 'rating']]
pred_df.head()

Unnamed: 0,Id,rating
0,1_2011,3.333643
1,1_4144,4.043976
2,1_5767,3.784905
3,1_6711,3.739298
4,1_7318,3.458778


In [82]:
pred_df['rating'] = pred_df['rating'].apply(lambda x: round(x * 2) / 2)

In [83]:
pred_df.head()

Unnamed: 0,Id,rating
0,1_2011,3.5
1,1_4144,4.0
2,1_5767,4.0
3,1_6711,3.5
4,1_7318,3.5


In [84]:
pred_df.to_csv('Shawn_Johnson_submission.csv', index=False)

In [86]:
pred_df.shape

(5000019, 2)