### Movie Recommendation

Input Features: [userId, moveId] <br>
Target Feature: rating <br>
Objective: Predict how a user would rate a particular movie<br>
<h4>Movie Lens Overview: https://grouplens.org/datasets/movielens/</h4>
<h4>Dataset: http://files.grouplens.org/datasets/movielens/ml-latest-small.zip</h4>
<h4>F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. </h4>
    


In [59]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import dump_svmlight_file

import sagemaker.amazon.common as smac

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### Download movie dataset

In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2024-05-13 12:27:30--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-05-13 12:27:31 (2.69 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [4]:
!ls

ml-latest-small.zip  movie_data_preparation.ipynb


In [5]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
df_movies =pd.read_csv(r'ml-latest-small/movies.csv')

In [4]:
df_movies.shape

(9742, 3)

In [5]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
genere_list = df_movies['genres'].map(lambda x: x.split('|'))

In [7]:
genere_list[:5]

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
Name: genres, dtype: object

In [8]:
def get_unique_genres(genre_list):
    
    unique_list = set()
    
    for items in genre_list:
        for item in items:
            unique_list.add(item)
            
    return sorted(unique_list)

In [9]:
get_unique_genres(genere_list)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [10]:
genre = get_unique_genres(genere_list)

In [11]:
len(genre)

20

In [12]:
# Table of genre

df_genre = pd.DataFrame(index=range(df_movies.shape[0]),columns=genre)

In [13]:
df_genre.shape

(9742, 20)

In [14]:
df_genre.head(2)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,


In [15]:
df_genre = df_genre.fillna(0)
df_genre.head(2)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
genere_list[:5]

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
Name: genres, dtype: object

In [17]:
gen_enum = enumerate(genere_list)

In [24]:
for row,element in enumerate(genere_list):
    print(row,element)
    if row > 5:
        break
    

0 ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']
1 ['Adventure', 'Children', 'Fantasy']
2 ['Comedy', 'Romance']
3 ['Comedy', 'Drama', 'Romance']
4 ['Comedy']
5 ['Action', 'Crime', 'Thriller']
6 ['Comedy', 'Romance']


In [41]:
print(df_genre.loc[0,['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']])

Adventure    0
Animation    0
Children     0
Comedy       0
Fantasy      0
Name: 0, dtype: int64


In [42]:
print(df_genre.loc[1,['Adventure', 'Children', 'Fantasy']])

Adventure    0
Children     0
Fantasy      0
Name: 1, dtype: int64


In [25]:
for row, element in enumerate(genere_list):
    df_genre.loc[row,element]=1

In [26]:
df_genre.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
df_genre[df_genre['(no genres listed)']>0].head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8684,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8687,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8782,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8836,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# merge movies with genre hotcoding
df_movies = df_movies.join(df_genre)

In [28]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
df_movies.to_csv(r'ml-latest-small/movies_genre.csv')

In [29]:
df_ratings = pd.read_csv(r'ml-latest-small/ratings.csv')

In [30]:
df_ratings.shape

(100836, 4)

In [31]:
df_ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [32]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [61]:
df_ratings['userId'].unique().shape[0]

610

In [62]:
df_ratings['movieId'].unique().shape[0]

9724

In [33]:
df_ratings.drop(columns=['timestamp'],inplace=True,axis=1)

In [64]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [34]:
# Merge rating and movies description
df_movie_ratings = df_ratings.merge(df_movies, on='movieId')

In [35]:
df_movie_ratings.shape

(100836, 25)

In [74]:
df_movie_ratings.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [75]:
df_movie_ratings.tail(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100835,610,163981,3.5,31 (2016),Horror,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


#### Training and Validation dataset

In [36]:
np.random.seed(5)
l = list(df_movie_ratings.index)
np.random.shuffle(l)
df=df_movie_ratings.iloc[l]

In [81]:
df_movie_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
92163,298,42011,1.0,Fun with Dick and Jane (2005),Comedy|Crime,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71427,28,428,2.5,"Bronx Tale, A (1993)",Drama,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
993,372,110,4.0,Braveheart (1995),Action|Drama|War,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6514,303,1097,5.0,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
6011,19,1073,4.0,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [37]:
rows = df_movie_ratings.shape[0]
train = int(rows*0.7)
validation = rows-train
rows,train,validation

(100836, 70585, 30251)

In [84]:
70585+30251

100836

In [38]:
# SageMaker Factorization Machine expects all columns to be of float32
# Let's get the target variable as float32

y = df_movie_ratings['rating'].astype(np.float32).ravel()

In [39]:
len(y)

100836

In [40]:
y

array([4. , 4. , 4.5, ..., 3. , 3.5, 3.5], dtype=float32)

In [42]:
# we create two different training datasets
# Training 1: rating, userid, movieid
# Training 2: rating, userid, movieid and movie genre attributes
columns_user_movie = ['userId', 'movieId']
columns_all = columns_user_movie + genre

In [43]:
columns_user_movie

['userId', 'movieId']

In [44]:
columns_all

['userId',
 'movieId',
 '(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [45]:
# storage a copy of userid, movieid and rating
# Train test
df[['rating','userId','movieId']][:train].to_csv(r'ml-latest-small/user_movie_train.csv',index=False)
df[['rating','userId','movieId']][train:].to_csv(r'ml-latest-small/user_movie_test.csv',index=False)

In [46]:
# One hot Encode
# Training 1: userid, movieid
# Training 2: userid,movieid and movie genre attributes

encoder = preprocessing.OneHotEncoder(dtype=np.float32)

In [47]:
df[columns_user_movie].head(2)

Unnamed: 0,userId,movieId
92163,298,42011
71427,28,428


In [48]:
X = encoder.fit_transform(df[columns_user_movie])

In [49]:
X

<100836x10334 sparse matrix of type '<class 'numpy.float32'>'
	with 201672 stored elements in Compressed Sparse Row format>

In [52]:
df_X = pd.DataFrame(X)

In [53]:
df_X

Unnamed: 0,0
0,"(0, 297)\t1.0\n (0, 6685)\t1.0"
1,"(0, 27)\t1.0\n (0, 982)\t1.0"
2,"(0, 371)\t1.0\n (0, 707)\t1.0"
3,"(0, 302)\t1.0\n (0, 1445)\t1.0"
4,"(0, 18)\t1.0\n (0, 1425)\t1.0"
...,...
100831,"(0, 599)\t1.0\n (0, 1694)\t1.0"
100832,"(0, 569)\t1.0\n (0, 1225)\t1.0"
100833,"(0, 314)\t1.0\n (0, 1294)\t1.0"
100834,"(0, 488)\t1.0\n (0, 2433)\t1.0"


In [55]:
X.shape

(100836, 10334)

In [63]:
# Create a spare matrix recordio file
def write_sparse_recordio_file(filename, x, y=None):
    with open(filename,'wb') as f:
        smac.write_spmatrix_to_sparse_tensor(f,x,y)

In [64]:
# Training recordIO file
write_sparse_recordio_file(r'ml-latest-small/user_movie_train.recordio',X[:train],y[:train])

In [65]:
# Test recordio file
write_sparse_recordio_file(r'ml-latest-small/user_movie_test.recordio',X[train:],y[train:])

In [66]:
# Create libSVM formatted file. Convenient text format
# Output is stored as rating, user_index:value, movie_index:value
#  For example: 5.0 314:1 215:1  (user with index 314 and movie with index 215 in the one hot encoded table has a rating of 5 )

# This file can be used for two purposes: 
#   1. directly traing with libFM binary in local mode
#   2. It is easy to run inference with this format against sagemaker cloud as we need to
#      send only sparse input to sagemaker prediction service

# 
# Store in libSVM format as well for directly testing with libFM

dump_svmlight_file(X[:train],y[:train],r'ml-latest-small/user_movie_train.svm')
dump_svmlight_file(X[train:],y[train:],r'ml-latest-small/user_movie_test.svm')

In [74]:
X[:train].get_shape

<bound method spmatrix.get_shape of <70585x10334 sparse matrix of type '<class 'numpy.float32'>'
	with 141170 stored elements in Compressed Sparse Row format>>

In [77]:
70585*2 #number of elements as two columns

141170

In [78]:
X[train:].get_shape

<bound method spmatrix.get_shape of <30251x10334 sparse matrix of type '<class 'numpy.float32'>'
	with 60502 stored elements in Compressed Sparse Row format>>

In [79]:
30251*2 #number of elements as two columns

60502

In [80]:
# Create two lookup files
# File 1: Categorical Movie ID and corresponding Movie Index in One Hot Encoded Table
# File 2: Categorical User ID and corresponding User Index in One Hot Encoded Table

# This is useful for predicting how a particular user would rate all the movies
# or all users rating one particular movie

In [81]:
df.movieId.unique()

array([ 42011,    428,    110, ..., 191005, 117572,   4434])

In [83]:
df.userId.unique()[:5]

array([298,  28, 372, 303,  19])

In [125]:
list_of_movies = df.movieId.unique()

#user 1 and all movies
df_user_movie = pd.DataFrame({'userId': np.full(len(list_of_movies),1), 'movieId':list_of_movies})

In [126]:
df_user_movie

Unnamed: 0,userId,movieId
0,1,42011
1,1,428
2,1,110
3,1,1097
4,1,1073
...,...,...
9719,1,5028
9720,1,8713
9721,1,191005
9722,1,117572


In [127]:
df_user_movie[columns_user_movie]

Unnamed: 0,userId,movieId
0,1,42011
1,1,428
2,1,110
3,1,1097
4,1,1073
...,...,...
9719,1,5028
9720,1,8713
9721,1,191005
9722,1,117572


In [88]:
len(list_of_movies)

9724

In [89]:
list_of_movies

array([ 42011,    428,    110, ..., 191005, 117572,   4434])

In [102]:
columns_user_movie

['userId', 'movieId']

In [104]:
# transform to one hot encoding (with existing encoder)
X = encoder.transform(df_user_movie[columns_user_movie])
#X = encoder.transform(df_user_movie)

In [106]:
X

<9724x10334 sparse matrix of type '<class 'numpy.float32'>'
	with 19448 stored elements in Compressed Sparse Row format>

In [108]:
# Store movieId and corresponding one hot encoded entries
dump_svmlight_file(X,list_of_movies,r'ml-latest-small/one_hot_enc_movies.svm')

In [103]:
df_user_movie

Unnamed: 0,userId,movieId
0,1,42011
1,1,428
2,1,110
3,1,1097
4,1,1073
...,...,...
9719,1,5028
9720,1,8713
9721,1,191005
9722,1,117572


In [109]:
# Now create
# file : CAtegorical UserID and corresponding userIndex in one Hot Encoded Table

In [115]:
list_of_users = df.userId.unique()

In [116]:
list_of_users.shape

(610,)

In [117]:
list_of_users[:10]

array([298,  28, 372, 303,  19, 487, 332, 165,  89, 288])

In [118]:
list_of_movies[:10]

array([42011,   428,   110,  1097,  1073, 81845,  2542,  5485,  3101,
       92259])

In [119]:
# All users and movie 1
df_user_movie = pd.DataFrame({'userId': list_of_users, 'movieId':np.full(len(list_of_users),1)})

In [120]:
df_user_movie.shape

(610, 2)

In [121]:
df_user_movie

Unnamed: 0,userId,movieId
0,298,1
1,28,1
2,372,1
3,303,1
4,19,1
...,...,...
605,394,1
606,439,1
607,120,1
608,549,1


In [122]:
# Transform to one hot encoding (with existing encoder)
X = encoder.transform(df_user_movie[columns_user_movie])

In [123]:
# store movieId and corresponding one hot encoding entries
dump_svmlight_file(X, list_of_users,r'ml-latest-small/one_hot_enc_users.svm')