### Movie Recommendation

Input Features: [userId, moveId] <br>
Target Feature: rating <br>
Objective: Predict how a user would rate a particular movie<br>
<h4>Movie Lens Overview: https://grouplens.org/datasets/movielens/</h4>
<h4>Dataset: http://files.grouplens.org/datasets/movielens/ml-latest-small.zip</h4>
<h4>F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. </h4>
    


In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import dump_svmlight_file

#### Download movie dataset

In [3]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2024-05-13 00:38:50--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-05-13 00:38:50 (4.73 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [4]:
!ls

ml-latest-small.zip  movie_data_preparation.ipynb


In [5]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [6]:
df_movies =pd.read_csv(r'ml-latest-small/movies.csv')

In [7]:
df_movies.shape

(9742, 3)

In [9]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
genere_list = df_movies['genres'].map(lambda x: x.split('|'))

In [13]:
genere_list[:5]

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
Name: genres, dtype: object

In [14]:
def get_unique_genres(genre_list):
    
    unique_list = set()
    
    for items in genre_list:
        for item in items:
            unique_list.add(item)
            
    return sorted(unique_list)

In [15]:
get_unique_genres(genere_list)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [16]:
genre = get_unique_genres(genere_list)

In [17]:
len(genre)

20

In [22]:
# Table of genre

df_genre = pd.DataFrame(index=range(df_movies.shape[0]),columns=genre)

In [23]:
df_genre.shape

(9742, 20)

In [24]:
df_genre.head(2)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,


In [26]:
df_genre = df_genre.fillna(0)
df_genre.head(2)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
genere_list[:5]

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
Name: genres, dtype: object

In [32]:
gen_enum = enumerate(genere_list)

In [39]:
for row,element in enumerate(genere_list):
    print(df_genre.loc[row,element])
    

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']
['Adventure', 'Children', 'Fantasy']
['Comedy', 'Romance']
['Comedy', 'Drama', 'Romance']
['Comedy']
['Action', 'Crime', 'Thriller']
['Comedy', 'Romance']
['Adventure', 'Children']
['Action']
['Action', 'Adventure', 'Thriller']
['Comedy', 'Drama', 'Romance']
['Comedy', 'Horror']
['Adventure', 'Animation', 'Children']
['Drama']
['Action', 'Adventure', 'Romance']
['Crime', 'Drama']
['Drama', 'Romance']
['Comedy']
['Comedy']
['Action', 'Comedy', 'Crime', 'Drama', 'Thriller']
['Comedy', 'Crime', 'Thriller']
['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller']
['Action', 'Crime', 'Thriller']
['Drama', 'Sci-Fi']
['Drama', 'Romance']
['Drama']
['Children', 'Drama']
['Drama', 'Romance']
['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi']
['Crime', 'Drama']
['Drama']
['Mystery', 'Sci-Fi', 'Thriller']
['Children', 'Drama']
['Crime', 'Drama']
['Children', 'Comedy']
['Comedy', 'Romance']
['Drama']
['Drama', 'War']
['Action', 'Crime', 'D

In [41]:
print(df_genre.loc[0,['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']])

Adventure    0
Animation    0
Children     0
Comedy       0
Fantasy      0
Name: 0, dtype: int64


In [42]:
print(df_genre.loc[1,['Adventure', 'Children', 'Fantasy']])

Adventure    0
Children     0
Fantasy      0
Name: 1, dtype: int64


In [44]:
for row, element in enumerate(genere_list):
    df_genre.loc[row,element]=1

In [45]:
df_genre.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
df_genre[df_genre['(no genres listed)']>0].head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8684,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8687,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8782,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8836,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
df_movies = df_movies.join(df_genre)

In [49]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
df_movies.to_csv(r'ml-latest-small/movies_genre.csv')

In [52]:
df_ratings = pd.read_csv(r'ml-latest-small/ratings.csv')

In [53]:
df_ratings.shape

(100836, 4)

In [55]:
df_ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [56]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [61]:
df_ratings['userId'].unique().shape[0]

610

In [62]:
df_ratings['movieId'].unique().shape[0]

9724

In [63]:
df_ratings.drop(columns=['timestamp'],inplace=True,axis=1)

In [64]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [71]:
df_movie_ratings = df_ratings.merge(df_movies, on='movieId')

In [74]:
df_movie_ratings.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [75]:
df_movie_ratings.tail(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100835,610,163981,3.5,31 (2016),Horror,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
