# MovieLens dataset exploration

## Prerequisites

In [10]:
import numpy as np
import pandas as pd

from IPython.display import display

import os

## Load data

Download the MovieLens 20M dataset from https://www.kaggle.com/grouplens/movielens-20m-dataset and place it in the `20M` subdirectory of the data directory.

Download the smaller MovieLens dataset from https://www.kaggle.com/ayushimishra2809/movielens-dataset and place it in the `100K` subdirectory of the data directory.

In [98]:
# Root folder that holds one sub-directory per dataset variant.
DATA_PATH = "../data"
# Sanity check: list what is actually present under the data folder.
print(os.listdir(path=DATA_PATH))

['100K', '20M']


Load movie titles and user ratings (from `movie.csv` and `rating.csv`) into a single DataFrame.

In [None]:
def load_movielens_20M(path):
    """
    Load MovieLens movie titles and user ratings and join them into one DataFrame.

    Modified from https://www.kaggle.com/kanncaa1/recommendation-systems-tutorial

    Despite its name, the function works for any directory that contains
    ``movie.csv`` and ``rating.csv`` with the required columns.

    Parameters
    ----------
    path : str
        Directory containing ``movie.csv`` (columns ``movieId``, ``title``)
        and ``rating.csv`` (columns ``userId``, ``movieId``, ``rating``).
        Any additional columns in either file are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per rating with the columns ``movieId``, ``title``,
        ``userId`` and ``rating``. The tables are inner-joined on
        ``movieId``, so movies without ratings (and ratings whose movie is
        not listed) do not appear in the result.

    Raises
    ------
    FileNotFoundError
        If either CSV file is missing from ``path``.
    ValueError
        If a required column is missing from one of the files.
    """
    movie_columns = ["movieId", "title"]
    rating_columns = ["userId", "movieId", "rating"]

    # usecols makes the parser skip columns we would discard anyway, which
    # saves time and memory on large files. read_csv returns the selected
    # columns in file order, so the trailing [...] pins the expected order.
    movies = pd.read_csv(
        os.path.join(path, "movie.csv"), usecols=movie_columns
    )[movie_columns]
    rating = pd.read_csv(
        os.path.join(path, "rating.csv"), usecols=rating_columns
    )[rating_columns]

    return pd.merge(movies, rating, on="movieId")

In [60]:
# Takes a long time to run because the 20M dataset is large; uncomment to reproduce the output below.
# data_combined_20m = load_movielens_20M(os.path.join(DATA_PATH, "20M"))
# print(f"Unique users {data_combined_20m['userId'].unique().shape[0]}")
# print(f"Unique movies {data_combined_20m['movieId'].unique().shape[0]}")
# display(data_combined_20m.head(5))

Unique users 138493
Unique movies 26744


Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),3,4.0
1,1,Toy Story (1995),6,5.0
2,1,Toy Story (1995),8,4.0
3,1,Toy Story (1995),10,4.0
4,1,Toy Story (1995),11,4.5


In [99]:
# Reuse the same loader on the smaller dataset stored under the "100K" folder.
data_combined_100k = load_movielens_20M(os.path.join(DATA_PATH, "100K"))

# Quick size check: number of distinct users and movies, then a peek at the rows.
print(f"Unique users {len(data_combined_100k['userId'].unique())}")
print(f"Unique movies {len(data_combined_100k['movieId'].unique())}")
display(data_combined_100k.head(n=5))

Unique users 668
Unique movies 10325


Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),2,5.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),8,5.0
3,1,Toy Story (1995),11,4.0
4,1,Toy Story (1995),14,4.0


## Data exploration

In [63]:
# Average rating of each movie; groupby sorts by title, so this shows the first five titles.
data_combined_100k.groupby('title')['rating'].agg('mean').head(5)

title
'71 (2014)                                 3.500
'Hellboy': The Seeds of Creation (2004)    3.000
'Round Midnight (1986)                     2.500
'Til There Was You (1997)                  4.000
'burbs, The (1989)                         3.125
Name: rating, dtype: float64

In [65]:
# Movies ranked by average rating, best first.
(
    data_combined_100k
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

title
Saddest Music in the World, The (2003)    5.0
Interstate 60 (2002)                      5.0
Gunfighter, The (1950)                    5.0
Heima (2007)                              5.0
Limelight (1952)                          5.0
Name: rating, dtype: float64

In [66]:
# Movies ranked by how many ratings they received, most-rated first.
(
    data_combined_100k
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

title
Pulp Fiction (1994)                 325
Forrest Gump (1994)                 311
Shawshank Redemption, The (1994)    308
Jurassic Park (1993)                294
Silence of the Lambs, The (1991)    290
Name: rating, dtype: int64

In [84]:
# Per-movie summary indexed by title:
#   rating        - mean of all ratings the movie received
#   rating_counts - number of ratings the movie received
# Named aggregation computes both columns in a single groupby pass instead of
# grouping twice and assigning a one-column DataFrame to a column.
ratings_mean_count = (
    data_combined_100k
    .groupby('title')['rating']
    .agg(rating='mean', rating_counts='count')
)
ratings_mean_count.head(5)

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),3.5,1
'Hellboy': The Seeds of Creation (2004),3.0,1
'Round Midnight (1986),2.5,1
'Til There Was You (1997),4.0,3
"'burbs, The (1989)",3.125,20


In [83]:
# Wide user x movie matrix of ratings; cells are NaN where a user has not rated a movie.
# pivot_table's default aggregation (mean) applies if a user/title pair occurs more than once.
user_movie_rating = pd.pivot_table(
    data_combined_100k,
    values='rating',
    index='userId',
    columns='title',
)
user_movie_rating.head(n=10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


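Correlation alone is not a good metric for similarity, because there can be a user who watched this movie and only one other movie and rated both of them 5. One way to mitigate this problem is to keep only those correlated movies that have more than 50 ratings. Note that this filter counts a movie's total ratings, not the number of users who rated both movies, so correlations based on very few common raters can still appear.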

In [97]:
# Movie whose "similar" movies we want to find.
movie_title = '...And Justice for All (1979)'
movie_watched = user_movie_rating[movie_title]

# Correlate every movie's rating column with the chosen movie's column.
# NOTE(review): the NumPy RuntimeWarnings this cell emits presumably come from
# movies with too few common raters; such movies get NaN and are dropped below.
similarity_with_other_movies = user_movie_rating.corrwith(movie_watched)

# Build the result as one non-mutating chain (no inplace=True):
# attach each movie's total rating count, then drop movies with no correlation.
correlations_df = (
    similarity_with_other_movies
    .to_frame(name='Correlation')
    .join(ratings_mean_count['rating_counts'])
    .dropna()
)
correlations_df.head(10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",-1.0,20
(500) Days of Summer (2009),-0.5,37
...And Justice for All (1979),1.0,10
10 Things I Hate About You (1999),0.774597,59
101 Dalmatians (One Hundred and One Dalmatians) (1961),-0.188982,37
12 Angry Men (1957),0.912871,63
"13th Warrior, The (1999)",0.126491,41
15 Minutes (2001),1.0,8
16 Blocks (2006),0.57735,14
1941 (1979),1.0,4


In [96]:
# Keep only movies with more than 50 ratings, then rank them by correlation.
(
    correlations_df
    .loc[lambda frame: frame['rating_counts'] > 50]
    .sort_values(by='Correlation', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Young Frankenstein (1974),1.0,72
American History X (1998),1.0,103
Chasing Amy (1997),1.0,68
"Client, The (1994)",1.0,65
Clueless (1995),1.0,98
"Green Mile, The (1999)",1.0,101
Leaving Las Vegas (1995),1.0,87
"Nutty Professor, The (1996)",1.0,87
Princess Mononoke (Mononoke-hime) (1997),1.0,52
RoboCop (1987),1.0,69
