## Import libraries

In [1]:
import numpy as np
import pandas as pd

## Get the dataset

In [13]:
# Load the raw user ratings. The file is tab-separated and is read without a
# header row, so the column names are supplied explicitly.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=columns,
)

In [14]:
# Preview the first five rating rows
user_ratings.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
# Count the distinct users and the distinct movies that appear in the ratings
n_unique_users = user_ratings['user_id'].nunique()
n_unique_movies = user_ratings['item_id'].nunique()

print("Total users in the dataset: ", n_unique_users)
print("Total movies in the dataset: ", n_unique_movies)

Total users in the dataset:  943
Total movies in the dataset:  1682


In [18]:
# Get the movies dataset
#
# u.item is pipe-delimited and has no header row. Only the first two fields
# (item id and title) are needed, so they are selected and named at read time
# instead of parsing every column and subsetting afterwards.
#
# The file contains bytes that are not valid UTF-8 (accented titles such as
# "Á köldum klaka"). NOTE(review): the stock MovieLens 100K u.item is
# ISO-8859-1 encoded -- confirm the local copy matches. Decoding as Latin-1
# avoids a UnicodeDecodeError / garbled titles.
columns = ['item_id', 'title']
movie_titles = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    header=None,
    names=columns,
    usecols=[0, 1],
    encoding="latin-1",
)

In [19]:
# Preview the first five (item_id, title) rows
movie_titles.head(n=5)

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [20]:
# Attach a movie title to every rating by joining on the shared item_id key
user_ratings_movies = user_ratings.merge(movie_titles, how='inner', on='item_id')

# Display the combined frame (the repr of a long frame is truncated by pandas)
user_ratings_movies

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


## Exploratory Data Analysis

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'white' style for the plots
sns.set_style(style='white')

In [39]:
# Get the mean rating for each movie title.
#
# The 'rating' column is selected *before* aggregating so that only that
# column is averaged. Calling .mean() on the whole grouped frame would also
# average user_id / item_id / timestamp just to throw them away, and on
# pandas >= 2.0 it raises if the frame ever gains a non-numeric column.
mean_ratings = user_ratings_movies.groupby('title')['rating'].mean()
mean_ratings.head()

title
'Til There Was You (1997)    2.333333
1-900 (1994)                 2.600000
101 Dalmatians (1996)        2.908257
12 Angry Men (1957)          4.344000
187 (1997)                   3.024390
Name: rating, dtype: float64

In [40]:
# Now, we'll see how many reviews each movie title has received.
#
# Only 'user_id' is counted (selected before aggregating) rather than counting
# every column and discarding all but one. The resulting Series keeps the name
# 'user_id' and is indexed by title.
#
# NOTE(review): the outputs above report 1682 distinct item_ids but this
# result has 1664 titles, so grouping by title presumably pools ratings of
# items that share a title -- confirm that is intended.
rating_counts = user_ratings_movies.groupby('title')['user_id'].count()
rating_counts

title
'Til There Was You (1997)                   9
1-900 (1994)                                5
101 Dalmatians (1996)                     109
12 Angry Men (1957)                       125
187 (1997)                                 41
                                         ... 
Young Guns II (1990)                       44
Young Poisoner's Handbook, The (1995)      41
Zeus and Roxanne (1997)                     6
\C1 k\F6ldum klaka (Cold Fever) (1994)      1
unknown                                     9
Name: user_id, Length: 1664, dtype: int64

In [62]:
# Combine the per-title mean rating and review count into one summary table.
# The three resulting columns (title, mean, count) are relabelled by position.
ratings = (
    mean_ratings
    .reset_index()
    .merge(rating_counts, on='title')
    .set_axis(['Movie Title', 'Mean Rating', 'Total Reviews'], axis=1)
)

# Most-reviewed movies first; ties are broken by the higher mean rating
ratings.sort_values(by=['Total Reviews', 'Mean Rating'], ascending=False)

Unnamed: 0,Movie Title,Mean Rating,Total Reviews
1398,Star Wars (1977),4.358491,583
334,Contact (1997),3.803536,509
498,Fargo (1996),4.155512,508
1234,Return of the Jedi (1983),4.007890,507
860,Liar Liar (1997),3.156701,485
...,...,...,...
1582,"Very Natural Thing, A (1974)",1.000000,1
1584,"Vie est belle, La (Life is Rosey) (1987)",1.000000,1
1610,Wend Kuuni (God's Gift) (1982),1.000000,1
1646,"Woman in Question, The (1950)",1.000000,1
