In [1]:
import sklearn
import numpy as np
import pandas as pd
import random
import math
import os




# Data Preprocessing
# Sampling
Since we are using 25M dataset, we are trying to reduce the dataset size keeping in mind the contraints of the computing device. We are using sampling method to take into only a certain portion of the dataset and save that sample dataset so that we do not need to perform sampling everytime running this program. 



In [2]:
if not os.path.isfile("./datasets/sampled_ratings.csv"):
    # Reading csv files using absolute path
    #We are not taking into consideration the 'timestamp' at which the movie was rated by the user.
    ratings = pd.read_csv("./datasets/ratings.csv",usecols=['userId','movieId','rating'])
    # Assuming people like the movies they rate 3.5 or higher
    ratings = ratings[ratings['rating'] >= 3.5]
    # Using only a subset of the dataset by sampling % of the dataset
    ratings = ratings.sample(frac=0.2)
    # Save the sampled dataset to local storage so that we do not need to perform sampling every single time which gives a randomly selected portion of the original dataset
    ratings.to_csv('./datasets/sampled_ratings.csv')
else:
    ratings = pd.read_csv("./datasets/sampled_ratings.csv",usecols=['userId','movieId','rating'])

In [None]:
# Checking for null values in ratings
ratings['rating'].isnull().any()

In [3]:
# Read movies dataset
movies = pd.read_csv("./datasets/movies.csv")


In [4]:
# Checking for movies with no genres listed
movies = movies.dropna(subset=['genres'])

ratings = ratings.dropna(subset=['rating'])

In [5]:
# summary of the dataset "ratings"
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3126026 entries, 0 to 3126025
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 95.4 MB


In [6]:
# get top 5 rows in the ratings dataset
ratings.head()

Unnamed: 0,userId,movieId,rating
0,40258,3949,5.0
1,9648,1246,4.0
2,135088,8970,5.0
3,95591,40583,4.0
4,154793,3978,4.0


In [7]:
# find the minimum rating value in the dataset
print("Minimum ratings ",ratings['rating'].min())
print("Maximum ratings ",ratings['rating'].max())
print("Average ratings ",ratings['rating'].mean())

Minimum ratings  3.5
Maximum ratings  5.0
Average ratings  4.199805439877979


In [8]:
movie_list = ratings['movieId'].unique()
print("No of unique movies: ", len(movie_list))

No of unique movies:  29950


In [9]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Extract year from the movie and add a year column to movies
movies['year'] = movies['title'].str.extract('.*\((.*)\).*', expand=True)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


# Filtering most-rated movies
The size of the dataset is decreased further by considering the movies that are most rated. We are considering 1000 most rated movies.

In [11]:
def most_rated_movies(ratings):
    most_rated = ratings['movieId'].value_counts()[:1000]
    #print("5 most-rated movies: \n", most_rated.head())
    #print(ratings.info)
    ratings = ratings.loc[ratings['movieId'].isin(most_rated)]
    return ratings

In [12]:
ratings = most_rated_movies(ratings)
print("\nTotal ratings: ", len(ratings))

users = ratings["userId"].unique().tolist()
print("Total users: ", len(users))

unique_movies = ratings["movieId"].unique().tolist()
print("Total movies: ",len(unique_movies))


Total ratings:  353257
Total users:  98043
Total movies:  724


In [13]:
movies = movies.loc[movies['movieId'].isin(unique_movies)]
movies

Unnamed: 0,movieId,title,genres,year
738,754,Gold Diggers: The Secret of Bear Mountain (1995),Adventure|Children,1995
739,755,Kim (1950),Children|Drama,1950
740,756,Carmen Miranda: Bananas Is My Business (1994),Documentary,1994
741,757,Ashes of Time (Dung che sai duk) (1994),Drama,1994
743,759,Maya Lin: A Strong Clear Vision (1994),Documentary,1994
...,...,...,...,...
7498,7850,Hallelujah I'm a Bum (1933),Musical,1933
7830,8504,Box of Moon Light (1996),Comedy|Drama,1996
7926,8639,"Clearing, The (2004)",Drama|Thriller,2004
7937,8650,Long Day's Journey Into Night (1962),Drama,1962


In [14]:
# Reset index for the filtered dataset
ratings= ratings.reset_index(drop = True)
ratings

Unnamed: 0,userId,movieId,rating
0,100351,1277,3.5
1,76184,1797,3.5
2,80616,780,5.0
3,82758,1097,3.5
4,15987,1291,4.0
...,...,...,...
353252,12362,1263,4.0
353253,71065,2918,5.0
353254,115431,2640,4.0
353255,10939,903,5.0


In [15]:
#Merging users' favorite movies' ratings and movies attributes 
movies = movies.loc[movies['movieId'].isin(list(map(str, np.unique(ratings['movieId']))))]#.reset_index(drop=True)
movie_ratings = ratings.merge(movies, on='movieId', how = 'inner')
movie_ratings = movie_ratings.dropna(subset=['genres'])
movie_ratings

Unnamed: 0,userId,movieId,rating,title,genres,year
0,100351,1277,3.5,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990
1,75251,1277,4.0,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990
2,130058,1277,4.0,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990
3,157362,1277,4.0,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990
4,113443,1277,4.5,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990
...,...,...,...,...,...,...
353252,133277,4172,5.0,Simon Magus (1999),Drama|Fantasy|Mystery|Romance,1999
353253,68270,1708,4.0,Ill Gotten Gains (1997),Drama,1997
353254,2177,7726,3.5,Boy Meets Girl (1998),Comedy|Romance,1998
353255,90050,2172,4.0,"Strike! (a.k.a. All I Wanna Do, The Hairy Bird...",Comedy|Drama,1998


In [17]:
# Save the results of merging movies and genres
if not os.path.isfile("./datasets/movie_ratings.csv"):
    movie_ratings.to_csv('./datasets/movie_ratings.csv')