Movie recommendation system

In [None]:
import pandas as pd
import numpy as np

In [None]:
# User ratings dataframe: tab-separated file; column names are supplied here via `names`
ratings = pd.read_csv(
    "./ml-100k/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)
ratings.head()

In [None]:
# Movies dataframe: pipe-separated file read as latin1; column names are supplied
# via `names`. The columns from "unknown" onward are the per-genre flag columns.
columns = [
    "movie_id", "title", "release_date", "video_release_date", "imdb_url",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies = pd.read_csv("./ml-100k/u.item", sep="|", names=columns, encoding="latin1")
# Bare last expression gives the notebook's rich table instead of wrapped plain text.
movies.head()

In [None]:
# Users dataframe: pipe-separated file; column names are supplied via `names`
users = pd.read_csv(
    "./ml-100k/u.user",
    sep="|",
    names=["user_id", "age", "gender", "occupation", "zip_code"],
)
# Bare last expression gives the notebook's rich table instead of plain text.
users.head()

In [None]:
# info() prints its own report and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
ratings.info()

In [None]:
# info() prints its own report and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
movies.info()

In [None]:
# info() prints its own report and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
users.info()

In [None]:
# Number of missing values in each ratings column
ratings.isna().sum()

In [None]:
# Number of missing values in each users column
users.isna().sum()

In [None]:
# Number of missing values in each movies column
movies.isna().sum()

In [None]:
# Rows whose release_date is missing, shown with the notebook's rich table
# display rather than print()'s wrapped plain text.
movies.loc[movies["release_date"].isna()]

In [None]:
# Show the entries titled "unknown", then keep only the remaining rows.
is_unknown_title = movies["title"] == "unknown"
print(movies[is_unknown_title])
movies = movies[~is_unknown_title]
print("Removed movie with unknown title, genre and release date")

In [None]:
# Remove video_release_date (noted by the author as entirely null).
# Rebinding instead of inplace=True, plus errors="ignore", keeps this cell safe
# to re-run: the in-place version raised KeyError once the column was gone.
movies = movies.drop(columns=["video_release_date"], errors="ignore")

In [None]:
# Missing values per movies column, shown as the cell's result like the other
# null-count cells in this notebook.
movies.isnull().sum()

In [None]:
# Replace missing IMDb URLs with the placeholder string "none", then re-count nulls
movies.loc[movies["imdb_url"].isna(), "imdb_url"] = "none"
movies.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the current movies frame
movies.info()

In [None]:
# Convert the release_date column to pandas datetime values (format is inferred).
movies["release_date"] = pd.to_datetime(movies["release_date"])
# Bare last expression: the Series is shown as the cell's result, no print() needed.
movies["release_date"].head()

In [None]:
# Print the movies summary again to check the resulting column dtypes
movies.info()

In [None]:
# Ratings whose movie_id is absent from `movies` (i.e. ratings for the removed movie)
ratings.loc[~ratings["movie_id"].isin(movies["movie_id"])]

In [None]:
# Keep only ratings that refer to a movie still present in `movies`.
ratings = ratings.loc[ratings["movie_id"].isin(movies["movie_id"])]
# An empty result means the ratings for the deleted movie (id 267) are gone.
ratings[ratings["movie_id"] == 267]

In [None]:
# Drop the timestamp column. Rebinding instead of inplace=True, plus
# errors="ignore", keeps this cell safe to re-run: the in-place version raised
# KeyError on a second execution because the column was already gone.
ratings = ratings.drop(columns="timestamp", errors="ignore")
ratings.head()

Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Ratings distribution: how many times each rating value occurs, ordered by value
rating_counts = ratings['rating'].value_counts().sort_index()
rating_counts.plot.bar()
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Ratings per user: histogram (50 bins) of the number of ratings each user_id has
ratings_per_user = ratings.groupby('user_id').size().sort_index()
ax = ratings_per_user.plot.hist(bins=50)
ax.set_title('Ratings per User')
ax.set_xlabel('Number of Ratings')
plt.show()

In [None]:
# Ratings per movie: histogram (50 bins) of the number of ratings each movie_id has
ratings_per_movie = ratings.groupby('movie_id').size()
ax = ratings_per_movie.plot.hist(bins=50)
ax.set_title('Ratings per Movie')
ax.set_xlabel('Number of Ratings')
plt.show()

In [None]:
# Basic statistics of ratings dataframe: null counts, structure, numeric summary
print(ratings.isnull().sum())
print("*"*50)
# info() prints its own report and returns None; wrapping it in print()
# added a stray "None" line between the sections.
ratings.info()
print("*"*50)
print(ratings.describe())

In [None]:
print("Exploring the movies dataframe")
# The genre flags run from the "unknown" column to the end of the frame.
# Anchoring on that label instead of the position 4 keeps the selection correct
# whether or not the earlier column-drop cell has been executed.
first_genre_pos = movies.columns.get_loc("unknown")
genre_columns = movies.columns[first_genre_pos:]
print("Genre columns => ",genre_columns)

In [None]:
# Sum of movies in each genre (a movie can belong to several genres).
# Keep the counts themselves in `genre_counts`; previously the name was bound to
# the matplotlib Axes returned by .plot(), not to the per-genre totals.
genre_counts = movies[genre_columns].sum().sort_values()
genre_counts.plot(kind="bar");