In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

In [6]:
## Read the master dataset CSV from the working directory and print its
## column labels, so the column names used further down can be checked.
master_dataset_new = pd.read_csv(filepath_or_buffer='master_dataset_new.csv')
print(master_dataset_new.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'main_director', 'soup'],
      dtype='object')


In [8]:
## Strip the frame down to the columns this notebook actually uses.
## The dropped features remain in the source CSV and can be kept instead
## if you want to feed more signals into your recommender system.
master_dataset_new.drop(
    columns=[
        # production / financial metadata
        'adult', 'belongs_to_collection', 'budget', 'homepage',
        'original_language', 'production_companies', 'production_countries',
        'revenue', 'runtime', 'spoken_languages', 'status', 'video',
        # free text, ratings and credits
        'overview', 'tagline', 'vote_average', 'vote_count',
        'cast', 'crew', 'keywords', 'director',
        # identifiers, artwork and genres
        'id', 'imdb_id', 'original_title', 'poster_path', 'genres',
    ],
    inplace=True,
)

In [9]:
## Keep only rows whose popularity is a real float; any other value (for
## example a string left in a mixed-type CSV column) is blanked to NaN.
## The check runs on the column itself and uses isinstance(), so it does not
## depend on how a row-wise apply() happens to box scalars: numpy.float64 is
## a subclass of float but fails an exact `type(x) == float` comparison,
## which would silently blank every row.
master_dataset_new['popularity'] = master_dataset_new['popularity'].where(
    master_dataset_new['popularity'].map(lambda value: isinstance(value, float))
).astype(float)  # only floats/NaN are left, so this cast cannot fail

## NOTE: dropna() without `subset` removes rows that have a missing value in
## ANY remaining column, not just popularity. The later cells rely on that.
master_dataset_new.dropna(inplace=True)


In [10]:
## Keep only rows whose director is a string longer than one character;
## everything else becomes NaN and is dropped.
## The isinstance() guard matters: calling len() directly on a non-string
## value (e.g. a float NaN) raises TypeError, so without it this cell only
## works if an earlier dropna() has already removed such rows.
master_dataset_new['main_director'] = master_dataset_new['main_director'].where(
    master_dataset_new['main_director'].map(
        lambda name: isinstance(name, str) and len(name) > 1
    )
)

## Removes rows with a missing value in any remaining column.
master_dataset_new.dropna(inplace=True)


In [11]:
## Order the rows from most to least popular, so that taking the first X
## rows later on yields the X most popular movies.
master_dataset_new.sort_values('popularity', ascending=False, inplace=True)


In [12]:
## Popularity was only needed for the ordering above; remove the column now.
master_dataset_new.drop(columns='popularity', inplace=True)
## Discard every row that still has a missing value in any remaining column.
master_dataset_new.dropna(axis=0, how='any', inplace=True)


In [13]:
## The sort reordered the rows, so the old row labels are no longer in
## sequence. Replace them with a fresh 0..n-1 index and discard the old ones.
master_dataset_new.index = pd.RangeIndex(len(master_dataset_new))

In [14]:
## Keep only rows whose release date is a string longer than one character;
## everything else becomes NaN and is dropped.
## The isinstance() guard matters: calling len() directly on a non-string
## value (e.g. a float NaN) raises TypeError instead of filtering the row.
master_dataset_new['release_date'] = master_dataset_new['release_date'].where(
    master_dataset_new['release_date'].map(
        lambda date: isinstance(date, str) and len(date) > 1
    )
)

## Removes rows with a missing value in any remaining column.
master_dataset_new.dropna(inplace=True)


In [16]:
## The demo (already hosted online) uses only the first 2500 rows, which are
## the most popular movies because the frame was sorted by popularity above.
master_dataset_new = master_dataset_new.head(2500)


In [18]:
## Preview of the final dataset that will be used to train the word and
## cosine similarity matrix (first five rows).
master_dataset_new.head(n=5)

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
2,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...
3,2017-06-28,Baby Driver,Edgar Wright,robberi atlanta music crimeboss romanc tinnitu...
4,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...


In [20]:
## Report the final size as a (rows, columns) tuple.
print((len(master_dataset_new.index), len(master_dataset_new.columns)))

(2500, 4)


In [21]:
## Write the final dataset to disk; index=False keeps the row labels out of
## the file.
master_dataset_new.to_csv(path_or_buf='master_dataset_final.csv', index=False)