In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

In [3]:
# Read master_dataset.csv (path is relative to the working directory) into a DataFrame.
master_dataset=pd.read_csv("master_dataset.csv")
# Bare last expression: the cell output is the (rows, columns) tuple.
master_dataset.shape

(46628, 27)

In [4]:
# literal_eval turns each cell's Python-literal text back into the object it
# spells out (presumably lists of dicts, given how the next cell indexes them).
for nested_col in ('cast', 'crew', 'keywords'):
    master_dataset[nested_col] = master_dataset[nested_col].apply(literal_eval)

In [None]:
def _names(entries):
    """Return the 'name' of every item in `entries`; [] when `entries` is not a list."""
    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries]


def _first_director(crew):
    """Return the name of the first crew entry whose job is 'Director', or NaN if none."""
    for member in crew:
        if member.get('job') == 'Director':
            return member['name']
    return np.nan


# Cast: names only, then at most the first three (slicing leaves shorter lists as they are).
master_dataset['cast'] = master_dataset['cast'].apply(_names).apply(lambda names: names[:3])

# Keywords: names only.
master_dataset['keywords'] = master_dataset['keywords'].apply(_names)

# Director: one name per row, NaN when the crew list has no 'Director' entry.
master_dataset['director'] = master_dataset['crew'].apply(_first_director)

In [None]:
# Keep the director name as extracted; the 'director' column itself is
# rewritten into a normalised token list in the following cell.
master_dataset['main_director']=master_dataset['director']

In [7]:
# Collapse each director name into a single lowercase, space-free token.
# astype('str') stringifies missing values as well, so a NaN director
# becomes the literal token 'nan'.
director_token = master_dataset['director'].astype('str').apply(
    lambda name: name.replace(" ", "").lower()
)
# Store the token three times per row, so it appears three times once the
# lists are concatenated into the soup (presumably to weight the director).
master_dataset['director'] = director_token.apply(lambda token: [token] * 3)

In [8]:
def _squash(name):
    """Remove the spaces from `name` and lowercase the result."""
    return name.replace(" ", "").lower()


# Each cast member becomes one lowercase token with no internal spaces.
master_dataset['cast'] = master_dataset['cast'].apply(
    lambda members: [_squash(member) for member in members]
)

In [9]:
# How often each keyword occurs across all rows (one exploded row per keyword).
keyword_counts = master_dataset['keywords'].explode().value_counts()
# Keep only keywords that occur more than once.
# NOTE(review): no later cell visible here reads `s` - confirm whether the
# stemming step was meant to filter on it.
s = keyword_counts.loc[keyword_counts > 1]

In [10]:
from nltk.stem.snowball import SnowballStemmer

In [11]:
stemmer = SnowballStemmer('english')


def _stem_keywords(words):
    """Stem every keyword longer than one character; shorter entries are dropped."""
    return [stemmer.stem(word) for word in words if len(word) > 1]


master_dataset['keywords'] = master_dataset['keywords'].apply(_stem_keywords)


In [12]:
def _compact(words):
    """Return `words` with spaces removed from each entry and the result lowercased."""
    compacted = []
    for word in words:
        compacted.append(word.replace(" ", "").lower())
    return compacted


# Multi-word keywords become single tokens, e.g. 'best friend' -> 'bestfriend'.
master_dataset['keywords'] = master_dataset['keywords'].apply(_compact)


In [13]:
# Preview the first five keyword lists after stemming and space removal.
master_dataset['keywords'].head(5)

0    [jealousi, toy, boy, friendship, friend, rival...
1    [boardgam, disappear, basedonchildren'sbook, n...
2       [fish, bestfriend, duringcreditssting, oldmen]
3    [basedonnovel, interracialrelationship, single...
4    [babi, midlifecrisi, confid, age, daughter, mo...
Name: keywords, dtype: object

In [14]:
def _as_token_list(value):
    """Coerce one cell into a list of tokens for the soup.

    * list  -> kept; any dict item that has a 'name' key is reduced to that name.
    * str   -> parsed with literal_eval first, since a list column read back
               from CSV arrives as text; text that does not parse gives [].
    * other -> [] (NaN, numbers, ...).
    """
    if isinstance(value, str):
        try:
            value = literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return []
    if not isinstance(value, list):
        return []
    return [
        item['name'] if isinstance(item, dict) and 'name' in item else item
        for item in value
    ]


# 'genres' is never parsed after read_csv, so it is still a string here. The
# previous isinstance(x, list) guard therefore replaced every genres value
# with [] and genres never reached the soup. Parsing string cells fixes that;
# columns that already hold lists of strings pass through unchanged.
for col in ['keywords', 'cast', 'director', 'genres']:
    master_dataset[col] = master_dataset[col].apply(_as_token_list)

# Adding Series of lists concatenates the lists row by row; the combined
# tokens are then joined into one space-separated string per row.
soup_tokens = (
    master_dataset['keywords']
    + master_dataset['cast']
    + master_dataset['director']
    + master_dataset['genres']
)
master_dataset['soup'] = soup_tokens.apply(
    lambda tokens: ' '.join(str(token) for token in tokens)
)


In [16]:
# Preview the first five joined soup strings.
master_dataset['soup'].head(5)

0    jealousi toy boy friendship friend rivalri boy...
1    boardgam disappear basedonchildren'sbook newho...
2    fish bestfriend duringcreditssting oldmen walt...
3    basedonnovel interracialrelationship singlemot...
4    babi midlifecrisi confid age daughter motherda...
Name: soup, dtype: object

In [17]:
# List every column currently in the frame before the drops in the next cell.
print(master_dataset.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords', 'cast', 'crew', 'director',
       'main_director', 'soup'],
      dtype='object')


In [19]:
# Columns that are not part of the exported table. After this drop the frame
# holds popularity, release_date, title, main_director and soup.
UNUSED_COLUMNS = [
    # metadata
    'adult', 'belongs_to_collection', 'budget', 'homepage', 'original_language',
    'production_companies', 'production_countries', 'revenue', 'runtime',
    'spoken_languages', 'status', 'video',
    # text, votes and the intermediate feature columns already folded into 'soup'
    'overview', 'tagline', 'vote_average', 'vote_count',
    'cast', 'crew', 'keywords', 'director',
    # identifiers and leftovers
    'id', 'imdb_id', 'original_title', 'poster_path', 'genres',
]
master_dataset.drop(columns=UNUSED_COLUMNS, inplace=True)

In [21]:
# Popularity: keep float values, blank out anything else so dropna removes
# the row. isinstance also accepts numpy float scalars, which an exact
# type(...) == float comparison would reject.
master_dataset['popularity'] = master_dataset['popularity'].apply(
    lambda value: value if isinstance(value, float) else np.nan
)

# main_director is NaN (a float) for rows where no director was found, and
# len() on a float raises TypeError. Check for a string first, then require
# more than one character.
master_dataset['main_director'] = master_dataset['main_director'].apply(
    lambda name: name if isinstance(name, str) and len(name) > 1 else np.nan
)

# Remove every row that has a missing value in any remaining column.
master_dataset.dropna(inplace=True)


In [22]:
# Order rows from most to least popular; the later positional slice relies on this order.
master_dataset=master_dataset.sort_values(by='popularity',ascending=False)

In [23]:
# Spot-check the five most popular rows after the sort.
master_dataset.head(5)

Unnamed: 0,popularity,release_date,title,main_director,soup
30876,547.488298,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
33535,294.337037,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
43376,287.253654,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...
44797,228.032744,2017-06-28,Baby Driver,Edgar Wright,robberi atlanta music crimeboss romanc tinnitu...
24603,213.849907,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...


In [24]:
# Popularity has served its purpose (ordering the rows); leave it out of the export.
master_dataset = master_dataset.drop(columns=['popularity'])
# Drop any row that still has a missing value.
master_dataset.dropna(inplace=True)


In [25]:
# Confirm the popularity column is gone; the index still carries the original row labels.
master_dataset.head(3)

Unnamed: 0,release_date,title,main_director,soup
30876,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
33535,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
43376,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...


In [27]:
# Replace the row labels left over from sorting and filtering with a fresh 0..n-1 range;
# drop=True discards the old labels instead of keeping them as a column.
master_dataset.reset_index(inplace=True, drop=True)
master_dataset.head(3)

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
2,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...


In [28]:
# Blank out release dates that are not strings of more than one character.
# The isinstance check keeps len() from raising on a non-string value, and the
# column-wise apply avoids building a Series per row (and also works on an
# empty frame, where the row-wise form returns a DataFrame instead of a column).
master_dataset['release_date'] = master_dataset['release_date'].apply(
    lambda value: value if isinstance(value, str) and len(value) > 1 else np.nan
)
# Remove the rows whose release date was just blanked out.
master_dataset.dropna(inplace=True)


In [29]:
# Rows are still in descending-popularity order (the drops and the index reset
# do not reorder them), so a positional slice keeps the most popular titles.
TOP_N_TITLES = 2500
master_dataset = master_dataset.iloc[:TOP_N_TITLES]

In [30]:
# Preview the first rows of the truncated table.
master_dataset.head()

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
2,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...
3,2017-06-28,Baby Driver,Edgar Wright,robberi atlanta music crimeboss romanc tinnitu...
4,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...


In [31]:
# Sanity check: at most 2500 rows and the four remaining columns.
master_dataset.shape

(2500, 4)

In [32]:
# Write the final table to master_dataset_final.csv in the working directory;
# index=False leaves the row index out of the file.
master_dataset.to_csv('master_dataset_final.csv',index=False)