## data cleaning

In [1]:
import pandas as pd
import json
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

### a) extracting data

In [2]:
# Load the two input tables from CSV files in the working directory.
# lineterminator='\n' tells the parser to split rows on '\n' only — presumably to cope
# with stray carriage returns inside free-text fields; TODO confirm against the raw files.
moviedata = pd.read_csv('movie_data.csv',lineterminator='\n')
moviecredits = pd.read_csv('movie_credits.csv',lineterminator='\n')

In [3]:
moviedata.shape

(78900, 20)

In [4]:
moviecredits.shape

(78900, 4)

### b) merging the two dataframes

In [5]:
# Join every movie row with its credits row: `id` on the movie side is matched
# against `movie_id` on the credits side (inner join, the pandas default).
# Column names present in both tables receive a `_left` / `_right` suffix.
data = moviedata.merge(
    moviecredits,
    left_on='id',
    right_on='movie_id',
    suffixes=('_left', '_right'),
)

In [6]:
data.shape

(78900, 24)

### c) data cleaning

In [7]:
# After the join `id` carries the same values as `movie_id`, and `title_right` is the
# credits-side title column; drop both and give the movie-side title its plain name back.
data = (
    data
    .drop(columns=['id', 'title_right'])
    .rename(columns={'title_left': 'title'})
)
data.head(3)

Unnamed: 0,budget,genres,homepage,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords,movie_id,cast,crew
0,0,"[{""id"": 35, ""name"": ""Comedy""}]",,en,Blondie,Blondie and Dagwood are about to celebrate the...,2.252,"[{""id"": 5, ""logo_path"": ""/71BqEFAF4V3qjjMPCpLu...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1938-11-30,0,70.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,,Blondie,7.1,5,"[{""id"": 190801, ""name"": ""blondie""}]",3924,"[{""adult"": false, ""gender"": 1, ""id"": 34178, ""k...","[{""adult"": false, ""gender"": 0, ""id"": 34170, ""k..."
1,0,"[{""id"": 12, ""name"": ""Adventure""}]",,de,Der Mann ohne Namen,Der Mann ohne Namen is a German adventure movi...,0.943,[],"[{""iso_3166_1"": ""DE"", ""name"": ""Germany""}]",1921-01-01,0,420.0,[],Released,,"Peter Voss, Thief of Millions",0.0,0,[],6124,"[{""adult"": false, ""gender"": 2, ""id"": 48038, ""k...","[{""adult"": false, ""gender"": 2, ""id"": 2902, ""kn..."
2,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",,fr,L'amour à vingt ans,Love at Twenty unites five directors from five...,3.184,"[{""id"": 38936, ""logo_path"": ""/ypvTqUeQOxORhFEF...","[{""iso_3166_1"": ""DE"", ""name"": ""Germany""}, {""is...",1962-06-22,0,110.0,"[{""english_name"": ""German"", ""iso_639_1"": ""de"",...",Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.8,36,[],8773,"[{""adult"": false, ""gender"": 2, ""id"": 1653, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 1650, ""kn..."


In [8]:
# Convert the release dates to datetime values so that the `.dt` accessor can be
# used for the year-based filter in the next cell.
data = data.assign(release_date=pd.to_datetime(data['release_date']))

In [9]:
# Keep only films released after 1980, i.e. with a release year of 1981 or later.
data = data.loc[data['release_date'].dt.year.gt(1980)]

In [10]:
# Keep only films with a strictly positive revenue figure.
data = data.loc[data['revenue'].gt(0)]

In [11]:
# Take an independent copy of the filtered rows so that the column assignments in the
# following cells operate on a standalone frame rather than on a filtered selection.
data = data.copy()

In [12]:
data.shape

(7339, 22)

In [13]:
data

Unnamed: 0,budget,genres,homepage,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords,movie_id,cast,crew
7,4000000,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...",https://www.miramax.com/movie/four-rooms/,en,Four Rooms,It's Ted the Bellhop's first night on the job....,13.497,"[{""id"": 14, ""logo_path"": ""/m6AHu84oZQxvq7n1rsv...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1995-12-09,4257354,98.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,5.7,2029,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...",5,"[{""adult"": false, ""gender"": 2, ""id"": 3129, ""kn...","[{""adult"": false, ""gender"": 1, ""id"": 3110, ""kn..."
8,21000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",8.104,"[{""id"": 33, ""logo_path"": ""/8lvHyhjr8oUKOOy2dKX...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",1993-10-15,12136938,110.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.5,214,"[{""id"": 520, ""name"": ""chicago, illinois""}, {""i...",6,"[{""adult"": false, ""gender"": 2, ""id"": 2880, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 2042, ""kn..."
12,94000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://movies.disney.com/finding-nemo,en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",98.136,"[{""id"": 3, ""logo_path"": ""/1TjvGVDMYsj6JBxOAkUH...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2003-05-30,940335536,100.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,There are 3.7 trillion fish in the ocean. They...,Finding Nemo,7.8,15611,"[{""id"": 970, ""name"": ""parent child relationshi...",12,"[{""adult"": false, ""gender"": 2, ""id"": 13, ""know...","[{""adult"": false, ""gender"": 2, ""id"": 7, ""known..."
13,55000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",,en,Forrest Gump,A man with a low IQ has accomplished great thi...,48.527,"[{""id"": 4, ""logo_path"": ""/fycMZt242LVjagMByZOL...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1994-07-06,677387716,142.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Life is like a box of chocolates...you never k...,Forrest Gump,8.5,21518,"[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...",13,"[{""adult"": false, ""gender"": 2, ""id"": 31, ""know...","[{""adult"": false, ""gender"": 2, ""id"": 37, ""know..."
14,15000000,"[{""id"": 18, ""name"": ""Drama""}]",,en,American Beauty,"Lester Burnham, a depressed suburban father in...",25.042,"[{""id"": 2721, ""logo_path"": null, ""name"": ""Jink...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1999-09-15,356296601,122.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Look closer.,American Beauty,8.0,9734,"[{""id"": 596, ""name"": ""adultery""}, {""id"": 970, ...",14,"[{""adult"": false, ""gender"": 2, ""id"": 1979, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 153, ""kno..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78597,750000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 28, ""n...",http://leomedias.info/barbet/film.html,en,Barbet : L'Homme de la situation,Cela fait maintenant plusieurs années que Barb...,0.600,[],"[{""iso_3166_1"": ""FR"", ""name"": ""France""}]",2012-08-25,1500000,20.0,"[{""english_name"": ""French"", ""iso_639_1"": ""fr"",...",Released,Il n'a plus rien à perdre.,Barbet : L'Homme de la situation,3.0,1,"[{""id"": 11314, ""name"": ""f word""}]",118784,"[{""adult"": false, ""gender"": 0, ""id"": 1068746, ...",[]
78683,30000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 27, ""nam...",,en,Bait,A freak tsunami traps shoppers at a coastal Au...,11.004,"[{""id"": 10950, ""logo_path"": ""/6TdE9m9KgV2az7vN...","[{""iso_3166_1"": ""SG"", ""name"": ""Singapore""}, {""...",2012-09-05,32500000,93.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Cleanup on aisle 7.,Bait,5.7,594,"[{""id"": 793, ""name"": ""drowning""}, {""id"": 1545,...",118957,"[{""adult"": false, ""gender"": 1, ""id"": 138010, ""...","[{""adult"": false, ""gender"": 2, ""id"": 29941, ""k..."
78701,0,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name...",,en,Keys to Tulsa,Richter Boudreau is on a bad streak: Languishi...,4.096,[],[],1997-04-11,57252,113.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Two Men Taken By One Woman,Keys to Tulsa,4.5,12,"[{""id"": 1936, ""name"": ""blackmail""}]",118991,"[{""adult"": false, ""gender"": 2, ""id"": 7036, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 2596, ""kn..."
78756,780000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 28, ""nam...",,ta,ஆரண்ய காண்டம்,A drama that unfolds between two rival mafia g...,2.571,"[{""id"": 14305, ""logo_path"": null, ""name"": ""Cap...","[{""iso_3166_1"": ""IN"", ""name"": ""India""}]",2011-06-10,1040344,153.0,"[{""english_name"": ""Tamil"", ""iso_639_1"": ""ta"", ...",Released,A page out of the life of a ganster.,Aaranya Kaandam,8.0,18,[],119123,"[{""adult"": false, ""gender"": 2, ""id"": 86014, ""k...","[{""adult"": false, ""gender"": 2, ""id"": 150684, ""..."


### d) json treatment

In [14]:
# Columns holding JSON arrays of objects; further down each of them is reduced to the
# list of the objects' `name` values.
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages']

In [15]:
# Decode every JSON-encoded text column into Python objects (lists of dicts).
# `json.loads` is deliberately called without `encoding=`: the keyword was ignored on
# Python 3 and removed in Python 3.9, where forwarding it raises a TypeError.
# `json_columns + [...]` builds a new list, so `json_columns` itself is left unchanged.
for column in json_columns + ['crew', 'cast', 'production_companies']:
    data[column] = data[column].apply(json.loads)

In [16]:
def process_jsoncols(colname):
    """Return the ``'name'`` value of every entry in ``colname``.

    Despite its name, ``colname`` is one cell value: an iterable of dicts that
    each contain a ``'name'`` key. The result is a list of those names in the
    same order; an empty input gives an empty list. A missing ``'name'`` key
    raises ``KeyError``.
    """
    return [entry['name'] for entry in colname]

In [17]:
# Reduce each decoded JSON column to the plain list of `name` values per film.
for json_col in json_columns:
    data[json_col] = data[json_col].map(process_jsoncols)

In [18]:
data[['genres', 'keywords', 'production_countries', 'spoken_languages']].head()

Unnamed: 0,genres,keywords,production_countries,spoken_languages
7,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room...",[United States of America],[English]
8,"[Action, Thriller, Crime]","[chicago, illinois, drug dealer, escape, one n...","[Japan, United States of America]",[English]
12,"[Animation, Family]","[parent child relationship, sydney, australia,...",[United States of America],[English]
13,"[Comedy, Drama, Romance]","[vietnam veteran, hippie, washington dc, usa, ...",[United States of America],[English]
14,[Drama],"[adultery, parent child relationship, midlife ...",[United States of America],[English]


In [19]:
# Same reduction for the production companies: keep only each company's `name`.
company_entries = data['production_companies']
data['production_companies'] = company_entries.map(process_jsoncols)

In [20]:
data['production_companies'].head(2)

7                           [Miramax, A Band Apart]
8    [Universal Pictures, Largo Entertainment, JVC]
Name: production_companies, dtype: object

In [21]:
# Keep only the cast entries whose billing `order` is below 1 and store their names as
# the text form of a Python list (e.g. "['Tim Roth']"), which the clean-up cell that
# follows turns into a compact token.
# A single element-wise `apply` replaces the former per-row `.loc` writes: it is much
# faster and does not depend on the index labels being unique.
data['cast'] = data['cast'].apply(
    lambda members: str([member['name'] for member in members if member['order'] < 1])
)

In [22]:
data['cast'].head(2)

7          ['Tim Roth']
8    ['Emilio Estevez']
Name: cast, dtype: object

In [23]:
# Turn the list-repr text (e.g. "['Tim Roth']") into a compact token ("TimRoth"):
# strip the enclosing brackets, then remove every single quote, double quote and space.
data['cast'] = (
    data['cast']
    .str.strip('[]')
    .str.replace("['\" ]", '', regex=True)
)

In [24]:
data['cast'].isnull().sum()

0

In [25]:
data['cast'].head(2)

7          TimRoth
8    EmilioEstevez
Name: cast, dtype: object

In [26]:
# Keep only the crew entries whose `job` is 'Director' and store their names as the text
# form of a Python list, which the clean-up cell that follows turns into compact tokens.
# A single element-wise `apply` replaces the former per-row `.loc` writes: it is much
# faster and does not depend on the index labels being unique.
data['crew'] = data['crew'].apply(
    lambda members: str([member['name'] for member in members if member['job'] == 'Director'])
)

In [27]:
# Turn the list-repr text of director names into compact comma-separated tokens:
# strip the enclosing brackets, then remove every single quote, double quote and space.
data['crew'] = (
    data['crew']
    .str.strip('[]')
    .str.replace("['\" ]", '', regex=True)
)

In [28]:
# Columns that hold Python lists of names at this point and are flattened to
# comma-separated text in the next cell.
listcols = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']

In [29]:
# Flatten each list of names into one comma-separated string; an empty list becomes ''.
# Values that themselves contain a comma (e.g. "chicago, illinois") are not escaped.
for list_col in listcols:
    data[list_col] = data[list_col].map(
        lambda names: ','.join(str(name) for name in names)
    )

In [30]:
# Checkpoint: independent snapshot of the frame after the JSON columns were flattened,
# taken before the null-handling steps.
datacopy1 = data.copy()

In [31]:
data.head()

Unnamed: 0,budget,genres,homepage,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords,movie_id,cast,crew
7,4000000,"Crime,Comedy",https://www.miramax.com/movie/four-rooms/,en,Four Rooms,It's Ted the Bellhop's first night on the job....,13.497,"Miramax,A Band Apart",United States of America,1995-12-09,4257354,98.0,English,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,5.7,2029,"hotel,new year's eve,witch,bet,hotel room,sper...",5,TimRoth,"AllisonAnders,AlexandreRockwell,RobertRodrigue..."
8,21000000,"Action,Thriller,Crime",,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",8.104,"Universal Pictures,Largo Entertainment,JVC","Japan,United States of America",1993-10-15,12136938,110.0,English,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.5,214,"chicago, illinois,drug dealer,escape,one night...",6,EmilioEstevez,StephenHopkins
12,94000000,"Animation,Family",http://movies.disney.com/finding-nemo,en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",98.136,Pixar,United States of America,2003-05-30,940335536,100.0,English,Released,There are 3.7 trillion fish in the ocean. They...,Finding Nemo,7.8,15611,"parent child relationship,sydney, australia,ha...",12,AlbertBrooks,AndrewStanton
13,55000000,"Comedy,Drama,Romance",,en,Forrest Gump,A man with a low IQ has accomplished great thi...,48.527,"Paramount,The Steve Tisch Company",United States of America,1994-07-06,677387716,142.0,English,Released,Life is like a box of chocolates...you never k...,Forrest Gump,8.5,21518,"vietnam veteran,hippie,washington dc, usa,ment...",13,TomHanks,RobertZemeckis
14,15000000,Drama,,en,American Beauty,"Lester Burnham, a depressed suburban father in...",25.042,"Jinks/Cohen Company,DreamWorks Pictures",United States of America,1999-09-15,356296601,122.0,English,Released,Look closer.,American Beauty,8.0,9734,"adultery,parent child relationship,midlife cri...",14,KevinSpacey,SamMendes


### e) handling nulls 

In [32]:
data.isnull().sum()

budget                     0
genres                     0
homepage                5225
original_language          0
original_title             0
overview                  34
popularity                 0
production_companies       0
production_countries       0
release_date               0
revenue                    0
runtime                    0
spoken_languages           0
status                     0
tagline                 1714
title                      0
vote_average               0
vote_count                 0
keywords                   0
movie_id                   0
cast                       0
crew                       0
dtype: int64

In [33]:
# Restore the working frame from the checkpoint. A copy is taken (instead of binding
# `data` to the very same object) so that the in-place column drop in the next cell does
# not also modify `datacopy1`; this keeps the checkpoint intact and the cell re-runnable.
data = datacopy1.copy()

In [34]:
# `homepage` and `tagline` are the columns with by far the most missing values
# (5225 and 1714 nulls respectively in the count above), so they are removed in place.
data.drop(columns=['homepage', 'tagline'], inplace=True)

In [35]:
data

Unnamed: 0,budget,genres,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,vote_average,vote_count,keywords,movie_id,cast,crew
7,4000000,"Crime,Comedy",en,Four Rooms,It's Ted the Bellhop's first night on the job....,13.497,"Miramax,A Band Apart",United States of America,1995-12-09,4257354,98.0,English,Released,Four Rooms,5.7,2029,"hotel,new year's eve,witch,bet,hotel room,sper...",5,TimRoth,"AllisonAnders,AlexandreRockwell,RobertRodrigue..."
8,21000000,"Action,Thriller,Crime",en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",8.104,"Universal Pictures,Largo Entertainment,JVC","Japan,United States of America",1993-10-15,12136938,110.0,English,Released,Judgment Night,6.5,214,"chicago, illinois,drug dealer,escape,one night...",6,EmilioEstevez,StephenHopkins
12,94000000,"Animation,Family",en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",98.136,Pixar,United States of America,2003-05-30,940335536,100.0,English,Released,Finding Nemo,7.8,15611,"parent child relationship,sydney, australia,ha...",12,AlbertBrooks,AndrewStanton
13,55000000,"Comedy,Drama,Romance",en,Forrest Gump,A man with a low IQ has accomplished great thi...,48.527,"Paramount,The Steve Tisch Company",United States of America,1994-07-06,677387716,142.0,English,Released,Forrest Gump,8.5,21518,"vietnam veteran,hippie,washington dc, usa,ment...",13,TomHanks,RobertZemeckis
14,15000000,Drama,en,American Beauty,"Lester Burnham, a depressed suburban father in...",25.042,"Jinks/Cohen Company,DreamWorks Pictures",United States of America,1999-09-15,356296601,122.0,English,Released,American Beauty,8.0,9734,"adultery,parent child relationship,midlife cri...",14,KevinSpacey,SamMendes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78597,750000,"Thriller,Action",en,Barbet : L'Homme de la situation,Cela fait maintenant plusieurs années que Barb...,0.600,,France,2012-08-25,1500000,20.0,Français,Released,Barbet : L'Homme de la situation,3.0,1,f word,118784,,
78683,30000000,"Action,Horror,Thriller",en,Bait,A freak tsunami traps shoppers at a coastal Au...,11.004,"Screen Queensland,Pictures in Paradise,Media D...","Singapore,Australia",2012-09-05,32500000,93.0,English,Released,Bait,5.7,594,"drowning,supermarket,shark attack,flooding,aus...",118957,SharniVinson,KimbleRendall
78701,0,"Crime,Drama,Thriller",en,Keys to Tulsa,Richter Boudreau is on a bad streak: Languishi...,4.096,,,1997-04-11,57252,113.0,English,Released,Keys to Tulsa,4.5,12,blackmail,118991,EricStoltz,LeslieGreif
78756,780000,"Comedy,Action,Crime,Drama",ta,ஆரண்ய காண்டம்,A drama that unfolds between two rival mafia g...,2.571,Capital Film Works,India,2011-06-10,1040344,153.0,தமிழ்,Released,Aaranya Kaandam,8.0,18,,119123,JackieShroff,ThiagarajanKumararaja


In [36]:
# Empty or whitespace-only strings (such as those produced by joining an empty list)
# are turned into NaN so that `isnull()` counts them as missing.
data = data.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [37]:
data

Unnamed: 0,budget,genres,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,vote_average,vote_count,keywords,movie_id,cast,crew
7,4000000,"Crime,Comedy",en,Four Rooms,It's Ted the Bellhop's first night on the job....,13.497,"Miramax,A Band Apart",United States of America,1995-12-09,4257354,98.0,English,Released,Four Rooms,5.7,2029,"hotel,new year's eve,witch,bet,hotel room,sper...",5,TimRoth,"AllisonAnders,AlexandreRockwell,RobertRodrigue..."
8,21000000,"Action,Thriller,Crime",en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",8.104,"Universal Pictures,Largo Entertainment,JVC","Japan,United States of America",1993-10-15,12136938,110.0,English,Released,Judgment Night,6.5,214,"chicago, illinois,drug dealer,escape,one night...",6,EmilioEstevez,StephenHopkins
12,94000000,"Animation,Family",en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",98.136,Pixar,United States of America,2003-05-30,940335536,100.0,English,Released,Finding Nemo,7.8,15611,"parent child relationship,sydney, australia,ha...",12,AlbertBrooks,AndrewStanton
13,55000000,"Comedy,Drama,Romance",en,Forrest Gump,A man with a low IQ has accomplished great thi...,48.527,"Paramount,The Steve Tisch Company",United States of America,1994-07-06,677387716,142.0,English,Released,Forrest Gump,8.5,21518,"vietnam veteran,hippie,washington dc, usa,ment...",13,TomHanks,RobertZemeckis
14,15000000,Drama,en,American Beauty,"Lester Burnham, a depressed suburban father in...",25.042,"Jinks/Cohen Company,DreamWorks Pictures",United States of America,1999-09-15,356296601,122.0,English,Released,American Beauty,8.0,9734,"adultery,parent child relationship,midlife cri...",14,KevinSpacey,SamMendes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78597,750000,"Thriller,Action",en,Barbet : L'Homme de la situation,Cela fait maintenant plusieurs années que Barb...,0.600,,France,2012-08-25,1500000,20.0,Français,Released,Barbet : L'Homme de la situation,3.0,1,f word,118784,,
78683,30000000,"Action,Horror,Thriller",en,Bait,A freak tsunami traps shoppers at a coastal Au...,11.004,"Screen Queensland,Pictures in Paradise,Media D...","Singapore,Australia",2012-09-05,32500000,93.0,English,Released,Bait,5.7,594,"drowning,supermarket,shark attack,flooding,aus...",118957,SharniVinson,KimbleRendall
78701,0,"Crime,Drama,Thriller",en,Keys to Tulsa,Richter Boudreau is on a bad streak: Languishi...,4.096,,,1997-04-11,57252,113.0,English,Released,Keys to Tulsa,4.5,12,blackmail,118991,EricStoltz,LeslieGreif
78756,780000,"Comedy,Action,Crime,Drama",ta,ஆரண்ய காண்டம்,A drama that unfolds between two rival mafia g...,2.571,Capital Film Works,India,2011-06-10,1040344,153.0,தமிழ்,Released,Aaranya Kaandam,8.0,18,,119123,JackieShroff,ThiagarajanKumararaja


In [38]:
data.isnull().sum()

budget                    0
genres                   32
original_language         0
original_title            0
overview                 34
popularity                0
production_companies    438
production_countries    149
release_date              0
revenue                   0
runtime                   0
spoken_languages         67
status                    0
title                     0
vote_average              0
vote_count                0
keywords                829
movie_id                  0
cast                    186
crew                     21
dtype: int64

In [39]:
data.columns

Index(['budget', 'genres', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'title', 'vote_average', 'vote_count', 'keywords', 'movie_id', 'cast',
       'crew'],
      dtype='object')

In [40]:
# Final column layout: title and identifier first, `revenue` moved to the last position.
final_column_order = [
    'title', 'movie_id', 'overview', 'release_date', 'genres', 'budget',
    'original_language', 'original_title', 'popularity',
    'production_companies', 'production_countries', 'runtime',
    'spoken_languages', 'status', 'vote_average', 'vote_count',
    'keywords', 'cast', 'crew', 'revenue',
]
data = data[final_column_order]

In [41]:
# Persist the cleaned frame to the working directory. `index` is left at its default,
# so the row index is written as the first (unnamed) CSV column.
data.to_csv('datacleaned.csv')