In [5]:
#load python packages
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

# Data Read in and Organization

In [6]:
#get current work directory
os.getcwd()

'/Users/Sue/Documents/2020_study/springboard/capstone_project2_movie'

In [7]:
# read in data as df
file = '/Users/Sue/Documents/2020_study/springboard/capstone_project2_movie/Dataset/tmdb-box-office-prediction/train.csv'
file_test = '/Users/Sue/Documents/2020_study/springboard/capstone_project2_movie/Dataset/tmdb-box-office-prediction/test.csv'
data = pd.read_csv(file_test)
df = pd.DataFrame(data)
print(df.head(5))

     id                              belongs_to_collection   budget  \
0  3001  [{'id': 34055, 'name': 'Pokémon Collection', '...        0   
1  3002                                                NaN    88000   
2  3003                                                NaN        0   
3  3004                                                NaN  6800000   
4  3005                                                NaN  2000000   

                                              genres  \
0  [{'id': 12, 'name': 'Adventure'}, {'id': 16, '...   
1  [{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...   
2  [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...   
3  [{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...   
4  [{'id': 36, 'name': 'History'}, {'id': 99, 'na...   

                                            homepage    imdb_id  \
0  http://www.pokemon.com/us/movies/movie-pokemon...  tt1226251   
1                                                NaN  tt0051380   
2                                  

In [8]:
#create subdirectory to save figures

# Data Organization

# Data Definition

In [9]:
#column names
df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew'],
      dtype='object')

In [10]:
#data type
df.dtypes

id                         int64
belongs_to_collection     object
budget                     int64
genres                    object
homepage                  object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
Keywords                  object
cast                      object
crew                      object
dtype: object

In [11]:
#info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     4398 non-null   int64  
 1   belongs_to_collection  877 non-null    object 
 2   budget                 4398 non-null   int64  
 3   genres                 4382 non-null   object 
 4   homepage               1420 non-null   object 
 5   imdb_id                4398 non-null   object 
 6   original_language      4398 non-null   object 
 7   original_title         4398 non-null   object 
 8   overview               4384 non-null   object 
 9   popularity             4398 non-null   float64
 10  poster_path            4397 non-null   object 
 11  production_companies   4140 non-null   object 
 12  production_countries   4296 non-null   object 
 13  release_date           4397 non-null   object 
 14  runtime                4394 non-null   float64
 15  spok

In [12]:
print(df.isna().sum())

id                          0
belongs_to_collection    3521
budget                      0
genres                     16
homepage                 2978
imdb_id                     0
original_language           0
original_title              0
overview                   14
popularity                  0
poster_path                 1
production_companies      258
production_countries      102
release_date                1
runtime                     4
spoken_languages           42
status                      2
tagline                   863
title                       3
Keywords                  393
cast                       13
crew                       22
dtype: int64


In [13]:
df.nunique()/df.count()

id                       1.000000
belongs_to_collection    0.633979
budget                   0.116189
genres                   0.251255
homepage                 0.987324
imdb_id                  1.000000
original_language        0.008868
original_title           0.989768
overview                 0.999772
popularity               1.000000
poster_path              1.000000
production_companies     0.813043
production_countries     0.106611
release_date             0.748010
runtime                  0.035275
spoken_languages         0.120753
status                   0.000682
tagline                  0.998303
title                    0.987941
Keywords                 0.970037
cast                     0.995439
crew                     1.000000
dtype: float64

In [14]:
#duplicate check
duplicate_df = df[df.duplicated()]
print(duplicate_df.shape)
# there is no duplicate data

(0, 22)


In [15]:
df.describe()

Unnamed: 0,id,budget,popularity,runtime
count,4398.0,4398.0,4398.0,4394.0
mean,5199.5,22649290.0,8.55023,107.622212
std,1269.737571,36899910.0,12.209014,21.05829
min,3001.0,0.0,1e-06,0.0
25%,4100.25,0.0,3.895186,94.0
50%,5199.5,7450000.0,7.482241,104.0
75%,6298.75,28000000.0,10.938524,118.0
max,7398.0,260000000.0,547.488298,320.0


# roadmap
# what need to do with data
1. drop columns:
    id is duplicated to index, will drop it
    belongs_to_collection and homepage columns have over 2/3 data missing, both columns content should be similar as overview or keywords, and it is difficult to fill up over 2/3 data with limited source, so will drop these two columns.
2. fill in numericl NAN data
    only 'runtime' has NAN, and only 2/3000 missing, will fill with average time.data type conversion:
3. drop more columns:
    imdb_id, which is identical id linked to imdb, which is not useful for this case study, will drop it
    overview, the tagline and keyword has contained key info for movie analysis, this is duplicated
4.  'release date' convert to datatime type


# drop columns

In [16]:
df = df.drop(['id','belongs_to_collection','homepage','imdb_id','overview'], axis = 1)

In [17]:
df.iloc[0]

budget                                                                  0
genres                  [{'id': 12, 'name': 'Adventure'}, {'id': 16, '...
original_language                                                      ja
original_title                                         ディアルガVSパルキアVSダークライ
popularity                                                        3.85153
poster_path                              /tnftmLMemPLduW6MRyZE0ZUD19z.jpg
production_companies                                                  NaN
production_countries    [{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...
release_date                                                      7/14/07
runtime                                                                90
spoken_languages        [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
status                                                           Released
tagline                 Somewhere Between Time & Space... A Legend Is ...
title                                 

# check and fill numerical NAN

In [18]:
nas=pd.DataFrame(df.isnull().sum().sort_values(ascending=False)/len(df),columns = ['percent'])
pos = nas['percent'] > 0
nas[pos]

Unnamed: 0,percent
tagline,0.196226
Keywords,0.089359
production_companies,0.058663
production_countries,0.023192
spoken_languages,0.00955
crew,0.005002
genres,0.003638
cast,0.002956
runtime,0.00091
title,0.000682


In [19]:
# fill in zero for null runtime, then all numerical data are ready
df['runtime'] = df['runtime'].fillna(value = 0)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4398 non-null   int64  
 1   genres                4382 non-null   object 
 2   original_language     4398 non-null   object 
 3   original_title        4398 non-null   object 
 4   popularity            4398 non-null   float64
 5   poster_path           4397 non-null   object 
 6   production_companies  4140 non-null   object 
 7   production_countries  4296 non-null   object 
 8   release_date          4397 non-null   object 
 9   runtime               4398 non-null   float64
 10  spoken_languages      4356 non-null   object 
 11  status                4396 non-null   object 
 12  tagline               3535 non-null   object 
 13  title                 4395 non-null   object 
 14  Keywords              4005 non-null   object 
 15  cast                 

In [21]:
#convert release time to datatime
#leave all others to feature engineering

In [22]:
df.to_csv('/Users/Sue/Documents/2020_study/springboard/capstone_project2_movie/Dataset/tmdb-box-office-prediction/0721wragling_test_t1.csv',index = False)

In [23]:
# define get_dictioinary function to achieve dictionary from lists
def get_dictionary(s):
    try:
        d = eval(s)
    except:
        d = {}
    return d

In [24]:
# clean up 3 columns including keywords
df_train = df
list_process = ['genres','production_companies','production_countries','Keywords']
list_process2 = ['spoken_languages']
for i in list_process:
    df_train[i] = df_train[i].map(lambda x: sorted([d['name'] for d in get_dictionary(x)])).map(lambda x: ','.join(map(str, x)))
    #gi = df_train[i].str.get_dummies(sep=',')
    #df_train = pd.concat([df_train, gi], axis=1, sort=False)
for i in list_process2:
    df_train[i] = df_train[i].map(lambda x: sorted([d['iso_639_1'] for d in get_dictionary(x)])).map(lambda x: ','.join(map(str, x)))
    #gi = df_train[i].str.get_dummies(sep=',')
    #df_train = pd.concat([df_train, gi], axis=1, sort=False)

In [25]:
# clean up column genres
#df_train['genres'] = df_train['genres'].map(lambda x: sorted([d['name'] for d in get_dictionary(x)])).map(lambda x: ','.join(map(str, x)))
#genres_split = df_train['genres'].str.get_dummies(sep=',')
#df_train = pd.concat([df_train, genres_split], axis=1, sort=False)

In [26]:
#df_train.head(5)

# release date convert to datetime datatype

In [27]:
df_train['release_date'].dtype

dtype('O')

In [28]:
df_train['release_date']=pd.to_datetime(df_train['release_date'])

In [29]:
df_train['release_date'][0]

Timestamp('2007-07-14 00:00:00')

In [30]:
df_train.head(5)

Unnamed: 0,budget,genres,original_language,original_title,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,0,"Adventure,Animation,Family,Fantasy",ja,ディアルガVSパルキアVSダークライ,3.851534,/tnftmLMemPLduW6MRyZE0ZUD19z.jpg,,"Japan,United States of America",2007-07-14,90.0,"en,ja",Released,Somewhere Between Time & Space... A Legend Is ...,Pokémon: The Rise of Darkrai,"pocket monsters,pok√©mon","[{'cast_id': 3, 'character': 'Tonio', 'credit_...","[{'credit_id': '52fe44e7c3a368484e03d683', 'de..."
1,88000,"Horror,Science Fiction",en,Attack of the 50 Foot Woman,3.559789,/9MgBNBqlH1sG4yG2u4XkwI5CoJa.jpg,Woolner Brothers Pictures Inc.,United States of America,2058-05-19,65.0,en,Released,A titanic beauty spreads a macabre wave of hor...,Attack of the 50 Foot Woman,"alien,b movie,cheating husband,chrysler imperi...","[{'cast_id': 2, 'character': 'Nancy Fowler Arc...","[{'credit_id': '55807805c3a3685b1300060b', 'de..."
2,0,"Comedy,Romance",en,Addicted to Love,8.085194,/ed6nD7h9sbojSWY2qrnDcSvDFko.jpg,"Miramax,Outlaw Productions (I),Warner Bros.",United States of America,1997-05-23,100.0,en,Released,A Comedy About Lost Loves And Last Laughs,Addicted to Love,"break-up,jealousy,love,revenge","[{'cast_id': 11, 'character': 'Maggie', 'credi...","[{'credit_id': '52fe4330c3a36847f8041367', 'de..."
3,6800000,"Drama,Mystery,War",fr,Incendies,8.596012,/sEUG3qjxwHjxkzuO7plrRHhOZUH.jpg,"Micro scope,Phi Group,TS Productions","Canada,France",2010-09-04,130.0,"ar,en,fr",Released,The search began at the opening of their mothe...,Incendies,"checkpoint,christian,interpreter,massacre,midd...","[{'cast_id': 6, 'character': 'Nawal', 'credit_...","[{'credit_id': '56478092c3a36826140043af', 'de..."
4,2000000,"Documentary,History",en,Inside Deep Throat,3.21768,/n4WC3zbelz6SG7rhkWbf8m9pMHB.jpg,,United States of America,2005-02-11,92.0,en,Released,It was filmed in 6 days for 25 thousand dollar...,Inside Deep Throat,"1970s,sexual revolution,unsimulated sex,usa","[{'cast_id': 1, 'character': 'Narrator (voice)...","[{'credit_id': '52fe44ce9251416c75041967', 'de..."


In [31]:
df_train.to_csv('/Users/Sue/Documents/2020_study/springboard/capstone_project2_movie/Dataset/tmdb-box-office-prediction/0721wragling_test_t2.csv',index = False)