In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two TMDB 5000 CSV files: movie metadata first, then cast/crew credits.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv',
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv',
    )
)

In [None]:
# Preview the first rows of the movie metadata table.
movies.head()

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# Column names present in both tables (candidate join keys).
set(movies.columns) & set(credits.columns)

In [None]:
# Join credits onto movies by TMDB id instead of by title. Titles are not
# guaranteed to be unique, and a title join pairs every same-titled movie with
# every same-titled credits row, silently duplicating rows.
# Assumes movies has an `id` column and credits a `movie_id` column, as in the
# TMDB 5000 dataset - verify against the head() previews.
# The redundant credits title is dropped first so the result keeps one `title`
# column; validate='one_to_one' makes the merge fail loudly if a key repeats.
movies = pd.merge(
    movies,
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
)

In [None]:
# Preview the merged movies + credits table.
movies.head()

In [None]:
def fillblank(ip):
    """Return ``ip`` unchanged, or the string 'Not Available' when it is empty.

    ``ip`` can be any sized object (list, string, ...); emptiness is decided
    by ``len(ip)``.
    """
    return ip if len(ip) >= 1 else 'Not Available'

In [None]:
# Percentage of missing values per column, rounded to two decimals.
movies.isnull().mean().mul(100).round(2)

In [None]:
# homepage and tagline are the columns with the bulk of the missing values,
# so both are dropped.
movies = movies.drop(columns=['homepage', 'tagline'])

In [None]:
# Percentage of missing values per column after the drop.
# Fix: the factor was 10000, which reported values 100x larger than a
# percentage and disagreed with the earlier missing-value summary.
round(movies.isnull().sum() / len(movies.index) * 100, 2)

In [None]:
# Only insignificant amounts of missing values are left now, in columns like overview, release_date and runtime.
# We will impute them, or just drop them if they have no use.
# We could derive a new column from overview that counts the number of unique overview words.
from nltk import word_tokenize
from nltk.corpus import stopwords

# English stop-word list from NLTK (not used yet anywhere in this cell).
stops = stopwords.words('english')

# TODO: INCOMPLETE - the overview-derived feature described above is not implemented yet.

In [None]:
# Derive calendar features from release_date (year and month here, week of the
# month further down) so their importance and their relation to the other
# features can be examined later.
movies['release_date'] = pd.to_datetime(movies['release_date'])
movies['release_year'] = movies['release_date'].dt.year
# Fix: the month used to be written to 'release_year', overwriting the year.
movies['release_month'] = movies['release_date'].dt.month
def giveday(w):
    """Map a day of the month to its week-of-month bucket.

    Values in 1-7 give 1, 8-14 give 2, 15-21 give 3, and every other number
    gives 4 (days 22 and later, and also anything <= 0).

    A missing day (NaN, e.g. from a missing release date) is returned as NaN
    instead of silently falling through to bucket 4.
    """
    if pd.isna(w):
        return np.nan
    if 0 < w < 8:
        return 1
    if 7 < w < 15:
        return 2
    if 14 < w < 22:
        return 3
    return 4
        
# Bucket each release day into its week of the month, then discard the raw date.
release_day = movies['release_date'].dt.day
movies['release_week'] = release_day.map(giveday)
movies = movies.drop(columns='release_date')

In [None]:
# Now we look at runtime: display the column to inspect its values.
movies.runtime
# Author's note: runtime is in minutes, which seems fine so far.

In [None]:
# Summary of column dtypes and non-null counts at this point.
movies.info()

In [None]:
# features that are objects need to be converted to numerical forms
# genres, original_language, original_title, overview, production_companies, production_countries, spoken_languages, status
# title, cast, crew and so on

In [None]:
# Parse the genre names out of the JSON-like genres string.
# Fix: the old pattern used \w+ and so dropped every genre whose name contains
# a space (e.g. a two-word name such as "Science Fiction"), which also made
# the list disagree with number_of_genres. [^"]+ captures the whole quoted
# name, and the raw string avoids the invalid "\w" escape warning.
genre_lists = movies['genres'].map(lambda raw: re.findall(r'"name": "([^"]+)"', raw))

# Count the parsed names so the count and the list always agree.
movies['number_of_genres'] = genre_lists.map(len)

# Movies with no genre get the 'Not Available' placeholder instead of an empty list.
movies['genres'] = genre_lists.map(fillblank)

In [None]:
# Now we deal with original_language: share of the most frequent language.
# value_counts() is indexed by language code, so the old `[0]` relied on
# deprecated positional fallback; .iloc[0] asks for the first row explicitly.
movies['original_language'].value_counts().iloc[0] / len(movies.index)
# Author's note: 93% of the original languages are English, so this is clearly an imbalanced feature.


In [None]:
# Is original_title the same as title? Compare the two columns row by row.
# Fix: intersecting the two *sets* of titles only counts distinct strings that
# occur anywhere in both columns; it is not the share of movies whose
# original_title equals their title.
(movies['original_title'] == movies['title']).mean()
# The author's earlier set-based estimate was about 94%
# (NOTE(review): re-check against the row-wise figure above).
# original_title therefore adds little beyond title and is dropped.
movies = movies.drop(columns='original_title')

In [None]:
# We deal with production_companies now: pull the company names out of the
# JSON-like string.
# Fix: [\w ]+ skipped any name containing punctuation (a dot, hyphen,
# parenthesis, ...); [^"]+ captures the whole quoted name. The raw string
# avoids the invalid "\w" escape warning.
company_lists = movies['production_companies'].map(
    lambda raw: re.findall(r'"name": "([^"]+)",', raw)
)

# Derived feature: how many production companies each movie lists.
movies['#production_companies'] = company_lists.map(len)

# Keep only the first listed company.
# Fix: the old code replaced an empty list with the string 'Not Available' and
# then took element [0] of it, so movies without a company ended up as 'N'.
movies['production_companies'] = company_lists.map(
    lambda names: names[0] if names else 'Not Available'
)

In [None]:
# Pull the country names out of the JSON-like production_countries string.
# [^"]+ captures the whole quoted name (the old [\w ]+ skipped names with
# punctuation), and the raw string avoids the invalid "\w" escape warning.
country_lists = movies['production_countries'].map(
    lambda raw: re.findall(r'"name": "([^"]+)"}', raw)
)

# Count the parsed names rather than occurrences of the substring 'name',
# which would overcount any country whose own name contains "name".
movies['#production_countries'] = country_lists.map(len)

# Movies with no country get the 'Not Available' placeholder instead of an empty list.
movies['production_countries'] = country_lists.map(fillblank)
# TODO: INCOMPLETE - the country lists are not yet reduced to a model-ready feature.

In [None]:
# Number of language entries = number of '{' in the JSON-like string.
movies['number_of_spoken_languages'] = movies['spoken_languages'].map(
    lambda raw: raw.count('{')
)

# Fix: \w+ dropped every language name containing a space or punctuation;
# [^"]+ captures the whole quoted name. The raw string avoids the invalid
# "\w" escape warning.
# NOTE(review): if the CSV stores non-ASCII names as JSON \uXXXX escapes they
# are captured verbatim here; json.loads would decode them - confirm.
movies['spoken_languages'] = movies['spoken_languages'].map(
    lambda raw: re.findall(r'"name": "([^"]+)"', raw)
)

In [None]:
# Cast status to plain strings and check which variations of it occur.
movies['status'] = movies['status'].astype(str)
movies['status'].value_counts()
# Author's finding: almost every row is 'Released', and the feature is not
# meaningful for this problem anyway, so the column is removed.
movies = movies.drop(columns=['status'])

In [None]:
# Extract the actor names from the JSON-like cast string.
# NOTE(review): [\w ]+ only matches names made of word characters and spaces,
# so names containing punctuation are skipped. The pattern is kept (now as a
# raw string, same regex) because the hard-coded top-actor list below was
# presumably derived from its output - revisit both together.
actor_lists = movies['cast'].map(lambda raw: re.findall(r'name": "([\w ]+)",', raw))
movies = movies.drop(columns='cast')

# Keep only the first listed actor.
# Fix: the old code replaced an empty list with the string 'Not Available' and
# then took element [0] of it, so movies without a parsed actor ended up as 'N'.
movies['actors'] = actor_lists.map(lambda names: names[0] if names else 'Not Available')
movies['#actors'] = actor_lists.map(len)

# Top actors by share of total revenue (in percent), six largest.
# NOTE: this expression is not the last statement of the cell, so Jupyter does
# not display it unless it is run on its own.
revenue_by_actor = movies.groupby('actors')[['revenue']].sum()
round(revenue_by_actor / movies['revenue'].sum() * 100).sort_values('revenue', ascending=False)[0:6]
# Johnny Depp, Tom Hanks, Tom Cruise, Martin Freeman, Harrison Ford
def topactors(n):
    """Return ``n`` if it equals one of the five hard-coded top actors, else the string 'None'."""
    shortlist = (
        'Johnny Depp',
        'Tom Hanks',
        'Tom Cruise',
        'Martin Freeman',
        'Harrison Ford',
    )
    for star in shortlist:
        if n == star:
            return star
    return 'None'
# Collapse every actor outside the shortlist into the 'None' category.
movies['actors'] = movies['actors'].map(topactors)


In [None]:
# Now we handle the crew: keep one producer name and one composer name per movie.
# The patterns are not anchored at the start of the job value, so they match
# any job string that ends in 'Producer' / 'Composer'.
# NOTE(review): [\w ]+ skips names containing punctuation; kept unchanged
# (now as raw strings, same regex) because the hard-coded top lists further
# down were presumably derived from this output.
producer_lists = movies['crew'].map(
    lambda raw: re.findall(r'Producer", "name": "([\w ]+)"}', raw)
)
composer_lists = movies['crew'].map(
    lambda raw: re.findall(r'Composer", "name": "([\w ]+)"}', raw)
)

# Keep only the first match of each.
# Fix: the old code replaced an empty list with the string 'Not Available' and
# then took element [0] of it, so movies without a match ended up as 'N'.
movies['producer'] = producer_lists.map(lambda names: names[0] if names else 'Not Available')
movies['composer'] = composer_lists.map(lambda names: names[0] if names else 'Not Available')
movies = movies.drop(columns='crew')

In [None]:
# PRODUCERS: share of total revenue (in percent) per producer, six largest.
(
    movies[['producer', 'revenue']]
    .groupby('producer')
    .sum()
    .div(sum(movies['revenue']))
    .mul(100)
    .round()
    .sort_values('revenue', ascending=False)
    .iloc[0:6]
)
def topprods(n):
    """Return ``n`` if it equals one of the five hard-coded top producers, else the string 'None'.

    Fix: the shortlist contained 'Charles Rovan', a misspelling of
    'Charles Roven' that could never equal the name found in the data, so
    that producer was always folded into 'None'.
    """
    shortlist = (
        'Stan Lee',
        'Steven Spielberg',
        'Peter Jackson',
        'Charles Roven',
        'Jerry Bruckheimer',
    )
    for producer in shortlist:
        if n == producer:
            return producer
    return 'None'
# Collapse every producer outside the shortlist into the 'None' category.
movies['producer'] = movies['producer'].map(topprods)

In [None]:
# COMPOSERS: share of total revenue (in percent) per composer, six largest.
(
    movies[['composer', 'revenue']]
    .groupby('composer')
    .sum()
    .div(sum(movies['revenue']))
    .mul(100)
    .round()
    .sort_values('revenue', ascending=False)
    .iloc[0:6]
)
def topcomps(n):
    """Return ``n`` if it equals one of the five hard-coded top composers, else the string 'None'."""
    shortlist = (
        'John Williams',
        'Hans Zimmer',
        'Danny Elfman',
        'John Powell',
        'James Newton Howard',
    )
    for composer in shortlist:
        if n == composer:
            return composer
    return 'None'
# Collapse every composer outside the shortlist into the 'None' category.
movies['composer'] = movies['composer'].map(topcomps)

In [None]:
# List the columns present after the feature engineering above.
movies.columns

In [None]:
# Re-check dtypes and non-null counts to see which object columns remain.
movies.info()

In [None]:
# left objects now: genres, keywords, 