In [None]:
### Exploratory data analysis (EDA) of the TMDB movie dataset
#### Reference :: https://towardsdatascience.com/hitchhikers-guide-to-exploratory-data-analysis-6e8d896d3f7e

import ast
import json
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Read the credits and movies CSV files into two DataFrames

credits_df, movies_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tmdb-movie-metadata/tmdb_5000_credits.csv',
        '../input/tmdb-movie-metadata/tmdb_5000_movies.csv',
    )
)

In [None]:
# Preview the first five rows of the credits frame
credits_df.head(5)

In [None]:
# Preview the first five rows of the movies frame
movies_df.head(5)

In [None]:
# Discard the listed columns from the movies frame

del_col_list = [
    'homepage',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'status',
    'tagline',
]

movies_df = movies_df.drop(columns=del_col_list)
movies_df.head()

In [None]:
# Show the frame's shape, drop fully duplicated rows (the first occurrence
# of each is kept), then show the shape again for comparison
print(movies_df.shape)

movies_df = movies_df.drop_duplicates()
print(movies_df.shape)

In [None]:
# Treat a budget or revenue of 0 as missing, then discard every row that
# lacks either value

cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace({0: np.nan})

# subset=cols restricts the NaN check to those two columns; the filtered
# frame is bound back to the same name
movies_df = movies_df.dropna(subset=cols)
movies_df.shape

In [None]:
# Parse release_date (YYYY-MM-DD text) into datetimes and derive a
# release_year column from it

movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], format='%Y-%m-%d')
movies_df['release_year'] = movies_df['release_date'].dt.year
movies_df.head()

In [None]:
# Cast the budget and revenue columns to 64-bit integers
# (presumably they became float when 0 was replaced by NaN earlier -- verify with dtypes)
change_cols = ['budget', 'revenue']

# astype converts whole columns at once; DataFrame.applymap is deprecated
# since pandas 2.1 and calls the converter once per cell
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
movies_df.dtypes

In [None]:
# Parsing jsons 

# zip index and column (converted from json to dictionary using json.loads)
# for every dictionary in dictionary array we find value of key given and append to list
# finally we convert it to string and store in that column

def parse_col_json(column, key, df=None):
    """Replace a JSON-encoded column with the text form of the values under ``key``.

    Every cell of ``column`` must hold a JSON array of objects. For each cell
    the value stored under ``key`` is collected from every object, and the
    resulting Python list is written back as its ``str()`` representation
    (e.g. ``"['Action', 'Drama']"``).

    Parameters
    ----------
    column : str
        Name of the column to convert.
    key : str
        Key to read from each object of the JSON array (e.g. ``'name'``).
    df : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``movies_df``.

    Returns
    -------
    None
        The frame is updated in place.

    Raises
    ------
    json.JSONDecodeError
        If a cell is not valid JSON. This includes a column that was already
        converted, so the function cannot be run twice on the same column.
    KeyError
        If an object in the array has no ``key`` entry.
    """
    if df is None:
        # Keep the original behaviour: operate on the notebook's shared frame
        df = movies_df

    def extract_values(raw_json):
        # One cell: JSON text -> list of dicts -> list of values -> string
        return str([entry[key] for entry in json.loads(raw_json)])

    # Convert the whole column in one pass rather than one .loc write per row
    df[column] = df[column].apply(extract_values)

In [None]:
# Flatten each JSON-encoded column down to the list of its 'name' values
for json_column in ('genres', 'spoken_languages', 'production_countries'):
    parse_col_json(json_column, 'name')

movies_df.head()

In [None]:
# Answering the following questions

# Which English movie has the highest votes?
# What are the 5 most expensive movies?
# Which movie had the highest profit?
# Which movie covers the largest variety of languages?
# Which movie was the most popular before 2010?
# What is the average runtime of movies?
# Movies having the highest rating

In [None]:
# Which English movie has the highest votes?

# Keep the rows whose spoken_languages text mentions "English", then pick
# the row with the largest vote_count among them

is_english = movies_df["spoken_languages"].str.contains("English")
english_movies = movies_df[is_english]
max_vote_index = english_movies["vote_count"].idxmax()
english_movies.loc[max_vote_index]

In [None]:
# What are the 5 most expensive movies?

# Order by budget from largest to smallest and keep the first five rows
budget_ranked = movies_df.sort_values(by='budget', ascending=False)
expensive_movies_df = budget_ranked.head(5)
expensive_movies_df

In [None]:
# Which movie has the highest profit?

# profit = revenue - budget, stored as a new column on the movies frame
movies_df["profit"] = movies_df["revenue"].sub(movies_df["budget"])

# Sort from most to least profitable; the first row is the answer
highest_profit_movie = movies_df.sort_values(by='profit', ascending=False)
highest_profit_movie.iloc[0]

In [None]:
# Which movie covers the largest variety of languages?

# spoken_languages holds the *text* of a list (e.g. "['English', 'Deutsch']"),
# so .str.len() would measure characters rather than languages. Parse each
# cell back into a list and count its entries instead.
language_counts = movies_df["spoken_languages"].apply(
    lambda list_text: len(ast.literal_eval(list_text))
)
languages_index_max = language_counts.idxmax()
movies_df.loc[languages_index_max]

In [None]:
# Which movie was the most popular before 2010?

# Restrict to releases earlier than 2010, then find the index label of the
# largest popularity value
released_before_2010 = movies_df["release_year"] < 2010
popular_movie_2010_idx = movies_df.loc[released_before_2010, "popularity"].idxmax()
movies_df.loc[popular_movie_2010_idx]

In [None]:
# What is the average runtime of movies?

# Series.mean() ignores missing values by default
runtimes = movies_df["runtime"]
runtimes.mean()

In [None]:
# Movie having the highest rating

# Rank every movie by vote_average, best first, and show the leading row
top_rated_movies = movies_df.sort_values('vote_average', ascending=False)
top_rated_movies.iloc[0]

## Plots

In [None]:
# Total profit per release year (uses the profit column computed earlier)

profits_per_year = movies_df.groupby("release_year")["profit"].sum()

# Explicit figure/axes pair; 130 dots per inch
fig, ax = plt.subplots(figsize=(12, 6), dpi=130)
ax.plot(profits_per_year)
ax.set_xlabel("Release year of various movies")
ax.set_ylabel("Total profit")
ax.set_title("Total profits earned by movies vs Release year")
plt.show()

In [None]:
# Release year with the largest total profit
# (should coincide with the peak of the plot above)

most_profitable_year = profits_per_year.idxmax()
most_profitable_year