The TMDB dataset contains data on around 5000 movies and TV series. This notebook aims to analyse the data, find relationships between the variables, and fit a model if there is a strong relationship between any of the movie attributes.

---

This notebook is organised as follows:
<ol>
<li> Import Packages
<li> Import Data
<li> Exploration
<li> Merging Datasets
<li> Cleaning Data
<li> Converting JSON to DataFrame
<li> Data Visualization
<li> Correlation between the Variables
</ol>

## Import Packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json
import matplotlib
from collections import Counter

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

## Import Data

In [None]:
# Load the two TMDB CSV files: the credits table and the movie metadata table.
credits_df = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
movies_df = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

## Explore Data

In [None]:
# Preview the first two rows of the credits table.
credits_df.head(2)

In [None]:
# Preview the first two rows of the movies table.
movies_df.head(2)

In [None]:
# Report the credits table's dimensions, then its column labels, one per line.
print(credits_df.shape, credits_df.columns, sep = '\n')

In [None]:
# Report the movies table's dimensions, then its column labels, one per line.
print(movies_df.shape, movies_df.columns, sep = '\n')

In [None]:
# Summary statistics for the movies table.
movies_df.describe()

## Merge Movies and Credits Dataset

In [None]:
# Left-join the credits onto the movies: movies.id matches credits.movie_id,
# and every movie row is kept.
cm_df = pd.merge(
    movies_df,
    credits_df,
    how = 'left',
    left_on = 'id',
    right_on = 'movie_id',
)

In [None]:
# Discard the redundant title columns and the join key left over from the merge.
redundant_columns = ['original_title', 'title_x', 'movie_id']
cm_df = cm_df.drop(columns = redundant_columns)

In [None]:
# Show the column labels that remain after the drop.
cm_df.columns

## Rename the Columns

In [None]:
# Mapping from the raw column names to the capitalised labels used from here on.
column_labels = {
    'title_y': 'Title',
    'budget': 'Budget',
    'genres': 'Genres',
    'homepage': 'Website',
    'id': 'Id',
    'keywords': 'Keywords',
    'original_language': 'Language',
    'overview': 'Overview',
    'popularity': 'Popularity',
    'production_companies': 'Prod_Company',
    'production_countries': 'Prod_Country',
    'release_date': 'Release_Date',
    'revenue': 'Revenue',
    'runtime': 'Duration',
    'spoken_languages': 'Spoken_Language',
    'status': 'Status',
    'tagline': 'Tagline',
    'vote_average': 'Average_Vote',
    'vote_count': 'Vote_Count',
    'cast': 'Cast',
    'crew': 'Crew',
}
cm_df = cm_df.rename(columns = column_labels)

## Get the year data from Release Date

In [None]:
# Parse the release dates once, store them, and derive the year of each one.
release_dates = pd.to_datetime(cm_df['Release_Date'])
cm_df['Release_Date'] = release_dates
cm_df['Release_Year'] = release_dates.map(lambda stamp: stamp.year)

## Converting JSON

In [None]:
# Decode the JSON text stored in every Genres cell into Python objects.
genres_parsed = cm_df['Genres'].map(json.loads)
cm_df['Genres'] = genres_parsed

In [None]:
# Reduce each movie's parsed genre records to the text form of a list that
# holds only their 'name' values.
cm_df['Genres'] = cm_df['Genres'].map(
    lambda records: str([record['name'] for record in records])
)

In [None]:
# Decode the JSON text stored in every Keywords cell into Python objects.
keywords_parsed = cm_df['Keywords'].map(json.loads)
cm_df['Keywords'] = keywords_parsed

In [None]:
# Reduce each movie's parsed keyword records to the text form of a list that
# holds only their 'name' values.
cm_df['Keywords'] = cm_df['Keywords'].map(
    lambda records: str([record['name'] for record in records])
)

In [None]:
# Decode the JSON text stored in every Prod_Company cell into Python objects.
companies_parsed = cm_df['Prod_Company'].map(json.loads)
cm_df['Prod_Company'] = companies_parsed

In [None]:
# Reduce each movie's parsed production-company records to the text form of a
# list that holds only their 'name' values.
cm_df['Prod_Company'] = cm_df['Prod_Company'].map(
    lambda records: str([record['name'] for record in records])
)

In [None]:
# Decode the JSON text stored in every Prod_Country cell into Python objects.
countries_parsed = cm_df['Prod_Country'].map(json.loads)
cm_df['Prod_Country'] = countries_parsed

In [None]:
# Reduce each movie's parsed production-country records to the text form of a
# list that holds only their 'name' values.
# One column-wide assignment replaces the index-based loop, its per-row .loc
# writes, and an accumulator that was misleadingly named after companies.
cm_df['Prod_Country'] = cm_df['Prod_Country'].map(
    lambda countries: str([country['name'] for country in countries])
)

In [None]:
# Decode the JSON text stored in every Spoken_Language cell into Python objects.
languages_parsed = cm_df['Spoken_Language'].map(json.loads)
cm_df['Spoken_Language'] = languages_parsed

In [None]:
# Reduce each movie's parsed spoken-language records to the text form of a
# list that holds only their 'name' values.
# One column-wide assignment replaces the index-based loop, its per-row .loc
# writes, and an accumulator that was misleadingly named after companies.
cm_df['Spoken_Language'] = cm_df['Spoken_Language'].map(
    lambda languages: str([language['name'] for language in languages])
)

In [None]:
# Decode the JSON text stored in every Cast cell into Python objects.
cast_parsed = cm_df['Cast'].map(json.loads)
cm_df['Cast'] = cast_parsed

In [None]:
# Derive three text columns from the parsed Cast records. Each cell holds the
# str() form of a list with one entry per cast member, in record order.
# Whole columns are assigned at once instead of growing them one .loc cell at
# a time inside an index-based loop; the column names and their order are
# unchanged.
cast_columns = (
    ('Caste_Name', 'name'),
    ('Caste_Char', 'character'),
    ('Caste_Gender', 'gender'),
)
for column, field in cast_columns:
    # `field=field` binds the current key so each lambda reads its own field.
    cm_df[column] = cm_df['Cast'].map(
        lambda members, field=field: str([member[field] for member in members])
    )

In [None]:
# Decode the JSON text stored in every Crew cell into Python objects.
crew_parsed = cm_df['Crew'].map(json.loads)
cm_df['Crew'] = crew_parsed

In [None]:
# Derive four text columns from the parsed Crew records. Each cell holds the
# str() form of a list with one entry per crew member, in record order.
# Whole columns are assigned at once instead of growing them one .loc cell at
# a time inside an index-based loop; the column names and their order are
# unchanged.
crew_columns = (
    ('Crew_Dept', 'department'),
    ('Crew_Gender', 'gender'),
    ('Crew_Job', 'job'),
    ('Crew_Name', 'name'),
)
for column, field in crew_columns:
    # `field=field` binds the current key so each lambda reads its own field.
    cm_df[column] = cm_df['Crew'].map(
        lambda members, field=field: str([member[field] for member in members])
    )

In [None]:
# Turn the stringified genre list back into a list of tokens: drop the
# surrounding brackets, remove single quotes and spaces, then split on commas.
genre_text = cm_df['Genres'].str.strip('[]')
genre_text = genre_text.str.replace("'", "")
genre_text = genre_text.str.replace(' ', '')
cm_df['Genres'] = genre_text.str.split(',')

In [None]:
# Flatten the per-movie genre lists into one list covering all movies.
genres_list = [
    genre
    for movie_genres in cm_df['Genres']
    for genre in movie_genres
]

## Most prominent Genres

In [None]:
# Build a word cloud from all genre tokens joined into one space-separated
# string, then show it without axes.
plt.figure(figsize = (10,20))
genre_cloud = WordCloud(
    background_color = 'white',
    max_font_size = 500,
    collocations = False,
    relative_scaling = 0.5,
    colormap = matplotlib.cm.viridis_r,
)
wordcloud = genre_cloud.generate(' '.join(genres_list))
plt.title('Types of Movies', fontdict = {'size':20, 'weight':'bold'})
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.tight_layout(pad = 0)
plt.axis('off')
plt.show()

## Frequent Tags in each Movie

In [None]:
# Turn the stringified keyword list back into a list of space-free tokens:
# drop the surrounding brackets, remove quotes and spaces, split on commas.
# Double quotes are removed too: str() wraps any entry that contains an
# apostrophe in " instead of ', and those quotes used to stay in the tokens.
keyword_text = cm_df['Keywords'].str.strip('[]')
for unwanted in ("'", '"', ' '):
    # regex=False: each character is matched literally.
    keyword_text = keyword_text.str.replace(unwanted, '', regex = False)
cm_df['Keywords'] = keyword_text.str.split(',')

In [None]:
# Flatten the per-movie keyword lists into one list covering all movies.
keywords_list = [
    keyword
    for movie_keywords in cm_df['Keywords']
    for keyword in movie_keywords
]

In [None]:
# Build a word cloud from all keyword tokens joined into one space-separated
# string, then show it without axes.
plt.figure(figsize = (10,20))
keyword_cloud = WordCloud(
    background_color = 'white',
    max_font_size = 500,
    collocations = False,
    relative_scaling = 1,
    colormap = matplotlib.cm.viridis_r,
)
wordcloud = keyword_cloud.generate(' '.join(keywords_list))
plt.imshow(wordcloud, interpolation = 'bilinear', aspect = "equal")
plt.title('Leading keywords in movies', fontdict = {'size':20, 'weight':'bold'})
plt.tight_layout(pad = 0)
plt.axis('off')
plt.show()

In [None]:
# Bar chart of keyword frequencies, limited to the 40 most frequent non-empty
# keywords (most common first).
keyword_counts = pd.Series([keyword for keyword in keywords_list if keyword != '']).value_counts()
top_keywords = keyword_counts.iloc[:40].index

heading_font = {'size':'20', 'weight':'bold'}
axis_font = {'size':'15', 'weight':'bold'}

plt.figure(figsize = (20,10))
ax = sns.countplot(x = keywords_list, order = top_keywords, palette = 'viridis')
ax.set_title('Keywords Count', fontdict = heading_font)
ax.set_xlabel('Keywords', fontdict = axis_font)
ax.set_ylabel('Count', fontdict = axis_font)
ax.grid(color = 'black')
plt.xticks(rotation = 90)
plt.tick_params(labelsize = 15)
plt.show()

## Production Countries

In [None]:
# Turn the stringified country list back into a list of space-free tokens:
# drop the surrounding brackets, remove quotes and spaces, split on commas.
# Double quotes are removed too: str() wraps any entry that contains an
# apostrophe in " instead of ', and those quotes used to stay in the tokens.
country_text = cm_df['Prod_Country'].str.strip('[]')
for unwanted in ("'", '"', ' '):
    # regex=False: each character is matched literally.
    country_text = country_text.str.replace(unwanted, '', regex = False)
cm_df['Prod_Country'] = country_text.str.split(',')

In [None]:
# Flatten the per-movie country lists into one list covering all movies.
country_list = [
    country
    for movie_countries in cm_df['Prod_Country']
    for country in movie_countries
]

In [None]:
# Build a word cloud from all country tokens joined into one space-separated
# string, then show it without axes.
plt.figure(figsize = (10,20))
country_cloud = WordCloud(
    background_color = 'white',
    max_font_size = 500,
    collocations = False,
    relative_scaling = 0.2,
    colormap = matplotlib.cm.magma,
)
wordcloud = country_cloud.generate(' '.join(country_list))
plt.imshow(wordcloud, interpolation = 'bilinear', aspect = "equal")
plt.title('Countries in which movie is produced', fontdict = {'size':20, 'weight':'bold'})
plt.tight_layout(pad = 0)
plt.axis('off')
plt.show()

## Star Names playing role in most of the movies

In [None]:
# Turn the stringified cast-name list back into a list of space-free tokens:
# drop the surrounding brackets, remove quotes and spaces, split on commas.
# Double quotes are removed too: str() wraps any name that contains an
# apostrophe in " instead of ', and those quotes used to stay in the tokens.
cast_name_text = cm_df['Caste_Name'].str.strip('[]')
for unwanted in ("'", '"', ' '):
    # regex=False: each character is matched literally.
    cast_name_text = cast_name_text.str.replace(unwanted, '', regex = False)
cm_df['Caste_Name'] = cast_name_text.str.split(',')

In [None]:
# Turn the stringified gender-code list back into a list of string tokens:
# drop the surrounding brackets, remove single quotes and spaces, split on commas.
gender_text = cm_df['Caste_Gender'].str.strip('[]')
gender_text = gender_text.str.replace("'", "")
gender_text = gender_text.str.replace(' ', '')
cm_df['Caste_Gender'] = gender_text.str.split(',')

In [None]:
# Flatten the per-movie cast-name lists into one list covering all movies.
cast_name_list = [
    member
    for movie_cast in cm_df['Caste_Name']
    for member in movie_cast
]

In [None]:
# Build a word cloud from every cast name (spaces removed so each name is a
# single token), passing 'Jean' as a stopword, then show it without axes.
plt.figure(figsize = (10,20))
stopwords = ['Jean']
name_tokens = [str(name).replace(' ','') for name in cast_name_list]
cast_cloud = WordCloud(
    background_color = 'white',
    max_font_size = 500,
    collocations = False,
    relative_scaling = 0.2,
    colormap = matplotlib.cm.magma,
    stopwords = stopwords,
)
wordcloud = cast_cloud.generate(' '.join(name_tokens))
plt.imshow(wordcloud, interpolation = 'bilinear', aspect = "equal")
plt.title('Stars who were played a role in most of the movies', fontdict = {'size':20, 'weight':'bold'})
plt.tight_layout(pad = 0)
plt.axis('off')
plt.show()

In [None]:
# Bar chart of cast-name frequencies, limited to the 40 most frequent
# non-empty names (most common first).
name_counts = pd.Series([name for name in cast_name_list if name != '']).value_counts()
top_names = name_counts.iloc[:40].index

heading_font = {'size':'20', 'weight':'bold'}
axis_font = {'size':'15', 'weight':'bold'}

plt.figure(figsize = (20,10))
ax = sns.countplot(x = cast_name_list, order = top_names, palette = 'magma_r')
ax.set_title('Cast Count', fontdict = heading_font)
ax.set_xlabel('Cast', fontdict = axis_font)
ax.set_ylabel('Count', fontdict = axis_font)
ax.grid(color = 'black')
plt.xticks(rotation = 90)
plt.tick_params(labelsize = 15)
plt.show()

## Male Actors playing lead role in most movies

In [None]:
# Keep only the movie id and its list of cast names.
name_df= pd.DataFrame(cm_df.loc[:,['Id', 'Caste_Name']])

# Reshape to long form, one row per (Id, cast name):
#   1. expand every name list into its own columns,
#   2. re-attach Id by joining on the row index and drop the list column,
#   3. melt the name columns into a single 'Cast_Name' column,
#   4. drop the melt bookkeeping column and the empty cells left where a
#      movie has fewer names than the widest row.
name_df= name_df['Caste_Name']\
        .apply(pd.Series)\
        .merge(name_df, left_index = True, right_index = True)\
        .drop(['Caste_Name'], axis = 1)\
        .melt(id_vars = ['Id'], value_name = 'Cast_Name')\
        .drop('variable', axis = 1)\
        .dropna()
# Preview the ten rows with the smallest Id values.
name_df.sort_values(['Id']).head(10)

In [None]:
# Keep only the movie id and its list of cast gender codes.
gender_df= pd.DataFrame(cm_df.loc[:,['Id','Caste_Gender']])
# gender_df.head()
# Reshape to long form, one row per (Id, gender code):
#   1. expand every code list into its own columns,
#   2. re-attach Id by joining on the row index and drop the list column,
#   3. melt the code columns into a single 'Cast_Gender' column,
#   4. drop the melt bookkeeping column and the empty cells left where a
#      movie has fewer codes than the widest row.
gender_df= gender_df['Caste_Gender']\
        .apply(pd.Series)\
        .merge(gender_df, left_index = True, right_index = True)\
        .drop(['Caste_Gender'], axis = 1)\
        .melt(id_vars = ['Id'], value_name = 'Cast_Gender')\
        .drop('variable', axis = 1)\
        .dropna()

In [None]:
# Pair every cast-name row with the gender row that carries the same index
# label, then keep a single 'Id' column (taken from the name side).
# NOTE(review): rows are matched purely by index label, so this presumably
# relies on name_df and gender_df having been melted from equally wide
# tables in the same row order - confirm that the labels really line up.
cast_name_gender = name_df.merge(gender_df, left_index = True, right_index = True)\
                            .drop(['Id_y'],axis = 1)\
                            .rename(columns = {'Id_x':'Id'})

In [None]:
# Word cloud of the names of cast members whose gender code is '2'.
# NOTE(review): '2' is presumably the dataset's code for male - confirm.
male_names = cast_name_gender.loc[cast_name_gender['Cast_Gender'] == '2', 'Cast_Name']

plt.figure(figsize = (10,20))
stopwords = ['Jean']
wordcloud = WordCloud(background_color= 'white', max_font_size= 500, collocations= False, relative_scaling=0.2, colormap = matplotlib.cm.magma, stopwords = stopwords)\
            .generate(' '.join(male_names))
plt.imshow(wordcloud, interpolation ='bilinear',  aspect = "equal")
# Only one title is set: the former 'Male Actors' call was overwritten by this
# one straight away and never appeared on the figure.
plt.title('Male Actors playing role in most of the movies', fontdict = {'size':20, 'weight':'bold'})
# Lay the figure out after the image is drawn, as the other word-cloud cells do.
plt.tight_layout(pad = 0)
plt.axis('off')
plt.show()

## Names of Female Actors

In [None]:
# Word cloud of the names of cast members whose gender code is '1', passing
# 'Jean' as a stopword.
plt.figure(figsize = (10,20))
stopwords = ['Jean']
female_names = list(cast_name_gender[cast_name_gender['Cast_Gender'] == '1']['Cast_Name'])
female_cloud = WordCloud(
    background_color = 'white',
    max_font_size = 500,
    collocations = False,
    relative_scaling = 0.2,
    colormap = matplotlib.cm.magma,
    stopwords = stopwords,
)
wordcloud = female_cloud.generate(' '.join(female_names))
plt.title('Female Actors', fontdict = {'size':'20','weight':'bold'})
plt.imshow(wordcloud, interpolation = 'bilinear', aspect = "equal")
plt.axis('off')
plt.show()

In [None]:
# Bar chart of how often each name with gender code '1' appears, limited to
# the 40 most frequent non-empty names (most common first).
female_names = cast_name_gender.loc[(cast_name_gender['Cast_Gender'] == '1'), 'Cast_Name']
female_counts = pd.Series([female for female in female_names if female != '']).value_counts()
top_female_names = female_counts.iloc[:40].index

heading_font = {'size':'20', 'weight':'bold'}
axis_font = {'size':'15', 'weight':'bold'}

plt.figure(figsize = (20,10))
ax = sns.countplot(x = female_names, order = top_female_names, palette = 'magma_r')
ax.set_title('Female Actors Count', fontdict = heading_font)
ax.set_xlabel('Female Actors', fontdict = axis_font)
ax.set_ylabel('Count', fontdict = axis_font)
ax.grid(color = 'black')
plt.xticks(rotation = 90)
plt.tick_params(labelsize = 15)
plt.show()

## Correlation between variables

In [None]:
# Pairwise correlation of the numeric columns, leaving out Id, Average_Vote
# and Duration.
# cm_df also holds text, list and datetime columns (Title, Genres,
# Release_Date, ...). Recent pandas no longer drops those silently inside
# corr() and raises instead, so the numeric columns are selected explicitly.
numeric_columns = cm_df.drop(['Id', 'Average_Vote','Duration'], axis = 1).select_dtypes(include = 'number')
corr = numeric_columns.corr()

In [None]:
# Mask the upper triangle (diagonal included) so every pair of variables is
# drawn only once, then plot the annotated correlation heatmap.
mask = np.triu(np.ones(corr.shape, dtype = bool))
heatmap_options = {
    'mask': mask,
    'cmap': matplotlib.cm.magma_r,
    'square': True,
    'annot': True,
}
sns.heatmap(corr, **heatmap_options)

## Scatterplot between Revenue and Vote Count

In [None]:
# Scatter of Revenue against Vote_Count with outliers drawn in red.
# A movie counts as an outlier when it exceeds the cutoff on either axis.
revenue_cutoff = 800000000
votes_cutoff = 8000

# One row mask drives both axes. Previously x was filtered on Revenue and y on
# Vote_Count separately, so the two did not describe the same set of movies.
is_outlier = (cm_df['Revenue'] > revenue_cutoff) | (cm_df['Vote_Count'] > votes_cutoff)

plt.figure(figsize = (10,10))
ax = sns.scatterplot(data = cm_df[is_outlier], x = 'Revenue', y = 'Vote_Count', alpha = 0.7, color = 'red')
ax = sns.scatterplot(data = cm_df[~is_outlier], x = 'Revenue', y = 'Vote_Count', alpha = 0.7, color = 'blue')
ax.set_title('Finding outliers in Revenue and VoteCount',fontdict = {'size':20, 'weight':'bold'})
ax.set_xlabel('Revenue',fontdict = {'size':15, 'weight':'bold'})
ax.set_ylabel('Number of Votes',fontdict = {'size':15, 'weight' : 'bold'})

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Two-column array (Revenue, Vote_Count) with one row per movie, used as the
# clustering input.
feature_columns = ['Revenue', 'Vote_Count']
X = cm_df[feature_columns].to_numpy()

In [None]:
# Fit four clusters on X and record the cluster index assigned to every row.
# algorithm='auto' is no longer passed: that value is deprecated in
# scikit-learn and rejected by newer releases, and omitting it lets every
# version pick its own default.
# NOTE(review): X is used unscaled; if Revenue is orders of magnitude larger
# than Vote_Count, distances are dominated by Revenue - consider scaling.
kmeans = KMeans(n_clusters = 4, max_iter= 1500, n_init= 100)
label = kmeans.fit_predict(X)

In [None]:
# Scatter of the clustered points, one colour per cluster, with the fitted
# centroids drawn on top.
plt.figure(figsize = (10,10))
for cluster_id in range(kmeans.n_clusters):
    in_cluster = label == cluster_id
    # Legend entries are 1-based ('1'..'4'), as before.
    ax = sns.scatterplot(x = X[in_cluster, 0], y = X[in_cluster, 1], label = str(cluster_id + 1))
# x and y are passed by keyword: newer seaborn releases reject positional
# data vectors.
sns.scatterplot(x = kmeans.cluster_centers_[:,0], y = kmeans.cluster_centers_[:,1], s=200, label = 'Centroids')
ax.set_title('Finding clusters in Revenue and VoteCount using KMeans',fontdict = {'size':20, 'weight':'bold'})
ax.set_xlabel('Revenue',fontdict = {'size':15, 'weight':'bold'})
ax.set_ylabel('Number of Votes',fontdict = {'size':15, 'weight' : 'bold'})

In [None]:
# Preview the first two rows of the fully processed table.
cm_df.head(2)

In [None]:
# Write the processed table to CSV.
# The target directory is created first, because to_csv fails when the
# folder it is asked to write into does not exist.
output_path = r'../output/mysub.csv'
os.makedirs(os.path.dirname(output_path), exist_ok = True)
cm_df.to_csv(output_path)