In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import cufflinks as cf
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable Plotly rendering inside the notebook. Per Plotly's offline-mode docs,
# connected=True loads plotly.js from a CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode so figures render locally in the notebook.
cf.go_offline()
from scipy.stats import pearsonr
import gc

Importing IMDb movie data

In [None]:
# Read the IMDb movies table from the Kaggle input directory and preview it.
movies_csv_path = '../input/imdb-extensive-dataset/IMDb movies.csv'
movies_df = pd.read_csv(movies_csv_path)
movies_df

In [None]:
# Per-column count of missing values in the movies table.
movies_missing_counts = movies_df.isna().sum()
movies_missing_counts

In [None]:
# Print a summary of movies_df (columns, dtypes and non-null counts).
movies_df.info()

In [None]:
# Normalise the 'year' column to integers.
# Some entries apparently carry a text prefix (presumably like "TV Movie 2019").
# The earlier pattern '[TV Movie  ]' was a regex *character class*: it deleted the
# individual characters T, V, M, o, v, i, e and spaces rather than the prefix itself,
# and would crash on any other annotation. Extracting the four-digit year directly
# states the intent and is safe to re-run on an already-clean column.
movies_df['year'] = (
    movies_df['year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [None]:
def decade(num, divisor):
    """Round ``num`` down to the nearest multiple of ``divisor``.

    Args:
        num: Value to bucket (e.g. a year such as 1994).
        divisor: Bucket width (e.g. 10 for decades).

    Returns:
        ``num`` minus its remainder modulo ``divisor`` (1994 -> 1990 for 10).
    """
    remainder = num % divisor
    return num - remainder
# Width, in years, of each bucket used for the 'Decade' column.
dec = 10
# decade() is pure arithmetic, so it can be evaluated on the whole column at once
# instead of being called row by row through .apply().
movies_df['Decade'] = decade(movies_df['year'], divisor=dec)

In [None]:
# Keep only the first listed country for each movie.
# str.split(..., expand=True) returns a multi-column DataFrame; assigning that to a
# single column raises ValueError on current pandas. Splitting once and taking
# element 0 makes "first country only" explicit (missing values stay missing).
movies_df["country"] = movies_df["country"].str.split(", ", n=1).str[0]

Grouping the data by country to find which country makes the highest number of movies.

In [None]:
# Number of titles per country, as a one-column frame named 'Count'.
titles_per_country = movies_df.groupby('country')['title'].count()
movies_cntry = titles_per_country.to_frame('Count')

# The 25 countries with the most titles, largest first.
ranked_countries = movies_cntry.sort_values(by='Count', ascending=False)
xyz = ranked_countries.head(25)
xyz

In [None]:
# Bar chart of the top-25 countries; the frame's index (country) supplies the x axis.
fig = px.bar(data_frame=xyz, y='Count', text='Count')
# Put the abbreviated count above each bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(
    title=dict(text='Top 25 Movie making Countries', x=0.5),
    showlegend=True,
    uniformtext_minsize=8,
    xaxis_tickangle=-45,
)

The USA makes the highest number of movies, followed by India and the UK.

In [None]:
# Convert the gross-income strings to floats by stripping the currency marker.
# NOTE(review): the pattern appears to target prefixes such as '$', 'INR' and 'GBP';
# no currency conversion is performed, so amounts reported in different currencies
# end up on the same numeric scale — confirm this is acceptable for the analysis.
movies_df['worlwide_gross_income'] = (
    movies_df['worlwide_gross_income']
    .replace('[$INRKGBP ]', '', regex=True)
    .astype(float)
)
# The 'year' column was already converted to int in an earlier cell, so the
# duplicated year-cleaning statement that used to follow here was a no-op and
# has been removed.

In [None]:
# Pull out the three columns needed for the "top grossing" view; the individual
# Series are kept as module-level names because later cells reuse them.
money_columns = ['title', 'worlwide_gross_income', 'year']
title, worlwide_gross_income, year = (movies_df[name] for name in money_columns)

# Side-by-side frame of title, worldwide gross income and release year.
top_money = pd.concat([title, worlwide_gross_income, year], axis=1)
top_money

Finding out which movies made the most money worldwide.

In [None]:
# Order movies by worldwide gross income, highest first, and keep the top 25.
top_money = top_money.sort_values(by='worlwide_gross_income', ascending=False)
toptop_money = top_money.iloc[:25]
toptop_money

In [None]:
# Bar chart of the 25 highest-grossing movies, labelled with their release year.
fig1 = px.bar(toptop_money, x='title', y='worlwide_gross_income', text='year')
fig1.update_layout(title_text='Top 25 Money making Movies Worldwide', title_x=0.5, showlegend=True)
# The label is a year, so show it verbatim: the previous '%{text:.2s}' template
# applied SI-prefix formatting and rendered e.g. 2019 as "2.0k".
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8)
fig1.update_layout(xaxis_tickangle=-45)

Clearly, Marvel leads the way with its Avengers movies, followed by the 3D blockbuster Avatar.

In [None]:
# Keep only the first listed language for each movie.
# str.split(..., expand=True) returns a multi-column DataFrame; assigning that to a
# single column raises ValueError on current pandas. Splitting once and taking
# element 0 makes "first language only" explicit (missing values stay missing).
movies_df['language'] = movies_df['language'].str.split(", ", n=1).str[0]

In [None]:
# Primary language of each movie (Series kept as a module-level name).
lang = movies_df['language']

# Title, worldwide gross income and language side by side; `title` and
# `worlwide_gross_income` are the Series extracted in an earlier cell.
language_income_parts = [title, worlwide_gross_income, lang]
lang_money = pd.concat(language_income_parts, axis=1)
lang_money

In [None]:
# Total worldwide gross income per language.
# Only the income column is summed: aggregating the whole frame would also try to
# "sum" the string `title` column. The former in-place sort before the groupby had
# no effect on the grouped totals and has been dropped.
lang_money = lang_money.groupby('language')[['worlwide_gross_income']].sum()

# The 25 languages with the highest total income, largest first.
lang_money1 = lang_money.sort_values(by='worlwide_gross_income', ascending=False).head(25)
lang_money1

In [None]:
# Bar chart of total gross income per language; the index (language) is the x axis.
fig2 = px.bar(data_frame=lang_money1, y='worlwide_gross_income', text='worlwide_gross_income')
# Put the abbreviated total above each bar.
fig2.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig2.update_layout(
    title=dict(text='Worldwide gross income for Each Language', x=0.5),
    showlegend=True,
    uniformtext_minsize=8,
    xaxis_tickangle=-45,
)

English-language movies make the most money, followed by Mandarin.

In [None]:
# Number of movies per primary language, as a one-column frame named 'Count'.
movies_per_language = movies_df.groupby('language')['language'].count()
movie_languages = movies_per_language.to_frame('Count')

# Keep the 25 most common languages, largest first.
movie_languages = movie_languages.sort_values(by='Count', ascending=False).iloc[:25]
movie_languages

In [None]:
# Bar chart of the 25 most common movie languages; the index (language) is the x axis.
fig3 = px.bar(
    data_frame=movie_languages,
    y='Count',
    text='Count',
    title='Top 25 Movie making Languages',
)
# Put the abbreviated count above each bar.
fig3.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig3.update_layout(uniformtext_minsize=8, xaxis_tickangle=-45)

In [None]:
# Number of movies released in each year, plotted as a line.
movies_per_year = movies_df.groupby('year').size()
fig4 = px.line(movies_per_year)
fig4.update_layout(
    title=dict(text='Total Number of Movies Per Year', x=0.5),
    showlegend=False,
)

In [None]:
# Read the IMDb ratings table from the Kaggle input directory and preview it.
ratings_csv_path = '../input/imdb-extensive-dataset/IMDb ratings.csv'
ratings_df = pd.read_csv(ratings_csv_path)
ratings_df

In [None]:
# Per-column count of missing values in the ratings table.
ratings_missing_counts = ratings_df.isna().sum()
ratings_missing_counts

The columns 'allgenders_0age_avg_vote', 'allgenders_0age_votes', 'males_0age_avg_vote', 'males_0age_votes', 'females_0age_avg_vote' and 'females_0age_votes' have a lot of null values, so they are dropped.

In [None]:
# The "0age" vote columns are removed from the ratings table.
age0_vote_columns = [
    'allgenders_0age_avg_vote',
    'allgenders_0age_votes',
    'males_0age_avg_vote',
    'males_0age_votes',
    'females_0age_avg_vote',
    'females_0age_votes',
]
ratings_df = ratings_df.drop(columns=age0_vote_columns)

In [None]:
# Attach each movie's ratings by joining the two tables on the IMDb title id
# (DataFrame.join defaults to a left join, so every movie row is kept).
movies_by_id = movies_df.set_index('imdb_title_id')
ratings_by_id = ratings_df.set_index('imdb_title_id')
df = movies_by_id.join(ratings_by_id)
df

In [None]:
# Pearson correlation between the weighted average vote and the plain mean vote;
# element 0 of pearsonr's result is the correlation coefficient.
vote_correlation = pearsonr(df['weighted_average_vote'], df['mean_vote'])[0]
pea = round(vote_correlation, 3)
pea

In [None]:
# Scatter of weighted average vote against mean vote with an OLS trend line.
fig5 = px.scatter(df, x='weighted_average_vote', y='mean_vote', trendline='ols')
# Use the correlation computed in the previous cell (`pea`) instead of a
# hard-coded number, so the title cannot drift from the data.
fig5.update_layout(title_text=f'Weighted Average Vote vs. Mean Vote, Corr: {pea}', title_x=0.5)

According to the IMDb website, the weighted average is the best way to understand the average rating of a movie, so the weighted average is used from here on.

In [None]:
# Total votes cast for the movies released in each year.
# Only 'total_votes' is summed: calling .sum() on the whole grouped frame would
# also aggregate every text column, which is wasteful and can fail on object data.
votes_per_year = df.groupby('year')[['total_votes']].sum()
fig6 = px.line(votes_per_year, y='total_votes')
fig6.update_layout(title_text='Total Number of Votes for Movies Each Year', title_x=0.5, showlegend=True)

In [None]:
# Minimum number of movies a language needs to be included in the comparison.
min_movies = 500

# Boolean mask, indexed by language: True where the language has enough movies.
movie_languages1 = df.groupby('language').size() >= min_movies

# Per-language means of the numeric columns, restricted to the well-represented
# languages and ordered by weighted average vote (highest first).
# numeric_only=True is required: on pandas >= 2.0 a plain .mean() raises TypeError
# when the frame contains text columns.
lang_df1 = (
    df.groupby('language')
    .mean(numeric_only=True)
    .loc[movie_languages1]
    .sort_values('weighted_average_vote', ascending=False)
)
lang_df1

In [None]:
# Bar chart of the mean weighted rating per language; the index (language) is the x axis.
fig7 = px.bar(data_frame=lang_df1, y='weighted_average_vote', text='weighted_average_vote')
# Put the abbreviated rating above each bar.
fig7.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig7.update_layout(
    title=dict(text='Average Weighted Rating for Each Language', x=0.5),
    showlegend=True,
    uniformtext_minsize=8,
    xaxis_tickangle=-45,
)

In [None]:
# Scatter of weighted rating against worldwide gross income, with a centred title.
fig8 = px.scatter(data_frame=df, x='weighted_average_vote', y='worlwide_gross_income')
fig8.update_layout(
    title=dict(text='Average Weighted Rating vs. Gross income worldwide', x=0.5),
)

In [None]:
# Scatter of movie duration against weighted rating, with a centred title.
fig9 = px.scatter(data_frame=df, x="duration", y="weighted_average_vote")
fig9.update_layout(
    title=dict(text='Duration vs Average Rating', x=0.5),
)

In [None]:
# Scatter of movie duration against worldwide gross income, with a centred title.
fig10 = px.scatter(data_frame=df, x="duration", y="worlwide_gross_income")
fig10.update_layout(
    title=dict(text='Duration vs Gross income Worldwide', x=0.5),
)

This EDA was done to explore Plotly. If you have any suggestions for improvement, please comment below. Thank you.