In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sb

import plotly_express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Movie metadata, one row per title.
df = pd.read_csv("../input/imdb-extensive-dataset/IMDb movies.csv")

# Rating statistics, indexed by the IMDb title id.
ratings = pd.read_csv(
    "../input/imdb-extensive-dataset/IMDb ratings.csv",
    index_col='imdb_title_id',
)

In [None]:
# Preview the first rows of the movie metadata frame.
df.head()

In [None]:
# Preview the first rows of the ratings frame (indexed by imdb_title_id).
ratings.head()

In [None]:
# Column dtypes and non-null counts for the movie metadata.
df.info()

In [None]:
# Column dtypes and non-null counts for the ratings frame.
ratings.info()

In [None]:
# Summary statistics of the movie metadata columns.
df.describe()

In [None]:
# Summary statistics of the ratings columns.
ratings.describe()

In [None]:
# The year column contains the text value 'TV Movie 2019'; map it to 2019 so
# the whole column can be cast to int.
df['year'] = df['year'].replace('TV Movie 2019', 2019).astype(int)

# genre / language / country are comma-separated lists; keep the first entry
# as the "primary" value. The .str accessor propagates missing values as NaN,
# so all three columns are handled the same way and a missing genre no longer
# raises AttributeError (the previous lambda called .split on a float NaN).
df['primary_genre'] = df['genre'].str.split(',').str[0]
df['primary_language'] = df['language'].str.split(',').str[0]
df['primary_country'] = df['country'].str.split(',').str[0]

In [None]:
# Fill missing values in every column except the first with that column's median.
fill_cols = ratings.columns[1:]
ratings[fill_cols] = ratings[fill_cols].fillna(ratings[fill_cols].median())

In [None]:
# Count the remaining missing values per ratings column.
ratings.isnull().sum()

In [None]:
# Transpose so each movie becomes a column and each rating statistic a row.
rat_trans = ratings.transpose()
# Titles are assigned by position, not by id.
# NOTE(review): assumes `ratings` and `df` list movies in the same order — confirm.
rat_trans.columns = df['original_title']
rat_trans

In [None]:
# Split the column names into object-typed ("categorical") and everything else.
is_object_col = df.dtypes == 'O'
categ = df.columns[is_object_col].tolist()
numeric = df.columns[~is_object_col].tolist()

In [None]:
# Columns whose value counts are plotted by visualise_plot_plt().
base_categ=['primary_genre','primary_language','primary_country','year']

In [None]:
def visualise_plot_plt(top_n=15):
    """Plot the most frequent values of every column in ``base_categ``.

    One figure row is drawn per column: a pie chart on the left and an
    annotated bar chart on the right, both showing the ``top_n`` most
    frequent values of that column in ``df``.

    Parameters
    ----------
    top_n : int, default 15
        Maximum number of categories shown per chart. Columns with fewer
        distinct values are plotted with however many they have.
    """
    plt.style.use('ggplot')
    plt.figure(figsize=(20,50))
    n_rows=len(base_categ)

    for num,var in enumerate(base_categ):
        # Count once per column and reuse for the pie, the bars and the labels.
        counts=df[var].value_counts().head(top_n)
        title="Number of Movies by {}".format(var.capitalize())

        plt.subplot(n_rows,2,2*num+1)
        # explode must have exactly one entry per wedge, so size it from the
        # data rather than assuming top_n categories exist.
        counts.plot(kind='pie',autopct='%1.1f%%',explode=np.full(len(counts),0.1),rotatelabels=True,radius=0.8,shadow=True)
        plt.title(title,weight='bold')
        plt.tick_params(labelsize='x-large')
        plt.axis('off')

        plt.subplot(n_rows,2,2*(num+1))
        counts.plot(kind='bar',color='y',edgecolor='r',linewidth=3)
        plt.grid(True)
        plt.tick_params(grid_color='r',grid_linestyle='-.')
        plt.title(title,weight='bold')

        # Write each count just above its bar.
        for k,val in enumerate(counts):
            plt.text(x=k-0.33,y=val*1.02,s=val,weight='bold')




In [None]:
# Draw the matplotlib pie/bar overview for the base categorical columns.
visualise_plot_plt()

In [None]:
def visualize_plotly(top_n=15):
    """Show funnel and donut charts of the most frequent country, genre,
    language and year values in ``df``.

    Each of the four rows holds a funnel chart (left) and a pie chart with a
    hole (right) built from the same ``top_n`` value counts.

    Parameters
    ----------
    top_n : int, default 15
        Maximum number of categories shown per chart.
    """
    # (title suffix, column) for each figure row, top to bottom.
    panels=[
        ('Country',df.primary_country),
        ('Genre',df.primary_genre),
        ('Language',df.primary_language),
        ('Year',df.year),
    ]

    # Both charts in a row share the same title.
    titles=[]
    for label,_ in panels:
        titles.extend(['No.of Movies by '+label]*2)

    fig=make_subplots(rows=len(panels),cols=2,
    specs=[[{"type":'xy'},{"type":'pie'}] for _ in panels],
    subplot_titles=titles)

    for row,(_,column) in enumerate(panels,start=1):
        # Count once per row instead of four times per row.
        counts=column.value_counts().head(top_n)
        fig.add_trace(go.Funnel(y=counts.index,x=counts.values),row,1)
        fig.add_trace(go.Pie(labels=counts.index,values=counts.values,hole=0.5),row,2)

    fig.update_layout(width=1000,height=2000,showlegend=False)

    fig.show()





In [None]:
# Render the plotly funnel/pie overview.
visualize_plotly()

In [None]:
def box_Ind_World():
    """Compare rating distributions of all movies with those of Indian movies.

    Draws a 3x2 grid. The left column uses every row of ``df`` ("World"); the
    right column uses only rows whose ``primary_country`` is ``'India'``.

    * Row 1: box plots of ``avg_vote`` for the five most frequent genres.
    * Row 2: box plots of ``avg_vote`` for the five most frequent languages.
    * Row 3: 2-D density of ``avg_vote`` against ``votes``.

    The India box plots are now restricted to Indian movies. Previously they
    filtered the world-wide frame by India's top categories, so the panels
    titled "(India)" also contained non-Indian movies.

    x/y are passed to ``sb.kdeplot`` by keyword, which requires seaborn >= 0.11;
    the positional form is rejected by newer seaborn releases.
    """
    sb.set_style('white')
    plt.figure(figsize=(20,27))

    india=df[df.primary_country=='India']

    def _top5_boxplot(position,movies,column,title,xlabel,saturation):
        """Box-plot avg_vote for the five most frequent `column` values in `movies`."""
        plt.subplot(3,2,position)
        top5=movies[column].value_counts().head(5).index
        sb.boxplot(x=column,y='avg_vote',data=movies[movies[column].isin(top5)],linewidth=3,saturation=saturation)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Avg Vote")

    def _vote_density(position,movies,title,max_votes):
        """Plot the joint density of avg_vote and votes, clipped to a readable window."""
        plt.subplot(3,2,position)
        sb.kdeplot(x=movies.avg_vote,y=movies.votes)
        plt.ylim(0,max_votes)
        plt.xlim(2,9)
        plt.title(title)
        plt.xlabel("Avg Vote")
        plt.ylabel("Total Votes")

    _top5_boxplot(1,df,'primary_genre',"Genres by Average Vote (World)","Genres",10)
    _top5_boxplot(2,india,'primary_genre',"Genres by Average Vote (India)","Genres",5)

    _top5_boxplot(3,df,'primary_language',"Languages by Average Vote (World)","Languages",5)
    _top5_boxplot(4,india,'primary_language',"Languages by Average Vote (India)","Languages",5)

    _vote_density(5,df,"Average vs Total Votes",100000)
    _vote_density(6,india,"Average vs Total Votes (India)",20000)

In [None]:
# Draw the World vs India comparison grid.
box_Ind_World()

In [None]:
# Rating vs. vote count for movies whose country field is exactly 'India',
# coloured by primary language.
plt.figure(figsize=(20,15))
sb.set(font_scale=1.5)

# Select the subset once instead of re-filtering df for every argument.
india_only=df[df.country=='India']

# x/y must be keywords: seaborn 0.11 deprecated positional data vectors and
# later releases reject them.
sb.scatterplot(x=india_only.avg_vote,y=india_only.votes,hue=india_only.primary_language.to_list())
plt.ylim(0,10000)
plt.legend(loc=1)
plt.title("Avg Rating vs Votes of Movies in India",fontsize='xx-large')
plt.xlabel("Avg Votes",fontsize='large')
plt.ylabel("Total Votes",fontsize='large')


In [None]:
# Number of movies per primary language among rows whose country is exactly 'India'.
india_language_counts = df[df.country=='India'].primary_language.value_counts()
px.funnel(india_language_counts, title="No of Movies by Language in India")

In [None]:
# Movie counts per primary country, stacked by primary genre.
# The title previously said "Language" although the axis is primary_country.
px.histogram(data_frame=df,y='primary_country',color='primary_genre',height=800,width=900,title='No. of Movies in Each Country by Genre')


In [None]:
# Movie counts per primary language, stacked by genre, for rows whose country is exactly 'India'.
india_movies = df[df.country=='India']
px.histogram(
    data_frame=india_movies,
    y='primary_language',
    color='primary_genre',
    height=768,
    width=900,
    title='No. of Movies in Each Language by Genre in India',
)

In [None]:
# Hierarchy country -> language -> genre; missing values are replaced by the
# string 'None' before the frame is handed to plotly.
sunburst_levels = ['primary_country', 'primary_language', 'primary_genre']
px.sunburst(
    df.fillna('None'),
    path=sunburst_levels,
    height=700,
    width=700,
    title='No. of Movies in Each Language by Genre',
)

In [None]:
# Hierarchy language -> genre for rows whose country is exactly 'India';
# missing values are replaced by the string 'None'.
india_filled = df[df.country=='India'].fillna('None')
px.sunburst(
    india_filled,
    path=['primary_language', 'primary_genre'],
    height=700,
    width=700,
    title='No. of Movies in Each Language by Genre in India',
)

In [None]:
# One bubble per primary country, sized by its number of movies.
country_counts = df['primary_country'].value_counts()
px.scatter_geo(
    locationmode='country names',
    locations=country_counts.index,
    size=country_counts,
    projection='kavrayskiy7',
    width=1000,
    color=country_counts.index,
)

In [None]:
# Median avg_vote per primary country, drawn as a choropleth.
# Locations and colours are taken from the same frame so every country is
# paired with its own median. Previously `locations` came from value_counts()
# (ordered by frequency) while `color` came from groupby() (ordered by country
# name), which matched countries with the wrong values.
median_vote_by_country = (
    df.groupby('primary_country')['avg_vote']
    .median()
    .reset_index()
)
px.choropleth(
    median_vote_by_country,
    locationmode='country names',
    locations='primary_country',
    color='avg_vote',
    projection='kavrayskiy7',
    width=1000,
)

In [None]:
# Selected countries: used to filter df.primary_country for the polar scatter plot.
C=['India','USA','UK','France','Japan','Spain','Germany','Mexico']

In [None]:
# Polar scatter of avg_vote by primary language for the selected countries,
# with one animation frame per country.
selected_movies = df[df.primary_country.isin(C)]
px.scatter_polar(
    data_frame=selected_movies,
    theta='primary_language',
    r='avg_vote',
    color='primary_genre',
    symbol='primary_genre',
    animation_frame='primary_country',
)

In [None]:
def get_recommendations_for(title,n=10):
    """Print and plot the ``n`` movies whose rating profile correlates most
    strongly with that of ``title``.

    The rating profile of a movie is its column in ``rat_trans``. Every column
    is correlated with the requested movie's column, the results are sorted in
    descending order, and the first entry is skipped before the next ``n`` are
    taken.

    Parameters
    ----------
    title : str
        Movie title as it appears in ``df.original_title``. An exact match is
        tried first; if that fails and the title is a single word, its
        capitalised form (``'avatar'`` -> ``'Avatar'``) is tried as well.
    n : int, default 10
        Number of recommendations to show.

    Raises
    ------
    KeyError
        If the title cannot be found among the columns of ``rat_trans``.
    """
    # Prefer the title exactly as given, so one-word titles that are not in
    # capitalised form are not mangled; fall back to the capitalised form.
    if title not in rat_trans.columns:
        candidate=title.capitalize() if len(title.split())==1 else title
        if candidate not in rat_trans.columns:
            raise KeyError(f"No movie titled {title!r} found in the ratings table")
        title=candidate

    target=rat_trans[title]
    # Several movies can share a title, in which case the selection above is a
    # DataFrame. Use the first match, the same movie the lookups below resolve to.
    if isinstance(target,pd.DataFrame):
        target=target.iloc[:,0]

    # Position 0 is skipped on the assumption that it is the movie itself
    # (correlation 1 with its own profile).
    top=rat_trans.corrwith(target).sort_values(ascending=False).iloc[1:n+1]

    def _first_match(name):
        """Return the first row of df whose original_title equals `name`."""
        return df[df.original_title==name].iloc[0]

    movie=_first_match(title)
    print('='*125)
    print('='*125)
    print(f" Name: {movie['original_title']}")
    print(f" Cast: {movie['actors']}")
    print(f" Language: {movie['language']}")
    print(f" Genre: {movie['genre']}")
    print(f" Year: {movie['year']}")
    print(f" Description: {movie['description']}")
    print('='*125)
    print('*'*125)

    for t in top.index:
        # Look the row up once per recommendation instead of once per field.
        rec=_first_match(t)

        print('='*125)
        print('|' + ' '*123+'|')

        print(f"| Movie Name: {rec['original_title']}")
        print(f"| Director: {rec['director']}")
        print(f"| Cast: {rec['actors']}")
        print(f"| Language: {rec['language']}")
        print(f"| Genre: {rec['genre']}")
        print(f"| Year: {rec['year']}")
        print(f"| Country: {rec['country']}")
        print(f"| Description: {rec['description']}")
        print(f"| Rating: {rec['avg_vote']}")
        print(f"| Total Votes: {rec['votes']}")

        print('|' + ' '*123+'|')
        print('='*125)
        print('-'*125)

    plt.figure(figsize=(15,n))
    plt.tick_params()
    top.plot(kind='barh')
    plt.ylabel("Movie Name")
    plt.title("Correlation Score",fontsize='xx-large')
    for N,t in enumerate(top):
        # Format as a percentage with one decimal; round(t,3)*100 could print
        # float artefacts such as 99.30000000000001.
        plt.text(x=t+0.01,y=N-0.1,s=f"{t*100:.1f}")

In [None]:
# Example: show the default 10 recommendations for 'Wonder Woman'.
get_recommendations_for('Wonder Woman')