# VISUAL ANALYSIS OF THE MOVIES DATASET

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the streaming-platform movie catalogue from the ../input folder and
# build the working DataFrame used by every later cell.
csv_path = "../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
movies = pd.read_csv(csv_path)
df = pd.DataFrame(data=movies)
df.head()

## CLEANING DATA

In [None]:
# Dimensions (rows, columns) of the table before any cleaning is applied.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Columns that no later cell reads; remove them from the working frame in place.
col = ['Unnamed: 0', 'Rotten Tomatoes', 'Directors']
df.drop(columns=col, inplace=True)

In [None]:
# Normalise the age labels: remove every "+" and turn "all" into "0".
# regex=False forces literal matching; interpreted as a regular expression a
# lone "+" is an invalid pattern and makes str.replace raise on pandas
# versions where regex defaults to True.
df['Age'] = df['Age'].str.replace('+', '', regex=False)
df['Age'] = df['Age'].str.replace('all', '0', regex=False)

In [None]:
# Fill missing ages with the median age.
# The column is converted to numbers first so the median is computed on
# numeric values; to_numeric is a no-op when the column is already numeric.
df['Age'] = pd.to_numeric(df['Age'])
median = df['Age'].median()
df['Age'] = df['Age'].fillna(median)

In [None]:
# Drop, in place, every row that still has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Dimensions (rows, columns) after rows with missing values were dropped.
df.shape

In [None]:
# Preview the first rows of the cleaned table.
df.head()

In [None]:
# Store runtime and minimum age as whole numbers.
for column in ('Runtime', 'Age'):
    df[column] = df[column].astype(int)

In [None]:
# Distinct values present in the Type column.
df.Type.unique()

In [None]:
# Column labels currently in the table.
df.columns

In [None]:
# Remove the Type column from the working frame in place.
df.drop(columns='Type', inplace=True)

In [None]:
# Dimensions (rows, columns) after the Type column was removed.
df.shape

## VISUALISING IMDb RATINGS

In [None]:
# Number of titles for each distinct IMDb rating, as a two-column frame
# (IMDb, Title) where Title holds the count.
titles_per_rating = df.groupby('IMDb')["Title"].count()
df1 = titles_per_rating.reset_index()

In [None]:
# Helper that tidies a bar chart and writes each bar's value next to it.
def data_graph(axis, width, height):
    """Hide the top/right spines and annotate every patch with its height.

    Parameters:
        axis: object exposing ``spines``, ``patches`` and ``annotate``
            (the charts in this notebook pass the axes returned by seaborn).
        width: horizontal offset added to each patch's x position to place
            the label.
        height: vertical offset added to each patch's height to place
            the label.

    The label text is the patch height formatted with no decimals.
    """
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

    for bar in axis.patches:
        bar_height = bar.get_height()
        label = f"{bar_height:.0f}"
        axis.annotate(label, (bar.get_x() + width, bar_height + height))

In [None]:
# Bar chart: how many titles share each IMDb rating, with the count written
# above every bar.
plt.figure(figsize=(25, 10))
ax = sns.barplot(data=df1, x="IMDb", y="Title", palette="Spectral")
ax.set_title("No. of Movies of specific IMDb Ratings")
ax.set_xlabel("IMDb Rating")
ax.set_ylabel("No. of Movies")
plt.xticks(rotation=60, ha='right')
data_graph(ax, 0.08, 5)
plt.show()

In [None]:
# Distribution of IMDb ratings: density-normalised histogram with a KDE curve.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat="density" is its documented replacement.
plt.figure(figsize=(25, 10))
x = df["IMDb"]
ax = sns.histplot(x, kde=True, stat="density", color='r')
plt.title("IMDb Distribution")
plt.xticks(rotation=60, ha='right')
plt.xlabel("IMDb Rating")
plt.ylabel("Density")
plt.show()

## VISUALISING MOVIES ALONG THE YEARS

In [None]:
# Number of titles for each release year, as a two-column frame
# (Year, Title) where Title holds the count.
titles_per_year = df.groupby('Year')["Title"].count()
df2 = titles_per_year.reset_index()

In [None]:
# Bar chart: number of titles per release year, with the count written above
# every bar.
plt.figure(figsize=(25, 20))
ax = sns.barplot(data=df2, x="Year", y="Title", palette="magma")
ax.set_title("No. of Movies along the Years")
ax.set_xlabel("Year")
ax.set_ylabel("No. of Movies")
plt.xticks(rotation=90, ha='right')
data_graph(ax, 0.08, 5)
plt.show()

In [None]:
# Distribution of release years: density-normalised histogram with a KDE curve.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat="density" is its documented replacement.
plt.figure(figsize=(25, 10))
x = df["Year"]
ax = sns.histplot(x, kde=True, stat="density", color='y')
plt.title("Year wise distribution")
plt.xticks(rotation=60, ha='right')
plt.xlabel("Year")
plt.ylabel("Density")
plt.show()

In [None]:
def split_mulcol(x):
    """Expand a Series of comma-separated strings into boolean indicator columns.

    Parameters:
        x: pandas Series whose non-null entries are strings such as "a,b,c".

    Returns:
        A DataFrame indexed like ``x`` with one boolean column per distinct
        token, in order of first appearance. A cell is True when that row's
        string contains the token; rows where ``x`` is null are all False.

    Tokens are split on "," only and are not stripped of whitespace.
    """
    # Collect the columns in a dict and build the frame once at the end. This
    # keeps first-appearance order, avoids inserting columns one at a time into
    # a DataFrame, and cannot collide with the name of the input Series.
    flags = {}

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for label, cell in x[x.notnull()].items():
        for token in cell.split(','):
            if token not in flags:
                flags[token] = pd.Series(False, index=x.index)
            flags[token].at[label] = True

    return pd.DataFrame(flags, index=x.index)

## VISUALISING LANGUAGE DISTRIBUTION OF THE MOVIES

In [None]:
# Count how many titles list each language, most frequent first, and store
# the result as a two-column table (Language, No. of Movies).
language_flags = split_mulcol(df["Language"])
language_counts = language_flags.sum().sort_values(ascending=False)
lang = pd.DataFrame(
    language_counts.reset_index().values,
    columns=['Language', "No. of Movies"],
)

In [None]:
# Bar chart of the 20 most frequent languages, with the count written above
# every bar.
plt.figure(figsize=(15, 5))
lang = lang.head(20)
ax = sns.barplot(data=lang, x="Language", y="No. of Movies", palette="coolwarm")
ax.set_title("Top 20 languages for movies")
ax.set_xlabel("Language")
ax.set_ylabel("No. of Movies")
plt.xticks(rotation=45, ha='right')
data_graph(ax, 0.08, 5)
plt.show()

## VISUALISING THE GENRE DISTRIBUTION

In [None]:
# Count how many titles carry each genre, most frequent first, and keep the
# first 20 rows as a two-column table (Genre, No. of Movies).
genre_flags = split_mulcol(df["Genres"])
genre_counts = genre_flags.sum().sort_values(ascending=False)
genres = pd.DataFrame(
    genre_counts.reset_index().values,
    columns=['Genre', "No. of Movies"],
).head(20)

In [None]:
# Bar chart of titles per genre, with the count written above every bar.
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=genres, x="Genre", y="No. of Movies", palette="rocket")
ax.set_title("Distribution among the Genres")
ax.set_xlabel("Genres")
ax.set_ylabel("No. of Movies")
plt.xticks(rotation=45, ha='right')
data_graph(ax, 0.08, 5)
plt.show()

## VISUALISING THE DISTRIBUTION AMONGST THE PLATFORMS

In [None]:
# Sum the four platform columns, largest total first, and store the result as
# a two-column table (Platform, No. of Movies).
platform_totals = df[["Netflix", "Hulu", "Prime Video", "Disney+"]].sum()
platform_totals = platform_totals.sort_values(ascending=False)
plat_dist = pd.DataFrame(
    platform_totals.reset_index().values,
    columns=['Platform', "No. of Movies"],
)

In [None]:
# Pie chart of each platform's share of titles; every wedge is pulled out
# slightly and outlined in black.
plt.figure(figsize=(7, 7))
dt = plat_dist["No. of Movies"]
labels = plat_dist["Platform"]
exp = (0.1,) * 4
clr = ("brown", "green", "cyan", "yellow")
wp = dict(linewidth=2, edgecolor="black")
plt.pie(
    dt,
    labels=labels,
    autopct='%1.1f%%',
    explode=exp,
    colors=clr,
    shadow=True,
    wedgeprops=wp,
)
plt.title('Distribution of Movies among the Platforms')
plt.axis('equal')
plt.show()

In [None]:
# mpppy => movies per platform per year
# One line per platform showing the yearly sum of that platform's column.
# The legend label is taken from the column name so the two cannot drift
# apart (the legend previously read "Prime Videos" for the "Prime Video" column).
platform_colors = {
    'Netflix': 'red',
    'Hulu': 'green',
    'Prime Video': 'blue',
    'Disney+': 'black',
}
mpppy = df.groupby('Year')[list(platform_colors)].sum()
plt.figure(figsize=(16, 8))
for platform, color in platform_colors.items():
    sns.lineplot(x=mpppy.index, y=mpppy[platform], color=color, label=platform)
plt.title("Movies per platform of particular years")
plt.legend()
plt.xlabel('Release Year')
plt.ylabel('No. of Movies')
plt.show()

## VISUALISING ON THE BASIS OF RUNTIME

In [None]:
# The 50 rows with the largest Runtime, keeping only Title and Runtime.
runtime_sorted = df[['Title', 'Runtime']].sort_values('Runtime', ascending=False)
df_run = runtime_sorted.head(50)

In [None]:
# Horizontal bar chart of the longest titles; each bar's runtime is written
# in red at the end of the bar.
plt.figure(figsize=(20, 15))
ax = sns.barplot(data=df_run, x="Runtime", y="Title", palette="mako")
ax.set_title("Top 50 movies with highest runtime")
ax.set_xlabel("Runtime (in minutes)")
ax.set_ylabel("Title")
plt.xticks(rotation=45, ha='right')
for row, minutes in enumerate(df_run["Runtime"]):
    # x is the runtime value; y is the row position nudged down by 0.25.
    ax.text(minutes, row + .25, str(minutes), color='red', fontweight='bold')
plt.show()

## VISUALISING ON THE BASIS OF COUNTRY

In [None]:
# Count how many titles list each country, most frequent first, and keep the
# first 30 rows as a two-column table (Country, No. of Movies).
country_flags = split_mulcol(df["Country"])
country_counts = country_flags.sum().sort_values(ascending=False)
cont = pd.DataFrame(
    country_counts.reset_index().values,
    columns=['Country', "No. of Movies"],
).head(30)

In [None]:
# Bar chart of the 30 most frequent countries, with the count written above
# every bar.
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=cont, x="Country", y="No. of Movies", palette="viridis")
ax.set_title("Top 30 countries with the most releases")
ax.set_xlabel("Country")
ax.set_ylabel("No. of Movies")
plt.xticks(rotation=74, ha='right')
data_graph(ax, 0.08, 5)
plt.show()

## VISUALISING THE AGE CATEGORY DISTRIBUTION

In [None]:
# Number of titles for each minimum-age value, largest count first, stored as
# a two-column table (Age, No. of Movies). Rebuilding the frame from the raw
# values sets the column names regardless of what reset_index() called them.
age_counts = df.Age.value_counts().reset_index()
df_age = pd.DataFrame(age_counts.values, columns=['Age', "No. of Movies"])
df_age = df_age.sort_values('No. of Movies', ascending=False)

In [None]:
# Bar chart of titles per minimum-age value, with the count written above
# every bar. The chart is drawn from df_age, the table that actually holds
# the plotted columns (the full movie table was previously passed as data).
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="Age", y="No. of Movies", data=df_age, palette="rocket")
plt.title("No. of Movies for the minimum age")
plt.xticks(rotation=45, ha='right')
plt.xlabel("Min age")
plt.ylabel("No. of Movies")
data_graph(ax, 0.08, 5)
plt.show()