In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Load the movies-on-streaming-platforms CSV into the main working frame `df`.
MOVIES_CSV = "../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
df = pd.read_csv(MOVIES_CSV)

In [None]:
# Preview the first five rows to check that the file loaded as expected.
df.head(n=5)

In [None]:
# Remove the "Unnamed: 0" column that came in with the CSV.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Share of missing values in each column, as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)

**There are many NaN values in the Rotten Tomatoes column, so most of the EDA below is based on IMDb ratings.**

In [None]:
# Best Movies According to Rotten Tomatoes
# The column stores percentage strings, so sorting it directly is lexicographic
# and ranks "99%" above "100%". Parse the number first and sort on that instead;
# unparseable / missing ratings become NaN and are placed last.
rt_percent = pd.to_numeric(
    df["Rotten Tomatoes"].astype(str).str.rstrip("%"),
    errors="coerce",
)
df.loc[rt_percent.sort_values(ascending=False).index].head(20)

In [None]:
# Count of distinct values in the Directors column (missing values excluded).
df["Directors"].nunique(dropna=True)

In [None]:
# Build one frame per platform: keep the rows whose platform column equals 1.
df_netflix = df.loc[df["Netflix"].eq(1)]
df_hulu = df.loc[df["Hulu"].eq(1)]
df_prime = df.loc[df["Prime Video"].eq(1)]
df_disney = df.loc[df["Disney+"].eq(1)]

# Report (rows, columns) for each platform frame, then for the full dataset.
for platform_frame in (df_disney, df_hulu, df_netflix, df_prime, df):
    print(platform_frame.shape)

In [None]:
# Each per-platform frame only needs its own platform column, so drop the other three.
# The frames are filtered slices of `df`; dropping with inplace=True on them can
# trigger SettingWithCopyWarning and raises KeyError if the cell is run twice.
# Reassigning with errors="ignore" avoids both problems.
df_netflix = df_netflix.drop(columns=['Hulu', 'Prime Video', 'Disney+'], errors="ignore")
df_disney = df_disney.drop(columns=['Hulu', 'Prime Video', 'Netflix'], errors="ignore")
df_hulu = df_hulu.drop(columns=['Netflix', 'Prime Video', 'Disney+'], errors="ignore")
df_prime = df_prime.drop(columns=['Hulu', 'Netflix', 'Disney+'], errors="ignore")

In [None]:
# Preview the first five rows of the Netflix-only frame.
df_netflix.head(n=5)

### Data Visualization

In [None]:
# The 15 Disney+ titles with the highest IMDb rating, as horizontal bars.
top_disney = df_disney.sort_values("IMDb", ascending=False).head(15)
fig_disney, ax_disney = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_disney, x="IMDb", y="Title", ax=ax_disney)
ax_disney.set_title("Best Movies on Disney+")
plt.show()

In [None]:
# The 15 Hulu titles with the highest IMDb rating, as horizontal bars.
top_hulu = df_hulu.sort_values("IMDb", ascending=False).head(15)
fig_hulu, ax_hulu = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_hulu, x="IMDb", y="Title", ax=ax_hulu)
ax_hulu.set_title("Best Movies on Hulu")
plt.show()

In [None]:
# The 15 Netflix titles with the highest IMDb rating, as horizontal bars.
top_netflix = df_netflix.sort_values("IMDb", ascending=False).head(15)
fig_netflix, ax_netflix = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_netflix, x="IMDb", y="Title", ax=ax_netflix)
ax_netflix.set_title("Best Movies on Netflix")
plt.show()

In [None]:
# The 15 Prime Video titles with the highest IMDb rating, as horizontal bars.
top_prime = df_prime.sort_values("IMDb", ascending=False).head(15)
fig_prime, ax_prime = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_prime, x="IMDb", y="Title", ax=ax_prime)
ax_prime.set_title("Best Movies on Prime")
plt.show()

In [None]:
# Pie chart of how many titles each platform carries.
count_prime, count_hulu, count_netflix, count_disney = map(
    len, (df_prime, df_hulu, df_netflix, df_disney)
)

# Labels and values are kept in the same platform order.
names = ["Prime", "Hulu", "Netflix", "Disney+"]
values = [count_prime, count_hulu, count_netflix, count_disney]

fig = px.pie(
    names=names,
    values=values,
    color_discrete_sequence=px.colors.sequential.Agsunset,
)
fig.show()

In [None]:
# The 20 titles with the longest runtime across the whole dataset, as horizontal bars.
df1 = df.sort_values("Runtime", ascending=False).head(20)
runtime_fig, runtime_ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=df1, x="Runtime", y="Title", ax=runtime_ax)

In [None]:
# Which platform carries the most highly rated movies?
# "Highly rated" here means an IMDb rating of 8 or above.
disney8 = df_disney.loc[df_disney["IMDb"].ge(8)]
hulu8 = df_hulu.loc[df_hulu["IMDb"].ge(8)]
netflix8 = df_netflix.loc[df_netflix["IMDb"].ge(8)]
prime8 = df_prime.loc[df_prime["IMDb"].ge(8)]

# Number of such titles per platform.
count_disney8, count_hulu8, count_netflix8, count_prime8 = map(
    len, (disney8, hulu8, netflix8, prime8)
)

# Labels and values are kept in the same platform order.
names = ["Prime", "Hulu", "Netflix", "Disney+"]
values = [count_prime8, count_hulu8, count_netflix8, count_disney8]
fig = px.pie(names=names, values=values)
fig.show()

In [None]:
# Histogram of the Year column over the whole dataset.
fig = px.histogram(data_frame=df, x="Year", width=800, height=400)
fig.show()

In [None]:
# How about correlation between IMDb and Rotten Tomatoes?
# Build a numeric "New_Rotten" column on the same 0-10 scale as IMDb by taking
# the number before the "%" sign and dividing it by 10.
#
# Changes from the previous version of this cell:
# - The raw "Rotten Tomatoes" column is no longer overwritten with strings, so
#   its missing values stay visible to isnull() and the cell is safe to re-run.
# - No chained `df[col].method(..., inplace=True)` calls: those operate on a
#   temporary selection and are not guaranteed to update `df` in recent pandas.
# - Missing ratings stay NaN instead of becoming 0. A rating of 0 is a real
#   (very bad) score, so zero-filling added fake points along y=0 in the scatter
#   and dragged down any mean computed from New_Rotten.
rt_percent = pd.to_numeric(
    df["Rotten Tomatoes"].astype(str).str.split("%").str[0],
    errors="coerce",  # "nan" and anything else non-numeric become NaN
)
df["New_Rotten"] = rt_percent / 10

# Rows with a missing rating on either axis are simply not drawn.
fig = px.scatter(df, x="IMDb", y="New_Rotten")
fig.show()

**Although the NaN values make it difficult to draw firm conclusions, the correlation between IMDb and Rotten Tomatoes appears to be lower than expected.**

In [None]:
# Top 15 movies by IMDb rating (left) and by rescaled Rotten Tomatoes score (right).
# The old header said "Top 20" while the code plotted 15; the count now lives in
# one constant, which also feeds the panel titles, so the two cannot drift apart.
TOP_N = 15

df1 = df.sort_values("IMDb", ascending=False).head(TOP_N)
df2 = df.sort_values("New_Rotten", ascending=False).head(TOP_N)

fig, axes = plt.subplots(1, 2, figsize=(30, 15))

sns.barplot(ax=axes[0], data=df1, x="IMDb", y="Title")
axes[0].set_title(f"Top {TOP_N} Movies by IMDb")

sns.barplot(ax=axes[1], data=df2, x="New_Rotten", y="Title")
axes[1].set_title(f"Top {TOP_N} Movies by Rotten Tomatoes (0-10 scale)")

# Render the figure explicitly so the cell does not end with a bare Axes repr.
plt.show()

In [None]:
# Which values of the Genres column occur most often?
# Counts titles per distinct Genres value and keeps the 15 largest groups.
# NOTE(review): grouping is on the raw Genres value; if a cell can hold several
# genres joined together, each combination is counted as its own category - confirm.
df1 = (
    df.groupby("Genres")["Title"]
    .count()
    .reset_index()
    .sort_values("Title", ascending=False)
    .head(15)
)

# After the aggregation, the "Title" column holds the per-group title count.
fig = px.bar(data_frame=df1, x="Genres", y="Title", color="Title")
fig.show()

In [None]:
# Best Directors
# Mean IMDb rating and mean rescaled Rotten Tomatoes score for a hand-picked
# list of directors.

directors = ["Christopher Nolan", "Anthony Russo,Joe Russo", "David Fincher",
             "Quentin Tarantino", "Martin Scorsese", "Steven Spielberg",
             "Alfred Hitchcock", "Stanley Kubrick", "Lana Wachowski,Lilly Wachowski",
             "Taika Waititi", "Wes Anderson"]

# Select all listed directors in one pass. Matching is on the exact Directors
# string, as before. This replaces a concat-in-a-loop that seeded the result with
# the Christopher Nolan rows and then appended them a second time. Because each
# row is now selected once, the follow-up `df.drop_duplicates(keep=False,
# inplace=True)` is gone too: it acted on the full `df`, not on this selection.
df_directors1 = df.loc[df["Directors"].isin(directors)]

# Select the two rating columns with a list: indexing a groupby with a bare
# tuple of column names is not supported by current pandas releases.
a = (
    df_directors1.groupby("Directors")[["IMDb", "New_Rotten"]]
    .mean()
    .reset_index()
)
a.head()

In [None]:
# Mean IMDb rating for each selected director, as horizontal bars.
fig = px.bar(data_frame=a, x="IMDb", y="Directors", color="IMDb")
fig.show()

In [None]:
# Mean rescaled Rotten Tomatoes score for each selected director, as horizontal bars.
fig = px.bar(data_frame=a, x="New_Rotten", y="Directors", color="New_Rotten")
fig.show()