**Importing required Libraries**

In [None]:
import pandas as pd
import numpy as np

**Importing Data Set**

In [None]:
# Kaggle page for the dataset; kept for provenance only, nothing below reads this variable.
dataset_url='https://www.kaggle.com/ruchi798/tv-shows-on-netflix-prime-video-hulu-and-disney'
# Read the CSV from the relative `../input/` directory into the raw frame used by the cleaning step.
tvshows_df = pd.read_csv('../input/tv-shows-on-netflix-prime-video-hulu-and-disney/tv_shows.csv')

**Initial Data Set**

In [None]:
# Show the frame exactly as loaded, before any cleaning.
tvshows_df

# Data Cleaning

In [None]:
# Keep only shows that have an IMDb score, an age rating and a Rotten Tomatoes score.
has_all_ratings = (
    tvshows_df['IMDb'].notna()
    & tvshows_df['Age'].notna()
    & tvshows_df['Rotten Tomatoes'].notna()
)

# Renumber the surviving rows from 0 and drop two columns that are not
# referenced anywhere later in this notebook.
new_df = (
    tvshows_df[has_all_ratings]
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0', 'type'])
)

**Converting the rating columns to numeric types**

In [None]:
# IMDb scores as a standalone NumPy array (copied, so later edits to it cannot touch new_df).
IMDb_rating = new_df['IMDb'].to_numpy(copy=True)

# Rotten Tomatoes scores are stripped of '%' and parsed as floats
# (presumably they load as strings like "93%" - confirm against the CSV).
# Casting to str first keeps this cell re-runnable: once the column has been
# overwritten with floats, a second run parses "93.0" again instead of
# failing on a non-string value.
RT_rating = (
    new_df['Rotten Tomatoes']
    .astype(str)
    .str.strip('%')
    .astype('float64')
    .to_numpy()
)
new_df['Rotten Tomatoes'] = RT_rating

**Cleaned Data Frame**

In [None]:
# Show the cleaned frame (rows with missing ratings removed, Rotten Tomatoes numeric).
new_df

# Data Analysis

**Initial Analysis**

In [None]:
# Summary statistics of the cleaned frame.
new_df.describe()

**IMDb Rating Extremes**

In [None]:
# Every show tied for the lowest IMDb score.
new_df.loc[new_df['IMDb'] == new_df['IMDb'].min()]

In [None]:
# Every show tied for the highest IMDb score.
new_df.loc[new_df['IMDb'] == new_df['IMDb'].max()]

**Unique Age Variables**

In [None]:
# Distinct age-rating labels present after cleaning.
new_df['Age'].unique()

**Rotten Tomatoes Rating Extremes**

In [None]:
# Every show tied for the lowest Rotten Tomatoes score.
rt_scores = new_df['Rotten Tomatoes']
new_df.loc[rt_scores == rt_scores.min()]

In [None]:
# Every show tied for the highest Rotten Tomatoes score.
rt_scores = new_df['Rotten Tomatoes']
new_df.loc[rt_scores == rt_scores.max()]

# Data Visualization

**Required Data Visualization Libraries and Settings**

In [None]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Notebook-wide plot defaults: seaborn dark grid, larger font, 9x5 figures,
# and a figure background whose alpha channel is 00.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 15,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})

**Release Date Histogram**

In [None]:
# Histogram of release years in 6 bins, with each bin's count printed above its bar.
fig, ax = plt.subplots(figsize=(12, 9))
ax.hist(new_df['Year'], bins=6, density=False)
ax.set_xlabel("Year")
# density=False plots raw counts, so the axis is a number of shows, not a relative frequency.
ax.set_ylabel("Number of Shows")
ax.set_title("Year Histogram")

for rect in ax.patches:
    height = rect.get_height()
    # Anchor the label at the top centre of the bar and nudge it 5 points upward.
    ax.annotate(
        f'{int(height)}',
        xy=(rect.get_x() + rect.get_width() / 2, height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

plt.show()

**Ratings Comparison Boxplot**

In [None]:
# Side-by-side box plots of both rating systems.
fig, ax = plt.subplots()

# IMDb scores are multiplied by 10 so they sit on the same axis as Rotten Tomatoes.
bp_data = [IMDb_rating * 10, RT_rating]
ax.boxplot(bp_data)

ax.set_xticklabels(["IMDb", "Rotten tomatoes"])
ax.set_ylabel("Relative Ratings(Scale 0-100)")
ax.set_title("Ratings Boxplot")
plt.show()

**Streaming Service Pie-Chart**

In [None]:
# Per-service availability columns. Summing a column counts the shows on that
# service (assumes the flags are 0/1, in line with the `== 1` tests used in the
# age bar-chart cell - TODO confirm).
N = new_df['Netflix']
H = new_df['Hulu']
P = new_df['Prime Video']
D = new_df['Disney+']

Sum = N.sum()
Hulu_total = H.sum()
Prime_total = P.sum()
Disney_total = D.sum()

# Plot the totals computed above instead of hard-coded numbers, so the chart
# always matches the data that was actually loaded.
show_number = [Sum, Hulu_total, Prime_total, Disney_total]
subs = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']

plt.pie(show_number, labels=subs)
plt.title("Shows per Streaming Service")
plt.show()

**Streaming Service and Age Bar-Chart**

In [None]:
def _count_shows(frame, service, age):
    """Count the rows of `frame` that are on `service` and carry the given `Age` value.

    Parameters
    ----------
    frame : DataFrame with an "Age" column and one flag column per service.
    service : name of the service flag column, e.g. "Netflix".
    age : value to match in the "Age" column (18, 16 or 7 after recoding).

    Returns
    -------
    The sum of the service flag over the matching rows; every matching row has
    flag == 1, so this equals the number of matching shows.
    """
    matches = (frame["Age"] == age) & (frame[service] == 1)
    return frame.loc[matches, service].sum()


# Work on a copy in which the textual age labels are recoded as integers.
df = new_df.copy()
for age_label, age_value in (("18+", 18), ("16+", 16), ("7+", 7)):
    df = df.replace(to_replace=age_label, value=age_value)

# Shows available on each service (non-null titles among rows flagged 1).
Netflix_total = new_df.loc[new_df['Netflix'] == 1, 'Title'].count()
Hulu_total = new_df.loc[new_df['Hulu'] == 1, 'Title'].count()
Prime_Video_total = new_df.loc[new_df['Prime Video'] == 1, 'Title'].count()
Disney_Plus_total = new_df.loc[new_df['Disney+'] == 1, 'Title'].count()

# NOTE(review): each *_all value is a residual (total minus the 18/16/7 counts),
# so it also absorbs any other age label present in the data, not only "all".

# Netflix
Netflix_18, Netflix_16, Netflix_7 = (_count_shows(df, "Netflix", age) for age in (18, 16, 7))
Netflix_all = Netflix_total - (Netflix_16 + Netflix_7 + Netflix_18)

# Hulu
Hulu_18, Hulu_16, Hulu_7 = (_count_shows(df, "Hulu", age) for age in (18, 16, 7))
Hulu_all = Hulu_total - (Hulu_7 + Hulu_16 + Hulu_18)

# Prime Video
Prime_18, Prime_16, Prime_7 = (_count_shows(df, "Prime Video", age) for age in (18, 16, 7))
Prime_all = Prime_Video_total - (Prime_7 + Prime_16 + Prime_18)

# Disney Plus
Disney_18, Disney_16, Disney_7 = (_count_shows(df, "Disney+", age) for age in (18, 16, 7))
Disney_all = Disney_Plus_total - (Disney_7 + Disney_16 + Disney_18)

In [None]:
# Stacked bar chart: one bar per service, split by age rating.
x = ["Netflix", "Hulu", "Prime Video", "Disney+"]
Eighteen_Plus = [Netflix_18, Hulu_18, Prime_18, Disney_18]
Sixteen_Plus = [Netflix_16, Hulu_16, Prime_16, Disney_16]
Seven_Plus = [Netflix_7, Hulu_7, Prime_7, Disney_7]
All = [Netflix_all, Hulu_all, Prime_all, Disney_all]

# Running bottoms: each segment starts where the segments below it end.
b_Seven = list(np.add(Sixteen_Plus, Eighteen_Plus))
b_All = list(np.add(b_Seven, Seven_Plus))

plt.bar(x, Eighteen_Plus, 0.4, label="18+")
plt.bar(x, Sixteen_Plus, 0.4, bottom=Eighteen_Plus, label="16+")
plt.bar(x, Seven_Plus, 0.4, bottom=b_Seven, label="7+")
# Without `bottom` this segment would start at zero and be drawn over the stack.
plt.bar(x, All, 0.4, bottom=b_All, label="All")

plt.xlabel("Subscriptions")
# Bar heights are show counts; the age groups are distinguished by colour.
plt.ylabel("Number of Shows")
plt.title('Subscriptions by Age')
plt.legend()
plt.show()

# Normalization

In [None]:
def z_score(df):
    """Return a copy of `df` with every column standardised to z-scores.

    Each column is transformed as (value - column mean) / column standard
    deviation, using pandas' `Series.std`. The input frame is left untouched.
    """
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return df.apply(_standardise)

**Normalized Data Frame**

In [None]:
# Standardise the three numeric columns with the z_score helper defined above.
numeric_columns = ["Year", "IMDb", "Rotten Tomatoes"]
new_df_numeric = new_df[numeric_columns]
df_normalized = z_score(new_df_numeric)

**Year Normalized Distribution**

In [None]:
# `sns.distplot` is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(df_normalized["Year"], kde=True, stat="density");

In [None]:
# Mean and standard deviation of the standardised Year column.
year_z = df_normalized["Year"]
print("Normalized Curve For Year:")
print("Mean :", year_z.mean())
print("Standard Deviation :", year_z.std())

**IMDb Normalized Distribution**

In [None]:
# `sns.distplot` is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(df_normalized["IMDb"], kde=True, stat="density");

In [None]:
# Mean and standard deviation of the standardised IMDb column.
imdb_z = df_normalized["IMDb"]
print("Normalized Curve For IMDb Rating:")
print("Mean :", imdb_z.mean())
print("Standard Deviation :", imdb_z.std())

**Rotten Tomatoes Normalized Distribution**

In [None]:
# `sns.distplot` is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(df_normalized["Rotten Tomatoes"], kde=True, stat="density");

In [None]:
# Mean and standard deviation of the standardised Rotten Tomatoes column.
rt_z = df_normalized['Rotten Tomatoes']
print("Normalized Curve For Rotten Tomatoes Rating:")
print("Mean :", rt_z.mean())
print("Standard Deviation :", rt_z.std())

# Correlation Analysis

In [None]:
# new_df still holds text columns (e.g. Title, Age). Recent pandas no longer
# drops them silently in corr() and raises instead, so restrict the frame to
# numeric columns explicitly.
new_df.select_dtypes(include=[np.number]).corr()

**Correlation with Outliers**

In [None]:
# IMDb vs Rotten Tomatoes on the full cleaned data (outliers still included).
subset_df = new_df[['IMDb', 'Rotten Tomatoes']]

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the correlation matrix has to be printed to appear in the output.
print(subset_df.corr())

sns.regplot(x="IMDb", y="Rotten Tomatoes", data=subset_df);

**IMDb rating and Rotten Tomatoes rating Correlational Analysis**

In [None]:
# 1.5 * IQR fences for IMDb, clamped to the observed minimum and maximum.
IMDb_rating = np.array(new_df['IMDb'])
imdb_q1, imdb_q3 = np.percentile(IMDb_rating, [25, 75])
imdb_iqr = imdb_q3 - imdb_q1

imdb_lower_fence = imdb_q1 - 1.5 * imdb_iqr
imdb_upper_fence = imdb_q3 + 1.5 * imdb_iqr
min_IMDb_no_outliers = max(imdb_lower_fence, IMDb_rating.min())
max_IMDb_no_outliers = min(imdb_upper_fence, IMDb_rating.max())

print("Minimum IMDb rating without outliers is ", min_IMDb_no_outliers)
print("Maximum IMDb rating without outliers is ", max_IMDb_no_outliers)

In [None]:
# 1.5 * IQR fences for Rotten Tomatoes, clamped to the observed minimum and maximum.
RT_rating = np.array(new_df['Rotten Tomatoes'])
rt_q1, rt_q3 = np.percentile(RT_rating, [25, 75])
rt_iqr = rt_q3 - rt_q1

rt_lower_fence = rt_q1 - 1.5 * rt_iqr
rt_upper_fence = rt_q3 + 1.5 * rt_iqr
min_RT_no_outliers = max(rt_lower_fence, RT_rating.min())
max_RT_no_outliers = min(rt_upper_fence, RT_rating.max())

print("Minimum Rotten Tomatoes rating without outliers is ", min_RT_no_outliers)
print("Maximum Rotten Tomatoes rating without outliers is ", max_RT_no_outliers)

**Removing Outliers**

In [None]:
# Keep only rows whose IMDb and Rotten Tomatoes scores both lie inside their
# fences (bounds inclusive).
ratings = new_df[['IMDb', 'Rotten Tomatoes']]
inside_fences = (
    ratings['IMDb'].between(min_IMDb_no_outliers, max_IMDb_no_outliers)
    & ratings['Rotten Tomatoes'].between(min_RT_no_outliers, max_RT_no_outliers)
)
subset_df = ratings[inside_fences]

In [None]:
# IMDb / Rotten Tomatoes rows that survived the outlier filter in the previous cell.
subset_df

In [None]:
# Column means and standard deviations of the filtered subset, displayed together as a tuple.
subset_df.mean(),subset_df.std()

In [None]:
# Regression plot of Rotten Tomatoes against IMDb on the outlier-free subset.
# (The disabled scatter-plot snippet that used to sit here as a string literal
# had no effect and was removed.)
ax = sns.regplot(x="IMDb", y="Rotten Tomatoes", data=subset_df)
ax.set_title("Rotten Tomatoes vs IMDb (outliers removed)");

In [None]:
# Correlation matrix of IMDb vs Rotten Tomatoes on the outlier-free subset.
subset_df.corr()

**IMDb and Year Correlational Analysis**

In [None]:
# Start the Year / IMDb analysis from the full cleaned data (outliers included).
subset_df = new_df.loc[:, ['Year', 'IMDb']]

In [None]:
# Year and IMDb columns for every cleaned show (outliers still included).
subset_df

In [None]:
# Correlation matrix of Year vs IMDb with outliers still included.
subset_df.corr()

**Plot with Outliers**

In [None]:
# Regression plot of IMDb against release year, outliers included. The title and
# the trailing semicolon keep the output consistent with the other regplot cells.
ax = sns.regplot(x='Year', y='IMDb', data=subset_df)
ax.set_title("IMDb vs Year (with outliers)");

**Removing Outliers**

In [None]:
# 1.5 * IQR fences for the release year, clamped to the observed range.
year_rating = np.array(new_df['Year'])
year_q1, year_q3 = np.percentile(year_rating, [25, 75])
year_iqr = year_q3 - year_q1

# Years are whole numbers, so round each fence inward: ceil for the lower bound
# and floor for the upper bound. Truncating the lower fence with int() would
# round it down and keep a year that actually lies below the fence.
min_year_no_outliers = int(np.ceil(max(year_q1 - 1.5 * year_iqr, year_rating.min())))
max_year_no_outliers = int(np.floor(min(year_q3 + 1.5 * year_iqr, year_rating.max())))

print("Minimum Year without outliers is ", min_year_no_outliers)
print("Maximum Year without outliers is ", max_year_no_outliers)

In [None]:
# Keep only rows whose Year and IMDb values both lie inside their fences
# (bounds inclusive).
year_imdb = new_df[['Year', 'IMDb']]
inside_fences = (
    year_imdb['Year'].between(min_year_no_outliers, max_year_no_outliers)
    & year_imdb['IMDb'].between(min_IMDb_no_outliers, max_IMDb_no_outliers)
)
subset_df = year_imdb[inside_fences]

In [None]:
# Year / IMDb rows that survived the outlier filter in the previous cell.
subset_df

In [None]:
# Column means and standard deviations of the filtered Year / IMDb subset, displayed as a tuple.
subset_df.mean(),subset_df.std()

In [None]:
# Correlation matrix of Year vs IMDb on the outlier-free subset.
subset_df.corr()

**Plot without Outliers**

In [None]:
# Regression plot of IMDb against release year on the outlier-free subset. The
# title and the trailing semicolon keep the output consistent with the other
# regplot cells.
ax = sns.regplot(x='Year', y='IMDb', data=subset_df)
ax.set_title("IMDb vs Year (outliers removed)");