In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Movies in OTT Platforms

Movies are considered to be an important art form, a worldwide source of entertainment, and a powerful medium for educating or indoctrinating citizens. As far as the current pandemic situation is concerned, OTT platforms act as one of the most entertaining factors and a significant stress reliever for people around the globe.

This project aims to explore all the movies on popular OTT platforms in order to gain interesting insights. This is carried out with the aid of a Kaggle dataset collected from the Netflix, Prime Video, Hulu and Disney+ APIs.

The dataset contains the complete information of all the movies, their ratings and the corresponding OTT platforms on which they are available. It provides detailed information such as Year of release, Genre, IMDb rating, Director and the Language of each movie.

## Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the Dataset

In [None]:
# Load the streaming-platform movie catalogue CSV from the Kaggle input directory into a DataFrame.
movies=pd.read_csv('/kaggle/input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv')

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
movies.head()

### Check the size of the DataFrame

In [None]:
# (number of rows, number of columns) of the raw dataset.
movies.shape

# Data Description

In [None]:
# Summary statistics (count, mean, std, quartiles) of the dataset's columns.
movies.describe()

In [None]:
# Column names, dtypes and non-null counts; used to spot columns with missing data.
movies.info()

# Data Cleaning

### Dropping duplicate records

In [None]:
# Remove rows that are exact duplicates across every column.
# NOTE(review): this runs while the 'Unnamed: 0' and 'ID' columns are still
# present, so two rows only count as duplicates if those columns match too.
movies = movies.drop_duplicates()

### Dropping columns with no information

In [None]:
# Drop the columns that are not used in the analysis (see the notes below the
# cell for why 'Type' and 'Rotten Tomatoes' are removed).
# errors='ignore' keeps the cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
movies = movies.drop(
    columns=['Unnamed: 0', 'ID', 'Type', 'Rotten Tomatoes'],
    errors='ignore',
)

- The Type column only contains the value 0, so it carries no useful information for the EDA process.
- The Rotten Tomatoes column contains almost 70% NULL values, so imputing it could distort the actual meaning of the data. Hence, the column is dropped.

### Handling Null values

checking the total null values in each column.

In [None]:
# Number of missing values per column.
movies.isnull().sum()

- Directors column has 726 NULL values as the above data includes non-fictional shows. Hence, those records cannot be filled with other values.

In [None]:
# Plot the distribution of IMDb ratings and report its mean and median,
# which are compared below to decide how to fill the missing ratings.
imdb_ratings = movies['IMDb']
sns.distplot(imdb_ratings)
imdb_mean = imdb_ratings.mean()
imdb_median = imdb_ratings.median()
print('Mean:', imdb_mean, 'Median:', imdb_median)

- The above figure shows that the IMDb column is approximately normally distributed, with the mean and median almost equal, which suggests that the distribution is not noticeably skewed by outliers.
- Hence, we fill the missing values in the IMDb column with the median value.

In [None]:
# Fill missing IMDb ratings with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form mutates
# an intermediate object and does not update `movies` when pandas
# Copy-on-Write is active.
movies['IMDb'] = movies['IMDb'].fillna(movies['IMDb'].median())

We are going to delete records that have fewer than 8 non-null values (this is what `thresh=8` keeps), since such sparse rows are of little use for any kind of analysis.

In [None]:
# Keep only the rows that have at least 8 non-null values; sparser rows are discarded.
movies = movies.dropna(axis='index', thresh=8)

### Conversion of object column to integer to get some useful insights from age column

In [None]:
# Distinct age-rating labels and how often each occurs (missing values are not counted).
movies['Age'].value_counts()

In [None]:
# Map each age-rating label to its minimum viewer age ('all' -> 0).
# One replace() with a single mapping, assigned back to the column, is used
# instead of five chained in-place calls: the in-place form on a selected
# column does not update `movies` when pandas Copy-on-Write is active.
# Labels not listed here (and missing values) are left unchanged.
age_label_to_min_age = {'18+': 18, '16+': 16, '13+': 13, '7+': 7, 'all': 0}
movies['Age'] = movies['Age'].replace(age_label_to_min_age)

In [None]:
# Preview the cleaned frame, including the recoded Age column.
movies.head()

# Visualization

### movie count across each platform

In [None]:
# Number of movies available on each platform (rows flagged with 1 in the
# platform's column).
# eq(1).sum() counts the flagged rows directly; indexing value_counts() with
# [1] raises KeyError when a platform has no flagged rows at all.
platforms = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
platform_counts = movies[platforms].eq(1).sum()

ax = sns.barplot(x=platforms, y=platform_counts.values)
ax.set(xlabel='Platform', ylabel='Number of movies', title='Movie count across each platform')
plt.show()

- Most of the movies in our data set are from prime video platform.
- followed by Netflix,Hulu and Disney+.

## Movies Distribution across Years

In [None]:
# Report the earliest and the median release year in the catalogue.
# The median is the midpoint of the release years, not the year with the most
# movies, so the second message states it as the median (the most frequent
# years are listed separately with value_counts()).
earliest_year = movies['Year'].min()
median_year = movies['Year'].median()
print('Movies released from the year %d are available in the above OTT platforms '%(earliest_year))
print('The median release year of the movies available in the above OTT platforms is %d '%(median_year))


In [None]:
# Distribution of release years with vertical markers at the minimum,
# maximum and median year.
release_year = movies['Year']
ax = sns.distplot(release_year)
markers = [
    (release_year.min(), 'green', 'min'),
    (release_year.max(), 'red', 'max'),
    (release_year.median(), 'yellow', 'median'),
]
for position, colour, label in markers:
    ax.axvline(position, color=colour, label=label)
ax.legend()

- Based on the above distribution it is clearly visible that we could find a lot of movies released between the years 2005 and 2020 in above OTT platforms.


In [None]:
# The five release years with the most movies, most frequent first.
print(movies['Year'].value_counts().head())

- Many movies in the OTT platforms are found in the year 2015,2016,2017,2018 which may be seen as the evolution years of these OTT platforms and probably many users started using these platforms during these years which have influenced them to get more movies.

## Visualization based on Genres

The Genres column holds several genres per movie, separated by commas. To derive useful information from it, we split it into one row per individual genre and store the result in another dataframe, movies_Genre.

In [None]:
# Build movies_Genre: one row per (movie, genre) pair.
# The comma-separated 'Genres' string is split into a list and exploded into
# one entry per genre; the exploded Series keeps the movie's original index,
# so joining on the index repeats the other columns once per genre.
# Movies with a missing 'Genres' value keep a single row with NaN.
# Series.explode replaces the slower apply(pd.Series, 1).stack() pattern,
# whose positional `1` was passed to apply() rather than to pd.Series.
s = movies['Genres'].str.split(',').explode()
s.name = 'Genres'
movies_Genre = movies.drop(columns='Genres').join(s)

In [None]:
# Preview the genre-level frame (the index repeats for movies with several genres).
movies_Genre.head()

### count of movies based on  Genres across all OTT Platform

In [None]:
from collections import Counter
from wordcloud import WordCloud

# Count how many (movie, genre) rows carry each genre label.
# Missing genres are dropped first: converting them with str() would add a
# literal 'nan' entry that shows up as a bogus genre in the word cloud.
# Labels are stripped so that stray whitespace does not split one genre
# into several keys.
genre_labels = movies_Genre['Genres'].dropna().astype(str).str.strip()
genre_count = Counter(genre_labels)

# Render the genre frequencies as a word cloud (word size follows the count).
wc = WordCloud(background_color='white')
wc.generate_from_frequencies(genre_count)
plt.figure(figsize=(12,8))
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

- All the OTT platforms serve a variety of genres, starting from Drama to News.
- Out of all the genres, Drama, Comedy, Thriller and Action form the majority across all the platforms.
- This shows that the OTT platforms are more interested in buying these genres, which typically reflects the choice of genres among the audience.

### Count of movies based on Genres across each OTT Platform

- Let's deeply visualize the genres spread across each platform one by one.

In [None]:
# Side-by-side bar charts of the genre counts on Netflix and Hulu.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = [
    ('Netflix', 'orangered', 'Netflix Genres'),
    ('Hulu', 'forestgreen', 'Hulu Genres'),
]
for ax, (platform, colour, title) in zip(axes, panels):
    on_platform = movies_Genre[platform] == 1
    genre_counts = movies_Genre.loc[on_platform, 'Genres'].value_counts()
    genre_counts.plot(kind='bar', ax=ax, color=colour)
    ax.set_title(title)
plt.show()

In [None]:
# Side-by-side bar charts of the genre counts on Prime Video and Disney+.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = [
    ('Prime Video', 'indigo', 'Prime Video Genres'),
    ('Disney+', 'crimson', 'Disney+ Genres'),
]
for ax, (platform, colour, title) in zip(axes, panels):
    on_platform = movies_Genre[platform] == 1
    genre_counts = movies_Genre.loc[on_platform, 'Genres'].value_counts()
    genre_counts.plot(kind='bar', ax=ax, color=colour)
    ax.set_title(title)
plt.show()

- Drama has been one of the genres which is widely found across all the platforms with more number of movies which may be to attract the family crowd.
- All the genres are equally spread and found on each OTT platforms based on the above analysis.

### Average ratings of movies based on Genres across each OTT Platform

In [None]:
def _mean_imdb_by_genre(platform):
    """Return the mean IMDb rating per genre for movies on `platform`.

    The result has the columns 'Genres' and 'IMDb' and is sorted by rating,
    highest first.
    """
    on_platform = movies_Genre.loc[movies_Genre[platform] == 1]
    per_genre = on_platform.groupby('Genres')['IMDb'].mean().reset_index()
    return per_genre.sort_values(by='IMDb', ascending=False)


# x / y / z / a hold the per-genre averages for Netflix, Hulu, Prime Video
# and Disney+ respectively; the plotting cells below read them by these names.
x = _mean_imdb_by_genre('Netflix')
y = _mean_imdb_by_genre('Hulu')
z = _mean_imdb_by_genre('Prime Video')
a = _mean_imdb_by_genre('Disney+')

In [None]:
# Average IMDb rating per genre for Netflix (x) and Hulu (y), side by side.
# The data is passed to barplot by keyword: seaborn 0.12+ no longer accepts
# x and y as positional arguments.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

sns.barplot(x=x['Genres'], y=x['IMDb'], ax=axes[0])
axes[0].tick_params(axis='x', labelrotation=90)  # genre names overlap when horizontal
axes[0].set_title('Average IMDb ratings for each Genre in Netflix platform')

sns.barplot(x=y['Genres'], y=y['IMDb'], ax=axes[1])
axes[1].tick_params(axis='x', labelrotation=90)
axes[1].set_title('Average IMDb ratings for each Genre in Hulu platform')

plt.show()

In [None]:
# Average IMDb rating per genre for Prime Video (z) and Disney+ (a), side by side.
# The data is passed to barplot by keyword: seaborn 0.12+ no longer accepts
# x and y as positional arguments.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

sns.barplot(x=z['Genres'], y=z['IMDb'], ax=axes[0])
axes[0].tick_params(axis='x', labelrotation=90)  # genre names overlap when horizontal
axes[0].set_title('Average IMDb ratings for each Genre in Prime Video platform')

sns.barplot(x=a['Genres'], y=a['IMDb'], ax=axes[1])
axes[1].tick_params(axis='x', labelrotation=90)
axes[1].set_title('Average IMDb ratings for each Genre in Disney+ platform')

plt.show()

- there is no huge difference in the average ratings for each genre across platforms which ranges between 6 and 7.5.

## Visualization based on Languages

- The Language column holds several languages per movie, separated by commas. To derive useful information from it, we split it into one row per individual language and store the result in another dataframe, movies_language.

In [None]:
# Build movies_language: one row per (movie, language) pair.
# The comma-separated 'Language' string is split into a list and exploded
# into one entry per language; the exploded Series keeps the movie's original
# index, so joining on the index repeats the other columns once per language.
# Movies with a missing 'Language' value keep a single row with NaN.
# Series.explode replaces the slower apply(pd.Series, 1).stack() pattern,
# whose positional `1` was passed to apply() rather than to pd.Series.
t = movies['Language'].str.split(',').explode()
t.name = 'Language'
movies_language = movies.drop(columns='Language').join(t)

In [None]:
# Preview the language-level frame (the index repeats for movies with several languages).
movies_language.head()

### Top 20 languages across all the platforms

In [None]:
# The 20 most frequent languages and their row counts.
# The frame is built with explicit column names -- 'index' for the language
# and 'Language' for its count -- because the plotting cell reads those names
# and value_counts().reset_index() names its columns differently depending on
# the pandas version.
language_counts = movies_language['Language'].value_counts().head(20)
top_20 = pd.DataFrame({
    'index': language_counts.index,
    'Language': language_counts.values,
})
top_20

In [None]:
# Bar chart of the movie count for the 20 most frequent languages.
# In top_20 the 'index' column holds the language name and the 'Language'
# column holds its count. Both are passed by keyword because seaborn 0.12+
# no longer accepts x and y as positional arguments.
sns.barplot(x=top_20['index'], y=top_20['Language'])
plt.xticks(rotation=90)
plt.xlabel('Languages')
plt.ylabel('movies count')
plt.title('Movie count of top 20 languages')
plt.show()

- It is clearly visible that most of the movies are in English, which shows that all the platforms are trying to get viewership across the globe.
- Indian regional languages like Hindi, Tamil, Telugu and Punjabi are among the top 20, which shows that all the OTT platforms have a good reach and viewership in India.
- A wide range of European languages can be seen in the top 20, which again shows the popularity among the European nations.

### Average IMDb ratings for the top 20 languages

In [None]:
# One row per language holding the median IMDb rating of its movies
# (note: the aggregation is the median, not the mean).
timdb=pd.pivot_table(movies_language,index=['Language'],values='IMDb',aggfunc='median')

In [None]:
# Median IMDb rating for the 20 most frequent languages, highest first.
# The languages are derived from the data instead of a hand-typed list, so
# the chart stays in sync with the dataset and .loc cannot fail on a label
# that is missing from it.
top_languages = movies_language['Language'].value_counts().head(20).index
ax = (
    timdb.loc[top_languages]
    .sort_values('IMDb', ascending=False)
    .plot(kind='bar', color='brown')
)
ax.set_ylabel('Median IMDb rating')
plt.show()

- Even in the case of languages there is not a distinct difference in IMDb ratings which is between 6-7. 

### Top 5 languages based on movie count across the OTT Platforms

In [None]:
def _top_language_counts(platform, n=5):
    """Return the `n` most frequent languages among movies on `platform`.

    The result has a default 0..n-1 index and two explicitly named columns:
    'index' (the language) and 'Language' (its row count). The names are set
    explicitly because the plotting cell reads them and
    value_counts().reset_index() names its columns differently depending on
    the pandas version.
    """
    on_platform = movies_language[platform] == 1
    counts = movies_language.loc[on_platform, 'Language'].value_counts().head(n)
    return pd.DataFrame({'index': counts.index, 'Language': counts.values})


top_5_netflix = _top_language_counts('Netflix')
top_5_prime = _top_language_counts('Prime Video')
top_5_hulu = _top_language_counts('Hulu')
top_5_disney = _top_language_counts('Disney+')

In [None]:
# One panel per platform showing its most frequent languages.
# .loc[1:] skips the first (most frequent) row of each table, so every panel
# shows at most four bars.
# NOTE(review): the skipped row is presumably English on every platform --
# confirm against the top_5_* tables.
# x and y are passed by keyword because seaborn 0.12+ no longer accepts them
# as positional arguments.
fig, axes = plt.subplots(2, 2, figsize=(15, 5))
panels = [
    (top_5_netflix, axes[0, 0], 'top 5 languages based on movie count in netflix'),
    (top_5_prime, axes[0, 1], 'top 5 languages based on movie count in Prime Video'),
    (top_5_hulu, axes[1, 0], 'top 5 languages based on movie count in Hulu'),
    (top_5_disney, axes[1, 1], 'top 5 languages based on movie count in Disney+'),
]
for counts, ax, title in panels:
    remaining = counts.loc[1:]
    sns.barplot(x=remaining['index'], y=remaining['Language'], ax=ax)
    ax.set_title(title)
fig.tight_layout(pad=3.0)
plt.show()

- The above visualization depicts the top 5 languages(other than English) having the highest movie count in different OTT platforms.
- As seen earlier all OTT Platforms are having higher number of English language movies.
- If you want to watch hindi movies you can go with Netflix and Prime Video with more than 400 movies.
- Spanish language movies are equally spread and having more count across all the OTT platforms.
- German movies can be viewed in netflix with more than 100 movies.

## Analysis based on Age

In [None]:
# Number of movies per minimum-age category.
# The column is passed as x= because seaborn 0.12+ treats the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=movies['Age'])
plt.show()

- 0 indicates no restriction in the age group in the above dataset
- it is clearly visible that most of the shows in the OTT platforms are targeted for age group greater than 18.
- Followed by age group of 7 and 13 which may include movies of kid genre.

## Analysis based on Directors

- The Directors column holds several director names per movie, separated by commas. To derive useful information from it, we split it into one row per individual director and store the result in a new dataframe, movie_director.

In [None]:
# Build movie_director: one row per (movie, genre, director) combination.
# Directors are exploded from `movies`, whose index has one entry per movie,
# and then joined onto the genre-level frame. Exploding from movies_Genre
# (one row per genre, so a repeated index) and joining on that repeated index
# on both sides multiplies the rows: a movie with g genres and d directors
# yields g*g*d rows, which then have to be removed as duplicates.
# Movies with a missing 'Directors' value keep NaN in 'director'.
s = movies['Directors'].str.split(',').explode()
s.name = 'director'
movie_director = movies_Genre.drop(columns='Directors').join(s)

In [None]:
# Collapse fully identical rows so each (movie, genre, director) combination appears once.
movie_director = movie_director.drop_duplicates()

In [None]:
# Preview the director-level frame.
movie_director.head()

## Top 10 movie directors

### Based on movie count

In [None]:
# Ten most frequent directors in movie_director.
# NOTE(review): movie_director is derived from the genre-level frame, so a
# movie with several genres contributes several rows per director here;
# these are row counts, not distinct-movie counts.
movie_director['director'].value_counts().head(10)

In [None]:
# Build movie_director_10: one row per (movie, director) pair, taken from
# `movies` so each movie is counted once per director.
# The comma-separated 'Directors' string is split and exploded into one entry
# per director; the exploded Series keeps the movie's original index, so the
# join repeats the other columns once per director. Movies with a missing
# 'Directors' value keep NaN in 'director'.
# Series.explode replaces the slower apply(pd.Series).stack() pattern.
s = movies['Directors'].str.split(',').explode()
s.name = 'director'
movie_director_10 = movies.drop(columns='Directors').join(s)

In [None]:
# Bar chart of the ten directors with the most movies in the catalogue.
director_counts = movie_director_10['director'].value_counts().head(10)
ax = director_counts.plot(kind='bar')
ax.set_title('Top 10 Directors based on movie count across all OTT platforms')
plt.show()

- About 35 movies directed by Jay Chapman can be viewed across all OTT platforms.
- He is followed by Joseph Kane and Cheh Chang, who each directed about 20 movies.

### Genres directed by top 10 directors

In [None]:
# Genre-by-director count table restricted to a fixed set of ten directors.
# NOTE(review): the director list is hard-coded; presumably it was copied
# from an earlier top-10 output -- confirm it still matches the data.
selected_directors = [
    'Joseph Kane', 'Sam Newfield', 'Jim Wynorski', 'Cheh Chang',
    'Mark Atkins', 'William Beaudine', 'Fred Olen Ray',
    'Paul Hoen', 'Robert Stevenson', 'Richard Rich',
]
genre_by_director = pd.crosstab(movie_director['Genres'], movie_director['director'])
s = genre_by_director.loc[:, selected_directors]

In [None]:
# Stacked bar chart: one bar per genre, split by the selected directors.
ax = s.plot(kind='bar', stacked=True, figsize=(20, 10))
ax.set_title('Various Genres directed by top 10 directors', fontdict={'fontsize': 30})
plt.show()

- Based on the above visualization we can infer that most of the action,comedy,music and western genre movies are directed by Joseph Kane.
- Also, a wide range of action and drama genre movies are directed by Cheh Chang.
- If you want to go with thriller movies, then you'll have a wide availability of movies directed by Fred Olen Ray and Mark Atkins.
- William Beaudine has directed movies in almost all the above mentioned genres.

# Conclusion


This kernel is an attempt to draw insights from all the movies across popular OTT platforms. I hope it has done justice to that goal and that you liked the notebook. Many more insights can still be gained from further exploration of the data, and the kernel will be updated in the future to include them.

Being my first attempt at EDA, I would appreciate any suggestions or comments from the community.