In [1]:
#importing libraries

import pandas as pd
import numpy as np


import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [7]:
# Read the Netflix originals CSV into the working frame (the file is decoded as latin1).
csv_path = 'NetflixOriginals.csv'
df = pd.read_csv(csv_path, encoding='latin1')

In [8]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [9]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(584, 6)

In [10]:
# Column dtypes and non-null counts, to spot missing data or mis-parsed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


In [11]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Runtime,IMDB Score
count,584.0,584.0
mean,93.577055,6.271747
std,27.761683,0.979256
min,4.0,2.5
25%,86.0,5.7
50%,97.0,6.35
75%,108.0,7.0
max,209.0,9.0


In [12]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Title         0
Genre         0
Premiere      0
Runtime       0
IMDB Score    0
Language      0
dtype: int64

In [13]:
# Parse the textual premiere dates into datetimes, stored in a new `date` column.
premiere_dates = pd.to_datetime(df['Premiere'])
df['date'] = premiere_dates
df['date']

0     2019-08-05
1     2020-08-21
2     2019-12-26
3     2018-01-19
4     2020-10-30
         ...    
579   2018-12-31
580   2015-10-09
581   2018-12-16
582   2020-12-08
583   2020-10-04
Name: date, Length: 584, dtype: datetime64[ns]

In [14]:
# Break the premiere date into the calendar parts used by the later breakdowns.
premiere = df['date'].dt
df['year_month'] = premiere.strftime('%Y-%m')
df['year'] = premiere.year
df['month'] = premiere.month
df['day_of_week'] = premiere.dayofweek  # pandas numbering: 0 = Monday ... 6 = Sunday

In [15]:
# Check the first five rows again now that the derived date columns exist.
df.head(n=5)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,date,year_month,year,month,day_of_week
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese,2019-08-05,2019-08,2019,8,0
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish,2020-08-21,2020-08,2020,8,4
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian,2019-12-26,2019-12,2019,12,3
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English,2018-01-19,2018-01,2018,1,4
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi,2020-10-30,2020-10,2020,10,4


### Analysis part

In [18]:
# Number of distinct genre labels in the dataset.
df['Genre'].nunique()

115

In [20]:
# How many titles carry each genre label, most common first.
df['Genre'].value_counts()

Documentary              159
Drama                     77
Comedy                    49
Romantic comedy           39
Thriller                  33
                        ... 
Comedy horror              1
Supernatural drama         1
Teen comedy horror         1
Romantic comedy-drama      1
Animation / Comedy         1
Name: Genre, Length: 115, dtype: int64

In [21]:
# Keep only the 20 most frequent genre labels.
genre_counts = df['Genre'].value_counts()
genre = genre_counts.head(20)
genre

Documentary               159
Drama                      77
Comedy                     49
Romantic comedy            39
Thriller                   33
Comedy-drama               14
Crime drama                11
Biopic                      9
Horror                      9
Action                      7
Concert Film                6
Aftershow / Interview       6
Romance                     6
Action comedy               5
Animation                   5
Romantic drama              5
Science fiction             4
Psychological thriller      4
Variety show                4
Animation / Short           4
Name: Genre, dtype: int64

In [22]:

# Bar chart of the 20 most common genres, tallest bar first.
genre_axis_labels = {'y': 'Number of Movies from the Genre', 'index': 'Genres'}
fig = px.bar(genre, x=genre.index, y=genre.values, labels=genre_axis_labels)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

In [24]:
# Number of distinct language labels in the dataset.
df['Language'].nunique()

38

In [25]:
# The ten language labels that occur most often.
language_counts = df['Language'].value_counts()
top_10_languages_used = language_counts.head(10)
top_10_languages_used

English       401
Hindi          33
Spanish        31
French         20
Italian        14
Portuguese     12
Indonesian      9
Korean          6
Japanese        6
German          5
Name: Language, dtype: int64

In [27]:
# Bar chart of the ten most common languages, tallest bar first.
language_axis_labels = {'y': 'Count', 'index': 'Language'}
fig = px.bar(
    top_10_languages_used,
    x=top_10_languages_used.index,
    y=top_10_languages_used.values,
    labels=language_axis_labels,
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

In [28]:
# Summary statistics for the Runtime column.
runtime = df['Runtime']
runtime.describe()

count    584.000000
mean      93.577055
std       27.761683
min        4.000000
25%       86.000000
50%       97.000000
75%      108.000000
max      209.000000
Name: Runtime, dtype: float64

In [29]:

# Distribution of runtimes across all titles.
fig = px.histogram(
    data_frame=df,
    x='Runtime',
    title='Runtime of the Programs in Netflix',
)
fig.show()

In [31]:
# Summary statistics for the IMDB Score column.
imdb_score = df['IMDB Score']
imdb_score.describe()

count    584.000000
mean       6.271747
std        0.979256
min        2.500000
25%        5.700000
50%        6.350000
75%        7.000000
max        9.000000
Name: IMDB Score, dtype: float64

In [32]:
# Distribution of IMDB scores across all titles.
fig = px.histogram(
    data_frame=df,
    x='IMDB Score',
    title='IMDB Score of the Programs in Netflix',
)
fig.show()

In [35]:
# Correlation matrix between IMDB score and runtime.
score_vs_runtime = df[['IMDB Score', 'Runtime']]
score_vs_runtime.corr()

Unnamed: 0,IMDB Score,Runtime
IMDB Score,1.0,-0.040896
Runtime,-0.040896,1.0


In [36]:
# Scatter of runtime against IMDB score, one point per title.
fig = px.scatter(data_frame=df, x='IMDB Score', y='Runtime')
fig.show()

### Year

In [37]:
# Number of titles premiered in each year, busiest year first.
release_years = df['year']
Year = release_years.value_counts()
Year

2020    183
2019    125
2018     99
2021     71
2017     66
2016     30
2015      9
2014      1
Name: year, dtype: int64

In [38]:
# Bar chart of the number of titles premiered per year, in chronological order.
# The years are integers, so Plotly draws a continuous numeric axis on which
# `categoryorder` has no effect. Sorting by year and forcing a categorical axis
# keeps the chronological layout and gives exactly one tick per year.
year_counts = Year.sort_index()
fig = px.bar(year_counts, x=year_counts.index, y=year_counts.values,
             labels={'y': 'Count of Movies in Each Year', 'index': 'Year'})
fig.update_layout(xaxis={'type': 'category'})
fig.show()

### Month

In [39]:
# Count titles per premiere month, indexed 1-12 in calendar order.
# value_counts(sort=False) does not promise a sorted index, and a month with no
# releases would be missing altogether. Reindexing pins both the order and the
# length, so the counts can safely be paired positionally with month names.
Month = df['month'].value_counts().reindex(range(1, 13), fill_value=0)
Month

1     37
2     39
3     48
4     63
5     53
6     35
7     34
8     37
9     53
10    77
11    57
12    51
Name: month, dtype: int64

In [40]:
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

# Order the counts by month number and look each label up from the index value,
# so a bar can never be paired with the wrong month name, whatever order the
# counts arrive in and even if a month is absent.
month_counts = Month.sort_index()
month_labels = [months[month_number - 1] for month_number in month_counts.index]

fig = px.bar(month_counts, x=month_labels, y=month_counts.values,
             labels={'y':'Count of Movies in Each Month', 'x':'Month'})
fig.show()

### Day of the week

In [41]:
# Count titles per premiere weekday, indexed 0-6 (pandas numbering: 0 = Monday).
# value_counts(sort=False) does not promise a sorted index, and a weekday with no
# releases would be missing altogether. Reindexing pins both the order and the
# length, so the counts can safely be paired positionally with weekday names.
days = df['day_of_week'].value_counts().reindex(range(7), fill_value=0)
days

0     17
1     29
2     82
3     59
4    383
5      5
6      9
Name: day_of_week, dtype: int64

In [42]:
day = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']

# Order the counts by weekday number and look each label up from the index value
# (0 = Monday), so a bar can never be paired with the wrong weekday name,
# whatever order the counts arrive in and even if a weekday is absent.
day_counts = days.sort_index()
day_labels = [day[weekday_number] for weekday_number in day_counts.index]

fig = px.bar(day_counts, x=day_labels, y=day_counts.values,
             labels={'y':'Count of Movies in Each Day', 'x':'Day'})
fig.show()

## Top 10 Rated Genres

In [43]:
# Mean IMDB score per genre label; keep the ten highest averages.
mean_score_by_genre = df.groupby('Genre')['IMDB Score'].mean()
top_10_ratings_by_genre = mean_score_by_genre.sort_values(ascending=False).head(10)
top_10_ratings_by_genre

Genre
Animation/Christmas/Comedy/Adventure    8.200000
Musical / Short                         7.700000
Concert Film                            7.633333
Anthology/Dark comedy                   7.600000
Animation / Science Fiction             7.500000
Making-of                               7.450000
Action-adventure                        7.300000
Drama-Comedy                            7.200000
Historical drama                        7.200000
Coming-of-age comedy-drama              7.200000
Name: IMDB Score, dtype: float64

In [44]:
# Bar chart of the ten genres with the highest average IMDB score.
rating_axis_labels = {'y': 'Average Rating Score', 'x': 'Genre'}
fig = px.bar(
    top_10_ratings_by_genre,
    x=top_10_ratings_by_genre.index,
    y=top_10_ratings_by_genre.values,
    labels=rating_axis_labels,
)
fig.show()

In [45]:
# The ten genres with the lowest average IMDB score.
genre_mean_scores = df.groupby('Genre')['IMDB Score'].mean()
bottom_10_ratings_by_genre = genre_mean_scores.sort_values().head(10)
bottom_10_ratings_by_genre

Genre
Heist film/Thriller        3.700000
Musical/Western/Fantasy    3.900000
Horror anthology           4.300000
Political thriller         4.300000
Superhero-Comedy           4.400000
Science fiction/Drama      4.533333
Romance drama              4.600000
Mystery                    4.650000
Horror thriller            4.700000
Anime / Short              4.700000
Name: IMDB Score, dtype: float64

In [46]:
# Bar chart of the ten genres with the lowest average IMDB score.
low_rating_axis_labels = {'y': 'Average Rating Score', 'x': 'Genre'}
fig = px.bar(
    bottom_10_ratings_by_genre,
    x=bottom_10_ratings_by_genre.index,
    y=bottom_10_ratings_by_genre.values,
    labels=low_rating_axis_labels,
)
fig.show()

In [47]:
# The 20 titles with the highest IMDB scores, best first.
top_columns = ['IMDB Score', 'Title', 'Genre', 'year', 'Language']
top_20 = df[top_columns].sort_values(['IMDB Score'], ascending=False).head(20)
top_20

Unnamed: 0,IMDB Score,Title,Genre,year,Language
583,9.0,David Attenborough: A Life on Our Planet,Documentary,2020,English
582,8.6,Emicida: AmarElo - It's All For Yesterday,Documentary,2020,Portuguese
581,8.5,Springsteen on Broadway,One-man show,2018,English
580,8.4,Winter on Fire: Ukraine's Fight for Freedom,Documentary,2015,English/Ukranian/Russian
579,8.4,Taylor Swift: Reputation Stadium Tour,Concert Film,2018,English
578,8.4,Ben Platt: Live from Radio City Music Hall,Concert Film,2020,English
577,8.3,Dancing with the Birds,Documentary,2019,English
576,8.3,Cuba and the Cameraman,Documentary,2017,English
573,8.2,Klaus,Animation/Christmas/Comedy/Adventure,2019,English
571,8.2,13th,Documentary,2016,English


In [48]:
# Scatter of the 20 best-rated titles against their IMDB score, coloured by genre.
# hover_data takes the *names* of the extra columns to show in the tooltip.
# Passing a DataFrame slice only worked because iterating a DataFrame yields its
# column names, so the names are given explicitly instead.
fig = px.scatter(top_20, y='Title', x='IMDB Score',
                 hover_data=['Genre', 'year', 'Language'], color='Genre',
                 title="Top 20 High Rated Programs")
fig.show()

16 out of the top 20 titles come from the Documentary genre.

In [49]:
# The 20 titles with the lowest IMDB scores, worst first.
bottom_columns = ['IMDB Score', 'Title', 'Genre', 'year', 'Language']
bottom_20 = df[bottom_columns].sort_values(['IMDB Score']).head(20)
bottom_20

Unnamed: 0,IMDB Score,Title,Genre,year,Language
0,2.5,Enter the Anime,Documentary,2019,English/Japanese
1,2.6,Dark Forces,Thriller,2020,Spanish
2,2.6,The App,Science fiction/Drama,2019,Italian
3,3.2,The Open House,Horror thriller,2018,English
4,3.4,Kaali Khuhi,Mystery,2020,Hindi
5,3.5,Drive,Action,2019,Hindi
6,3.7,Leyla Everlasting,Comedy,2020,Turkish
7,3.7,The Last Days of American Crime,Heist film/Thriller,2020,English
8,3.9,Paradox,Musical/Western/Fantasy,2018,English
9,4.1,Sardar Ka Grandson,Comedy,2021,Hindi


In [50]:
# Scatter of the 20 worst-rated titles against their IMDB score, coloured by genre.
# hover_data takes the *names* of the extra columns to show in the tooltip.
# Passing a DataFrame slice only worked because iterating a DataFrame yields its
# column names, so the names are given explicitly instead.
fig = px.scatter(bottom_20, y='Title', x='IMDB Score',
                 hover_data=['Genre', 'year', 'Language'], color='Genre',
                 title="20 Lowest Rated Programs")
fig.show()