# Analysis of Netflix Movies and TV Shows

# Import Statements:

In [None]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors
from collections import Counter
from plotly.subplots import make_subplots

# Reading and Understanding the Dataset:

In [None]:
# Load the titles CSV into the working DataFrame `df` (path is relative to the notebook).
df = pd.read_csv("../input/netflix-shows/netflix_titles.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Number of missing values in each column, before any cleaning.
df.isna().sum()

In [None]:
# Distinct rating labels present in the data (NaN is listed too if any rating is missing).
df['rating'].unique()

In [None]:
# Column names available in the frame.
df.columns

# Cleaning and Optimizing the Dataset:

In [None]:
# Keep only the rows that have a date_added value.
df = df[df['date_added'].notna()]
# Sanity check: shows the rows still missing date_added (none remain after the filter above).
df[df['date_added'].isna()]

In [None]:
# Keep only the rows that have a rating. The explicit copy makes `df` an
# independent frame, so the column assignments in later cells do not write
# into a slice of the original data (pandas' SettingWithCopyWarning).
df = df[df['rating'].notna()].copy()

In [None]:
# Missing values per column after dropping rows without date_added or rating.
df.isna().sum()

In [None]:
# Show a single row of the cleaned data.
df.head(1)

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
def _split_genres(listed_in):
    """Turn a comma-separated `listed_in` string into a list of genre names."""
    return listed_in.replace(' ,', ',').replace(', ', ',').split(',')


# Every genre of a title as a list, plus the first one as its principal genre.
df['genre'] = df['listed_in'].apply(_split_genres)
print(df['genre'].head())
df['principal_genre'] = [genres[0] for genres in df['genre']]

# `duration` values containing "Season" are season counts; everything else is
# treated as a running time. The leading number goes to the matching column
# and the other column gets an empty string.
_is_season_count = df['duration'].str.contains("Season", regex=False)
_leading_number = df['duration'].str.split(" ").str[0]
df['season_count'] = _leading_number.where(_is_season_count, "")
df['duration'] = _leading_number.where(~_is_season_count, "")

In [None]:
# Parse date_added once and keep only the calendar year it falls in.
_added_on = pd.DatetimeIndex(df["date_added"])
df["year_added"] = _added_on.year
df

In [None]:
# Split the catalogue into one frame per content type.
movie_df, show_df = (df[df['type'] == kind] for kind in ('Movie', 'TV Show'))

In [None]:
# Each subset keeps only the length column that applies to it.
movie_df = movie_df.drop(columns=['season_count'])
show_df = show_df.drop(columns=["duration"])

In [None]:
# The length columns hold digit strings at this point; store them as integers.
movie_df = movie_df.assign(duration=movie_df['duration'].astype(int))
show_df = show_df.assign(season_count=show_df['season_count'].astype(int))

In [None]:
# Preview the movie subset.
movie_df.head()

In [None]:
# Preview the TV show subset.
show_df.head()

In [None]:
# Preview the full frame with the derived columns added above.
df.head()

# Visualization: 

# 1. Percentage of each type of content on Netflix (Movie/TV Show)

In [None]:
# Titles per content type. Values and labels are both taken from the same
# Series, so each slice is labelled with its own category. This does not depend
# on how `reset_index()` names its columns (that changed in pandas 2.0) nor on
# the categories coming out in a particular order.
x = df['type'].value_counts()
display_names = {"Movie": "Movies", "TV Show": "TV Shows"}
go.Figure(data=[go.Pie(values=x.values,
                       labels=[display_names.get(kind, kind) for kind in x.index],
                       pull=[0, 0.2])],
          layout=go.Layout(title="DISTRIBUTION OF THE TYPE OF CONTENT ON NETFLIX", template="plotly_dark"))

#### • There are 5372 Movies and 2389 TV Shows available on Netflix across the world.
#### • Around 70% of the content on Netflix is Movies.
# 

# 2. Percentage of each content rating on Netflix

In [None]:
# Titles per rating. The two columns are named explicitly so the chart does not
# depend on the default column names produced by `value_counts().reset_index()`,
# which differ between pandas 1.x and 2.x.
rating_counts = df['rating'].value_counts().rename_axis('rating').reset_index(name='count')
px.pie(rating_counts, values='count', names='rating', title='DISTRIBUTION OF CONTENT RATINGS ON NETFLIX', template="plotly_dark")

#### • The largest count of content is made with the 'TV-MA' rating (2861) "TV-MA": For mature audiences only.
#### • Second largest is the 'TV-14' rating (1931) "TV-14": May be inappropriate for children younger than 14 years of age.
#### • Third largest is the 'TV-PG' rating (806) "TV-PG": Parental guidance suggested
#### • Fourth largest is the very popular 'R' rating (665) "R": May be unsuitable for children under the age of 17 (under 17 requires an accompanying parent or adult guardian).
# 

# 3. Variation of number of Movies and TV Shows released from 2000

In [None]:
# Helper column: summing it per group yields the number of titles in the group.
df['Content'] = 1
data = df.pivot_table(index=["type", "release_year"], values="Content", aggfunc="sum", fill_value=0)
data = data.reset_index()
# The chart is titled "FROM 2000", so the year 2000 itself is included (>=, not >).
fig = px.bar(data[data['release_year'] >= 2000], x="release_year", y="Content", color="type", text="Content", title="VARIATION OF NUMBER OF MOVIES AND TV SHOWS RELEASED FROM 2000", template="plotly_dark")
# Side-by-side bars per year, with each count printed above its bar.
fig.update_layout(barmode='group')
fig.update_traces(textposition='outside')
fig.show()

#### • The entertainment industry mostly favored movies till the last decade, but things are changing, and the focus is shifting towards TV Shows as well. 
#### • We can see that Covid-19 affected the entertainment industry significantly.
#### • Since the past decade, both movies and TV shows gained popularity and most of the content is in the category of Movies.
#### • The highest number of Movies released on Netflix was in 2017. (742 Movies)
#### • In 2020, we can see the significant increase in the number of TV Shows taking over the number of Movies. 
#### • We can also observe that this follows a left-skewed distribution over the years: most titles were released recently, with a long tail toward older release years. 
# 

# 4. Variation of content added over time from 2008

In [None]:
# Titles added per year, split by type. Each type is counted explicitly instead
# of treating "the most frequent type of the year" as movies, so the two series
# stay correct for years in which TV shows outnumber movies.
type_by_year = pd.crosstab(df['year_added'], df['type'])
t2 = pd.DataFrame({'count': type_by_year.sum(axis=1)})
t2['movies'] = type_by_year.get('Movie', 0)
t2['tv_shows'] = t2['count'] - t2['movies']
# 2021 is left out of the chart; ignore the label if the data has no 2021 rows.
t2 = t2.drop(2021, errors='ignore')
fig = go.Figure()
fig.add_scatter(x=t2.index, 
                y=t2.tv_shows, 
                fill='tonexty',
                name='TV Shows',
                line_color='rgb(103,0,31)'
               )
fig.add_scatter(x=t2.index, 
                y=t2.movies, 
                fill='tonexty',
                name='Movies',
                line_color='rgb(178,24,43)'
               )
# Dashed reference line for the yearly total of both types.
fig.add_scatter(x=t2.index, 
                y=t2['count'], 
                line_color='white', 
                line_dash='dash',
                opacity=.5,
                name='Total',
               )
fig.update_traces(mode='lines')
fig.update_layout(title_text='CONTENT ADDED OVER TIME',
                  xaxis_title='Year',
                  yaxis_title='New Content added',
                  hovermode="x unified",template = "plotly_dark")

fig.show()
# t2

#### • We can observe that there are more Movies than TV Shows being added on Netflix.
#### • General trend is that there is new content being added every year.
#### • In 2020, the numbers drop, probably because Covid-19 made it harder to produce new content. 
# 

# 5. Distribution of genres of Movies released every year from 2015

In [None]:
# Genres shown in the sunburst.
top_movies_genres = ['International Movies', 'Dramas', 'Comedies', 'Documentaries', 'Action & Adventure']

# Movies released in 2015 or later whose principal genre is one of the above,
# counted per (genre, release year).
_in_top_genres = movie_df['principal_genre'].isin(top_movies_genres)
_since_2015 = movie_df['release_year'] >= 2015
year_genre_df = (
    movie_df[_in_top_genres & _since_2015]
    .groupby(['principal_genre', 'release_year'])['title']
    .count()
    .reset_index(name='count')
)

# Inner ring: release year; outer ring: genre within that year.
fig = px.sunburst(
    year_genre_df,
    path=['release_year', 'principal_genre'],
    values='count',
    title="DISTRIBUTION OF MOVIES RELEASED EACH YEAR BASED ON GENRE FROM 2015",
    template="plotly_dark",
)
fig.show()
# year_genre_df

#### • We can observe that from 2015-2019, the major genres and their count increases slightly, but the most popular/most viewed genre remains the same. (Dramas)
#### • More and more content is being produced every year until 2019. After that, Covid-19 took place and there was an overall decrease in the content produced.
# 

# 6. Top 10 Countries with the most content on Netflix

In [None]:
# Rows whose country is the placeholder text are excluded; rows with a missing
# country are skipped by value_counts() itself.
temp = df[df["country"] != "Not mentioned"]
# Name both columns explicitly so the chart does not depend on the default
# column names of `value_counts().reset_index()` (they changed in pandas 2.0).
country_counts = temp['country'].value_counts().head(10).rename_axis('country').reset_index(name='count')
px.scatter(country_counts, x="country", y="count", color="country", size="count", width=900, height=450, title="TOP 10 COUNTRIES WITH MOST CONTENT", template="plotly_dark", size_max=100)

#### • Unsurprisingly, the United States stands out with a total of 2555 titles on Netflix. This is expected, as Netflix is an American company. 
#### • India ranks second with 923 titles, followed by the UK with 397 titles.
# 

# 7. Duration Distribution of the Movies by Minutes

In [None]:
# Movie running times taken from movie_df['duration'].
x1 = movie_df['duration']
# Distribution plot of the durations with a normal curve overlay
# (curve_type='normal') and a histogram bin width of 0.7.
fig = ff.create_distplot([x1], ['Movie Duration'], bin_size=0.7, curve_type='normal')
fig.update_layout(title_text='DURATION DISTRIBUTION OF MOVIES', template = "plotly_dark")
fig.show()

#### • If we observe carefully, we can see that the duration of movies is normally distributed.
#### • Most of the movies released are around 100 minutes long.
# 

# 8. Target Audience Distribution based on contents released in each Country

In [None]:
# Rating label -> audience group used for the heatmap rows.
target_audience = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}
# Ratings not listed in the mapping are left unchanged by replace().
df['target_audience'] = df['rating'].replace(target_audience)
# Unit column, kept so the frame's columns stay the same as before this change.
df['count'] = 1
# Ten most frequent values of the country column. Counting them directly
# avoids selecting the grouping key inside the groupby and relying on sum()
# to discard that text column before reset_index().
data = pd.Series(df['country'].value_counts().index[:10], name='country')

# Share of each audience group within every selected country
# (each country's column sums to 100% after the transpose).
country_rating = df.loc[df['country'].isin(data)]
country_rating = pd.crosstab(country_rating['country'],country_rating['target_audience'],normalize = "index").T

# Countries shown in the heatmap, left to right.
country_order = ['United States', 'India', 'United Kingdom', 'Canada', 'Japan', 'France', 'South Korea', 'Spain',
       'Mexico']

# Audience groups shown in the heatmap, top to bottom.
age_order = ['Kids','Older Kids','Teens','Adults']
plt.style.use('dark_background')
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#564d4d','#831010','#db0000'])
fig, ax = plt.subplots(1, 1, figsize=(12, 12))

# .loc raises KeyError if any listed group or country is absent from the table.
sns.heatmap(country_rating.loc[age_order,country_order],cmap=cmap,square=True, linewidth=2.5,cbar=False,
            annot=True,fmt='1.0%',vmax=.6,vmin=0.05,ax=ax,annot_kws={"fontsize":12})

ax.set_ylabel('')    
ax.set_xlabel('')
fig.text(0.13,0.75,"TARGET AUDIENCE DISTRIBUTION BASED ON CONTENTS PER COUNTRY", fontfamily='Arial',fontsize=18)
# df['target_audience']

#### • Interestingly most of the content in India is targeted to Teens.
#### • Spain and Mexico are the countries with most of the content for Adults.
#### • Adults play a major role as the target audience in almost all the major Countries. 
# 

# 9. Top Actors/Actresses Country wise with the most content

In [None]:
def country_trace(country, flag = "movie"):
    """Build a horizontal bar trace of the most frequent cast members for a country.

    Reads the module-level ``df`` without modifying it; no scratch column is
    written to the shared frame.

    Parameters
    ----------
    country : str
        Country name, matched case-insensitively as a substring of the
        ``country`` column (rows with a missing country never match).
    flag : str, default "movie"
        ``"movie"`` keeps rows whose ``duration`` is not the empty string;
        any other value keeps rows whose ``season_count`` is not the empty
        string.

    Returns
    -------
    plotly.graph_objs.Bar
        Up to 25 cast names with their title counts, ordered so the most
        frequent name is drawn at the top of the chart.
    """
    needle = country.lower()
    from_country = df['country'].fillna("").apply(lambda x: needle in x.lower())
    small = df[from_country]

    length_column = "duration" if flag == "movie" else "season_count"
    small = small[small[length_column] != ""]

    # Blank names (titles without a cast) are removed before ranking, so they
    # cannot take one of the 25 slots.
    names = (name for name in ", ".join(small['cast'].fillna("")).split(", ") if name != "")
    tags = Counter(names).most_common(25)

    labels = [name + "  " for name, _ in tags]
    values = [appearances for _, appearances in tags]
    # Reversed so the largest bar ends up first in the horizontal chart.
    return go.Bar(y=labels[::-1], x=values[::-1], orientation="h", name="")

# One title per cell of the 2x5 subplot grid, row by row; "" marks a cell that stays empty.
titles = ["United States", "","India","", "United Kingdom", "Canada","", "Spain","", "Japan"]
traces = [country_trace(title) for title in titles if title != ""]

fig = make_subplots(rows=2, cols=5, subplot_titles=titles)
# Grid cells that receive a trace, in the same order as `traces`.
_cells = [(1, 1), (1, 3), (1, 5), (2, 1), (2, 3), (2, 5)]
for _trace, (_row, _col) in zip(traces, _cells):
    fig.add_trace(_trace, _row, _col)

fig.update_layout(
    height=1200,
    showlegend=False,
    template="plotly_dark",
    title="TOP ACTORS/ACTRESSES COUNTRY WISE WITH THE MOST CONTENT",
)
fig.show()
# df["country"].head(40)

#### • From the above plot, we can infer that the United States, India, United Kingdom, Canada, Spain, and Japan are the top Countries with the most content on Netflix, as we saw earlier in Plot 6.
#### • Thus, we see that famous Actors like Adam Sandler, Anupam Kher, John Cleese have contributed more to their respective film industries.
# 

# 10. Top 20 Directors with most content Worldwide

In [None]:
# Twenty most frequent values of the director column. Both columns are named
# explicitly so the chart does not depend on the default column names of
# `value_counts().reset_index()` (they changed in pandas 2.0).
director_counts = df['director'].value_counts().head(20).rename_axis('director').reset_index(name='count')
px.scatter(director_counts, x="director", y="count", size="count", color="count", title="TOP 20 DIRECTORS WITH MOST CONTENT WORLDWIDE", template="plotly_dark", size_max=50)

#### • We can observe that only few directors have made more than 10 Movies/TV Shows. 
#### • Raul Campos, Jan Suter made the most content (18) followed by Marcus Raboy (16).
# 

# 11. Representation of TV Shows from 2015

In [None]:
# dropna() discards every show that has a missing value in any column, so only
# fully populated rows reach the treemap.
n = show_df.dropna()
# Restrict to shows released in 2015 or later.
n = n[n["release_year"] >= 2015]
# Hierarchy: release year -> director -> title -> cast.
# NOTE(review): `color` is the director column, which presumably holds text, so
# color_continuous_scale likely has no visible effect — confirm.
fig = px.treemap(n, path=["release_year",'director', "title", "cast"],
                 color='director',
                 hover_data=['director', 'title'],
                 color_continuous_scale='Purple',template = "plotly_dark", title = "TREEMAP REPRESENTATION OF TV SHOWS RELEASED FROM 2015")
fig.show()

#### • Over the years, from 2015, we can observe that TV Shows started gaining more and more popularity. 
#### •	There is a significant increase in the number of TV Shows released each year from 2015.
#### •	TV Shows may even take over Movies in around 5 years. 
# 

# 12. Variation of year released, and year added of Movies and TV Shows from 2005

In [None]:
# Titles per (year, type), once by release year and once by the year added.
# The chart is titled "FROM 2005", so 2005 itself is included (>=, not >).
released_year_df = df.loc[df['release_year'] >= 2005].groupby(['release_year', 'type']).agg({'show_id': 'count'}).reset_index()
added_year_df = df.loc[df['year_added'] >= 2005].groupby(['year_added', 'type']).agg({'show_id': 'count'}).reset_index()

# One line per (source table, content type): table, year column, type, legend label, colour.
line_specs = [
    (released_year_df, 'release_year', 'Movie', 'Movie: Year Released', 'lightblue'),
    (released_year_df, 'release_year', 'TV Show', 'TV Show: Year Released', 'blue'),
    (added_year_df, 'year_added', 'Movie', 'Movie: Year Added', 'orange'),
    (added_year_df, 'year_added', 'TV Show', 'TV Show: Year Added', 'red'),
]

fig = go.Figure()
for yearly_counts, year_column, content_type, label, colour in line_specs:
    of_type = yearly_counts.loc[yearly_counts['type'] == content_type]
    fig.add_trace(go.Scatter(
        x=of_type[year_column],
        y=of_type['show_id'],
        mode='lines+markers',
        name=label,
        marker_color=colour
    ))
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(template = "plotly_dark", title = "VARIATION OF YEAR RELEASED AND YEAR ADDED OF MOVIES AND TV SHOWS FROM 2005")
fig.show()

#### •	Looking at the plot above, it is clear that over the last decade far more content has been added to Netflix each year than was released in that year. 
#### •	This may be because streaming platforms add older content in parallel with current productions. 
#### •	This trend has held since the boom of the internet era, as most content eventually reaches one streaming platform or another. 
#### •	The impact of the recent Covid-19 pandemic on content release and content addition, from the end of 2019 to 2021, can also be observed in the plot. 
#### •	The highest number of movies and shows was added around 2020, and the lowest counts are seen at the tail of the plot.
# 

# 13. Top 10 TV Shows

In [None]:
# Bar chart of the ten shows with the highest season_count, each bar labelled with its count.
fig = px.bar(show_df.sort_values(by = "season_count", ascending = False).head(10),x = "season_count",y = "title", color = "season_count", title = "TOP 10 TV SHOWS",template = "plotly_dark",text="season_count")
# Put the value labels outside the bars; as the cell's last expression this also renders the figure.
fig.update_traces(textposition="outside")

#### •	As a TV Show gains more popularity and more viewers, more seasons are produced. 
#### •	Based on this observation, we can see from the above plot that Grey’s Anatomy is the best TV Show on Netflix, having 16 Seasons, followed by Supernatural and NCIS, which each have 15 Seasons.
# 

# 14.	Content growth throughout History

In [None]:
# Line chart of the number of titles per release year, one line per content type.
plt.figure(figsize=(14, 7))
# value_counts() tallies titles per release year; sort_index() puts the years in ascending order.
show_progress = show_df['release_year'].value_counts().sort_index()
movie_progress = movie_df['release_year'].value_counts().sort_index()
plt.style.use('dark_background')
plt.plot(show_progress.index, show_progress.values, label='TV shows')
plt.plot(movie_progress.index, movie_progress.values, label='Movie')
# Mark 2019 and 2021 with dashed lines and shade the span between them
# (shown as 'Coronavirus' in the legend).
plt.axvline(2019, alpha=0.3, linestyle='--', color='w')
plt.axvline(2021, alpha=0.3, linestyle='--', color='w')
plt.axvspan(2019, 2021, alpha=0.2, color='w', label='Coronavirus')
# One x tick every five years from 1925 to 2025.
plt.xticks(list(range(1925, 2026, 5)), fontsize=12)
plt.title('CONTENT GROWTH THROUGHOUT HISTORY', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Amount of content', fontsize=14)
plt.yticks(fontsize=12)
plt.legend()
plt.show()

#### •	As we can see, from 2005-2019, the content on Netflix grew rapidly.
#### •	But from 2019-2021, there’s a major drop, due to Covid-19. 
# 

# 15.	Top 20 Shortest and Longest Movies on Netflix

In [None]:
# Twenty movies from each end of the duration ordering.
temp1 = movie_df.sort_values(by="duration").head(20)
temp2 = movie_df.sort_values(by="duration", ascending=False).head(20)

# Two stacked bar charts: shortest movies on top, longest below.
fig = make_subplots(rows=2, cols=1)
_panels = [("Top 20 Shortest Movies", temp1), ("Top 20 Longest Movies", temp2)]
for _row, (_label, _frame) in enumerate(_panels, start=1):
    fig.add_trace(go.Bar(name=_label, x=_frame['title'], y=_frame['duration']), row=_row, col=1)
fig.update_layout(template="plotly_dark", height=900, width=1000, title="TOP 20 SHORTEST AND LONGEST MOVIES ON NETFLIX")

#### •	We can observe that Netflix has a wide range of Movies starting with Movies with a duration of just 3 minutes to Movies with a duration of up to 312 minutes.
#### •	The Shortest Movie on Netflix is called “Silent” which has a duration of 3 minutes and the Longest Movie on Netflix is called “Black Mirror: Bandersnatch” which has a duration of 312 minutes. 
# 

# 16.	Median of Movie Duration of Content available on Netflix

In [None]:
grp = df.groupby('type')
# Work on an independent copy so the assignment below does not write into a
# slice of `df`.
movie_df = grp.get_group('Movie').copy()
# Convert the leading number of each duration to a numeric value, row by row.
# Values stay aligned with their own rows; a missing or non-numeric duration
# becomes NaN instead of shifting the remaining values or raising a
# length-mismatch error.
movie_df['duration'] = pd.to_numeric(
    movie_df['duration'].astype(str).str.split(' ').str[0],
    errors='coerce',
)
# Violin plot with an embedded box plot and every individual movie drawn as a point.
fig = px.violin(movie_df, x='duration', box=True, points="all", labels={'duration':'Duration (in mins)'}, title = "VIOLIN AND BOX-PLOT REPRESENTATION OF MOVIE DURATION", template = "plotly_dark")
fig.show()

#### •	The median movie length is 98 mins, 50% of the movies have a duration between 86 mins and 114 mins. 
#### •	Shortest movie available on Netflix is 3 mins long whereas the longest movie available on Netflix is 312 mins long.
#### •	We can also see that the movie duration is normally distributed.
# 

# 17.	Geographic representation of Content Released in each Country

In [None]:
# Lookup table: country name -> ISO alpha-3 code.
iso = pd.read_csv('../input/country-codes-and-coordinates/countries_codes_and_coordinates.csv')[['Country','Alpha-3 code']]
df_map = pd.DataFrame()
# A title can list several countries separated by commas. Each name is stripped
# of surrounding whitespace so that "United States, India" counts towards
# "India" rather than " India" (which would never match the lookup table), and
# empty fragments left by stray commas are dropped.
country_names = df['country'].dropna().str.split(',').explode().str.strip()
x = np.array(country_names[country_names != ""].tolist(), dtype=str)
unique, counts = np.unique(x, return_counts=True)        
df_map['Country'] = unique
df_map['count'] = counts
# Countries without a match in the lookup table are dropped.
df_map = df_map.merge(iso, how='left', on='Country').dropna()
# Strips the first two characters and the last one from each code.
# NOTE(review): presumably the CSV stores codes as ' "XXX"' (space + quotes) — confirm.
df_map['Alpha-3 code'] = df_map['Alpha-3 code'].apply(lambda x:x[2:-1])

fig = go.Figure(data=go.Choropleth(locations=df_map['Alpha-3 code'],
                                    z=df_map['count'].astype(float),
                                    colorscale='bluered',
                                    text=df_map['Country'],
                                    marker_line_color='white',
                                    colorbar_title = 'Content count'))

fig.update_geos(projection_type="orthographic")
fig.update_layout(template = "plotly_dark", title = "GEOGRAPHIC REPRESENTATION OF CONTENT RELEASED IN DIFFERENT COUNTRIES")                
fig.show()

#### •	This interactive plot shows the Countries with different amounts of content on Netflix. 
#### •	This Orthographic representation gives a complete picture of the World along with the number of contents released on Netflix in each Country. 
#### •	We can clearly see that USA has most content on Netflix.  
# 