# <span style='color:Blue'> This is my first post on Kaggle; I would appreciate any tips for improvement. </span>

# TV Shows and Movies listed on Netflix


## This dataset consists of tv shows and movies available on Netflix as of 2021. The dataset is collected from Flixable which is a third-party Netflix search engine.
* show_id = Unique identification number of each title.
* type = Category of the title (Movie or TV Show).
* title = The name of the film or show.
* director = The name of the person who directed the title.
* cast = The actors in the title.
* country = The country of origin of the title.
* date_added = The date on which the title was added to the Netflix catalogue.
* release_year = The year in which the title was originally released.
* rating = Ratings do not say whether a film is appropriate or inappropriate; they are simply a way of describing the kind of content included in the title, to give parents a chance to make informed decisions about what they allow their children to watch.
* duration = The length of the title: minutes for movies (e.g. a two-hour film lasts 120 min) and number of seasons for TV shows.
* listed_in = The genres to which each title belongs.
* description = A short description of each title.

## Import Python libraries and dataset

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Load the Netflix titles table from the Kaggle input directory.
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# (number of rows, number of columns) of the raw table.
data.shape

In [None]:
# Peek at five randomly chosen rows.
data.sample(5)

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# List the distinct values of the two categorical columns of interest.
for column in ("type", "rating"):
    print(list(data[column].unique()))

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Build the cleaned working table `df` from the raw `data`.

# Replace every missing value with the placeholder 'Unknown' so the string
# operations below never receive NaN.
df = data.fillna('Unknown')

# Keep only the first entry of the comma-separated director / country lists.
for column in ('director', 'country'):
    df[column] = df[column].str.split(',').str[0]

# Rows whose add date or duration was missing cannot be converted to a year
# or to a number (int('Unknown') raises ValueError), so drop them up front.
has_date = df['date_added'] != 'Unknown'
has_duration = df['duration'] != 'Unknown'
df = df[has_date & has_duration].reset_index(drop=True)

# '90 min' / '2 Seasons' -> numeric amount in `duration`, unit in `time`.
df[['duration', 'time']] = df['duration'].str.split(expand=True)
df['duration'] = df['duration'].astype(int)

# Only the year of the add date is used later. Surrounding whitespace is
# stripped first so that every value is parsed with the same date format.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip()).dt.year
df.head(5)

In [None]:
# Column dtypes of the cleaned table `df`.
df.dtypes

In [None]:
# Box plot, with every individual point drawn, of the year each title was
# added, split by content type.
trace0 = go.Box(
    x=df['type'],
    y=df['date_added'],
    boxpoints='all',
    marker={'color': 'indianred'},
)
fig = [trace0]
iplot(fig)

In [None]:
# Restrict the analysis to titles added in 2015 or later, kept separately
# per content type and also stacked (TV shows first) in `df_cleaned`.
added_since_2015 = df['date_added'] >= 2015
df_tvshow = df[(df['type'] == 'TV Show') & added_since_2015]
df_movie = df[(df['type'] == 'Movie') & added_since_2015]
df_cleaned = pd.concat([df_tvshow, df_movie], ignore_index=True)
df_cleaned.sample(5)

# Directors with the highest number of films

In [None]:
# Ten directors with the most titles, separately for TV shows and movies.
#
# `director` was already reduced to the first listed name when `df` was
# cleaned, so no further splitting is needed here. The 'Unknown' placeholder
# (missing director) is removed by label rather than by skipping the first
# row, so the ranking stays correct even when 'Unknown' is not the most
# frequent value.
top10_tvshow = (
    df_tvshow['director']
    .value_counts()
    .drop('Unknown', errors='ignore')
    .head(10)
)
top10_movies = (
    df_movie['director']
    .value_counts()
    .drop('Unknown', errors='ignore')
    .head(10)
)

# Bar lengths (x) and bar labels (y) for the horizontal bar charts.
x1 = top10_tvshow.tolist()
x2 = top10_movies.tolist()

y1 = top10_tvshow.index.tolist()
y2 = top10_movies.index.tolist()

fig = make_subplots(rows=1, cols=2, subplot_titles=("Directors of TV Show", "Directors of Movie"))

fig.add_trace(
    go.Bar(name="Tv Show", y=y1, x=x1, orientation='h'),
    row=1, col=1,
)

fig.add_trace(
    go.Bar(name="Movie", y=y2, x=x2, orientation='h'),
    row=1, col=2,
)

# Horizontal legend placed above the top-left corner of the figure.
fig.update_layout(legend=dict(orientation="h",
                              yanchor="bottom",
                              xanchor="right",
                              y=1.10,
                              x=0.1))
fig.show()

# Actors and Actresses with the highest number of films

In [None]:
# Ten cast members appearing in the most titles, separately for TV shows
# and movies.
#
# Each `cast` cell is a comma-separated list of names: split it, put every
# name on its own row, trim surrounding whitespace and count occurrences.
# The 'Unknown' placeholder (missing cast) is removed by label rather than by
# skipping the first row, so the ranking does not depend on 'Unknown' being
# the most frequent value.
top_cast_tvshow = df_tvshow['cast'].str.split(',')
cast_tvshow = (
    top_cast_tvshow
    .explode()
    .str.strip()
    .value_counts()
    .drop('Unknown', errors='ignore')
)

x1 = cast_tvshow.head(10).tolist()
y1 = cast_tvshow.head(10).index.tolist()

top_cast_movie = df_movie['cast'].str.split(',')
cast_movie = (
    top_cast_movie
    .explode()
    .str.strip()
    .value_counts()
    .drop('Unknown', errors='ignore')
)

x2 = cast_movie.head(10).tolist()
y2 = cast_movie.head(10).index.tolist()

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Bar(name="Tv Show", y=y1, x=x1, orientation='h'),
    row=1, col=1,
)

fig.add_trace(
    go.Bar(name="Movie", y=y2, x=x2, orientation='h'),
    row=1, col=2,
)

fig.update_traces(hoverinfo="y+name+x")
fig.update_layout(title="Actors and Actresses with the highest number of films",
                  title_x=0.5,
                  legend=dict(orientation="h",
                              yanchor="bottom",
                              xanchor="right",
                              y=1.02,
                              x=1))
fig.show()

In [None]:
# Number of titles per rating, most frequent first, for each content type.
#
# `reset_index(name='counts')` names the count column explicitly. Renaming
# a column called 0 only works on pandas versions where `value_counts()`
# returns an unnamed Series; on pandas >= 2.0 the column is called 'count'
# and such a rename would silently do nothing.
rating_tvshow = df_tvshow[['rating']].value_counts().reset_index(name='counts')
print(rating_tvshow)
print('==' * 10)

rating_movie = df_movie[['rating']].value_counts().reset_index(name='counts')
print(rating_movie)

In [None]:
# Rating labels, most frequent first: TV shows, then movies.
for rating_table in (rating_tvshow, rating_movie):
    print(list(rating_table['rating']))

* TV-MA : This program is intended to be viewed by mature, adult audiences and may be unsuitable for children under 17.
* TV-14 : This program may be unsuitable for children under 14 years of age.
* TV-PG : This program contains material that parents may find unsuitable for younger children. Parental guidance is recommended.
* TV-Y  : This program is aimed at a very young audience, including children from ages 2–6.
* TV-Y7 : This program is most appropriate for children age 7 and up.
* TV-G  : This program is suitable for all ages.
* TV-Y7-FV : (Directed to Older Children - Fantasy Violence) Intended for older children. Contains fantasy violence more combative than TV-Y7 programs.

* G     : (General Audiences) This program is designed to be appropriate for all ages. 
* PG    : (Parental Guidance Suggested) This film may contain some material parents might not like for their young children.
* PG-13 : (Parents Strongly Cautioned) May contain violence, nudity, sensuality, language, adult activities or other elements beyond a PG rating, but doesn’t reach the restricted R category.
* R     : (Restricted) This rating is for films specifically designed to be viewed by adults and therefore may be unsuitable for children under 17.
* NC-17 : (Clearly Adult) This rating is applied to films the MPAA believes most parents will consider inappropriate for children 17 and under.

* If an uncut version of a film was submitted to the MPAA, the labels Not Rated (NR) or Unrated (UR) are often used.

In [None]:
# Two side-by-side donut charts with the rating share of TV shows (left)
# and movies (right).
label1 = rating_tvshow['rating'].tolist()
label2 = rating_movie['rating'].tolist()

# Pie traces need 'domain' subplots instead of the default x/y axes.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
)

tvshow_pie = go.Pie(labels=label1, values=rating_tvshow['counts'], name="TV SHOW")
movie_pie = go.Pie(labels=label2, values=rating_movie['counts'], name="MOVIE")
fig.add_trace(tvshow_pie, row=1, col=1)
fig.add_trace(movie_pie, row=1, col=2)

# A hole turns each pie into a donut.
fig.update_traces(hole=.5, hoverinfo="label+value+name")

# Captions placed in the centre of each donut.
donut_captions = [
    dict(text='TV SHOW', x=0.185, y=0.5, font_size=20, showarrow=False),
    dict(text='MOVIE', x=0.80, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title="Rating TV Show and Movie",
    title_x=0.5,
    annotations=donut_captions,
)

fig.show()

## COUNTRIES ADDING THE MOST TV SHOWS AND MOVIES BY YEAR

In [None]:
# Movies added per year for a fixed selection of countries.
countries_movie = df_movie[['country', 'date_added']]
countries_movie = countries_movie[countries_movie['country'].str.contains('United States|India|United Kingdom|Canada|France|Spain|Egypt|Turkey|Mexico|Philippines')]

# One row per (country, year) with the number of movies in `counts`.
# `reset_index(name='counts')` names the column explicitly; renaming a
# column called 0 has no effect on pandas >= 2.0, where it is 'count'.
countries_movie = (
    countries_movie
    .value_counts()
    .reset_index(name='counts')
    .sort_values(['date_added'])  # chronological order so lines run left to right
)

fig = px.line(countries_movie, x="date_added", y="counts", color='country')
fig.update_layout(title='Growth of Movie by main countries',
                  title_x=0.5,
                  xaxis_title='Year',
                  yaxis_title='Frequency'
                  )
fig.show()

In [None]:
# TV shows added per year for a fixed selection of countries.
countries_tvshow = df_tvshow[['country', 'date_added']]
countries_tvshow = countries_tvshow[countries_tvshow['country'].str.contains('United States|United Kingdom|South Korea|Japan|India|Taiwan|Canada|France|Australia|Spain')]

# One row per (country, year) with the number of TV shows in `counts`.
# `reset_index(name='counts')` names the column explicitly; renaming a
# column called 0 has no effect on pandas >= 2.0, where it is 'count'.
countries_tvshow = (
    countries_tvshow
    .value_counts()
    .reset_index(name='counts')
    .sort_values(['date_added'])  # chronological order so lines run left to right
)

fig = px.line(countries_tvshow, x="date_added", y="counts", color='country')
fig.update_layout(title='Growth of TV Shows by main countries',
                  title_x=0.5,
                  xaxis_title='Year',
                  yaxis_title='Frequency'
                  )

fig.show()

# Distribution of the duration of TV shows and movies

In [None]:
# Histogram plus density curve (no rug plot) of the movie `duration` column.
hist_data = [df_movie['duration']]
group_labels = ['min']

fig = ff.create_distplot(hist_data, group_labels, show_rug=False)
fig.update_layout(
    title={'text': 'Distribution of Movie duration', 'x': 0.5},
    showlegend=False,
)
fig.show()


In [None]:
# Histogram of the number of seasons per TV show.
#
# Tick positions and labels are derived from the data: one tick for every
# season count from 1 up to the largest value present, including counts
# that no show has. This keeps labels aligned with their positions whatever
# set of season counts the dataset contains.
max_seasons = int(df_tvshow['duration'].max())
value = np.arange(1, max_seasons + 1)
lista = [f"{n} season" for n in value]

fig = px.histogram(x=df_tvshow['duration'],
                   nbins=40,
                   labels={'x': 'Seasons'})
fig.update_layout(title="Distribution of TV Show duration",
                  title_x=0.5)
fig.update_xaxes(tickvals=value, ticktext=lista, tickangle=45)
fig.show()

# Growth of TV Show by genre in United States

In [None]:
# Count how many TV shows fall into each genre and show the five most
# common ones.
#
# `listed_in` is a comma-separated list of genres: split it, put every genre
# on its own row and trim surrounding whitespace before counting.
ts_genre = df_tvshow['listed_in'].str.split(',')
genre_tvshow = (
    ts_genre
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('genre')          # index -> 'genre' column on every pandas version
    .reset_index(name='counts')    # explicit count column name
)
list(genre_tvshow['genre'][:5])

In [None]:
# Number of United States TV shows per (year added, genre), restricted to
# five selected genres.
g3_data = df_tvshow

# One row per (title, genre): split the comma-separated `listed_in` list
# and explode it, keeping the country and add year next to every genre.
g4_data = (
    g3_data[['country', 'date_added']]
    .assign(genre=g3_data['listed_in'].astype(str).str.split(','))
    .explode('genre', ignore_index=True)
)
g4_data['genre'] = g4_data['genre'].str.strip()

# Count identical (country, year, genre) rows for the United States.
# `reset_index(name='counts')` names the count column explicitly; renaming
# a column called 0 has no effect on pandas >= 2.0, where it is 'count'.
g4_data = (
    g4_data[g4_data['country'] == 'United States']
    .value_counts()
    .reset_index(name='counts')
    .sort_values('date_added')
)
g4_data = g4_data[g4_data['genre'].str.contains("International TV Shows|TV Dramas|TV Comedies|Crime TV Shows|Kids' TV")]

In [None]:
# Line chart: US TV shows added per year, one line per selected genre.
fig = px.line(g4_data, x="date_added", y="counts", color='genre')
fig.update_layout(title='Growth of TV Show by genre in United States',
                  title_x=0.5,
                  xaxis_title='Year',
                  yaxis_title='Frequency'  # axis label spelling corrected
                  )
fig.show()

In [None]:
# Same data as a bar chart: one bar per year, coloured by genre.
fig = px.bar(g4_data, x="date_added", y="counts", color='genre')
fig.update_layout(
    title={'text': "Growth of TV Show by genre in United States", 'x': 0.5},
)
fig.show()

# Growth of Movie by genre in United States

In [None]:
# Count how many movies fall into each genre and show the five most common
# ones.
#
# `listed_in` is a comma-separated list of genres: split it, put every genre
# on its own row and trim surrounding whitespace before counting.
m_genre = df_movie['listed_in'].str.split(',')
genre_movie = (
    m_genre
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('genre')          # index -> 'genre' column on every pandas version
    .reset_index(name='counts')    # explicit count column name
)
list(genre_movie['genre'][:5])

In [None]:
# Number of United States movies per (year added, genre), restricted to
# five selected genres.
g1_data = df_movie

# One row per (title, genre): split the comma-separated `listed_in` list
# and explode it, keeping the country and add year next to every genre.
g2_data = (
    g1_data[['country', 'date_added']]
    .assign(genre=g1_data['listed_in'].astype(str).str.split(','))
    .explode('genre', ignore_index=True)
)
g2_data['genre'] = g2_data['genre'].str.strip()

# Count identical (country, year, genre) rows for the United States.
# `reset_index(name='counts')` names the count column explicitly; renaming
# a column called 0 has no effect on pandas >= 2.0, where it is 'count'.
g2_data = (
    g2_data[g2_data['country'] == 'United States']
    .value_counts()
    .reset_index(name='counts')
    .sort_values('date_added')
)
g2_data = g2_data[g2_data['genre'].str.contains('International Movies|Dramas|Comedies|Documentaries|Action & Adventure')]

In [None]:
# Bar chart: US movies added per year, coloured by selected genre.
fig = px.bar(g2_data, x="date_added", y="counts", color='genre')
fig.update_layout(
    title={'text': "Growth of Movie by genre in United States", 'x': 0.5},
)
fig.show()