In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno #missing data vizualization library

In [None]:
# Load the Netflix titles dataset from the Kaggle input directory.
NETFLIX_CSV = '/kaggle/input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(NETFLIX_CSV)

In [None]:
#RAISED QUESTIONS
#1. What is the split between Movies and TV Shows?
#2. Which director has the highest number of movies?
#3. Which country has the highest number of movies?
#4. What's the distribution of movies per year?
#5. What's the distribution of TV Shows per year?
#6. How do movie and TV show durations behave over the years?
#7. What's the distribution of movie genres on NETFLIX?

In [None]:
# Dataset first check: draw missingno's matrix view of `df` to see where
# values are missing.
msno.matrix(df)

In [None]:
# Report the percentage of null values for every column that has any.
null_pct_by_column = df.isna().sum() / len(df) * 100

for column_name, null_pct in null_pct_by_column.items():
    if null_pct > 0:
        print(f'The column {column_name} has {null_pct:.2f} % of null data')
    


In [None]:
# Replace missing values in the three text columns with a placeholder label.
# NOTE(review): only these columns are filled here; confirm against the
# null-rate report whether any other column still contains nulls.
text_columns = ['director', 'cast', 'country']
df[text_columns] = df[text_columns].fillna('No_information')

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

In [None]:
# `duration` holds text such as "90 min": split it once on the space and keep
# the first token as the amount and the second token as its unit.
duration_tokens = df['duration'].str.split(' ')
df['duration_number'] = duration_tokens.str[0]
df['duration_class'] = duration_tokens.str[1]

In [None]:
# The raw `duration` text is now redundant with the two derived columns.
df = df.drop(columns=['duration'])

In [None]:
# Parse the duration amount (still text after the split) into a numeric dtype.
# Unparseable values raise, which is pd.to_numeric's default behaviour.
df['duration_number'] = pd.to_numeric(df['duration_number'], errors='raise')

In [None]:
# One-hot encode the genres in `listed_in` and swap the text column for the
# indicator columns. The separator must be comma + space: splitting on the
# bare comma leaves a leading space on every genre after the first, which
# produces duplicated indicator columns.
genre_flags = df['listed_in'].str.get_dummies(sep=', ')
df = df.drop(columns='listed_in').join(genre_flags)

In [None]:
# Genre labels used to select the indicator columns in later analysis.
# Despite the name, the list holds both movie and TV show genres.
movie_genre = [
    'Action & Adventure',
    'Anime Features',
    'Anime Series',
    'British TV Shows',
    'Children & Family Movies',
    'Classic & Cult TV',
    'Classic Movies',
    'Comedies',
    'Crime TV Shows',
    'Cult Movies',
    'Documentaries',
    'Docuseries',
    'Dramas',
    'Faith & Spirituality',
    'Horror Movies',
    'Independent Movies',
    'International Movies',
    'International TV Shows',
    "Kids' TV",
    'Korean TV Shows',
    'LGBTQ Movies',
    'Movies',
    'Music & Musicals',
    'Reality TV',
    'Romantic Movies',
    'Romantic TV Shows',
    'Sci-Fi & Fantasy',
    'Science & Nature TV',
    'Spanish-Language TV Shows',
    'Sports Movies',
    'Stand-Up Comedy',
    'Stand-Up Comedy & Talk Shows',
    'TV Action & Adventure',
    'TV Comedies',
    'TV Dramas',
    'TV Horror',
    'TV Mysteries',
    'TV Sci-Fi & Fantasy',
    'TV Shows',
    'TV Thrillers',
    'Teen TV Shows',
    'Thrillers',
]

In [None]:
# Number of genre flags set on each title (a value above 1 means the title is
# listed under more than one genre).
df['genre_sum'] = df.loc[:, movie_genre].sum(axis='columns')

In [None]:
#Question 1 - The division of Movies and TV Shows

In [None]:
#Answer Q.1.1
# Count the titles of each type and print their share of the catalogue.
# An exact comparison is used instead of str.count, which treats its argument
# as a regular expression and counts substring matches inside each value.
total_titles = len(df)
movie_count = (df['type'] == 'Movie').sum()
tv_count = (df['type'] == 'TV Show').sum()
pct_movies = (movie_count / total_titles) * 100
pct_tvshow = (tv_count / total_titles) * 100
print(f'The total number of enterteinment content on NETFLIX is {total_titles}')
# The stray "|" that used to follow "represents" in this message was removed.
print(f'We have {movie_count} movies on NETFLIX that represents {pct_movies:.2f} %')
print(f'We have {tv_count} TV Shows on NETFLIX that represents {pct_tvshow:.2f} %')


In [None]:
#Answer Q.1.2
# Bar chart of how many titles fall under each `type` value.
type_ax = sns.countplot(x='type', data=df)
type_ax.set_title('Number of Movies and TV Shows in Netflix')
type_ax.set_xlabel('Type')
type_ax.set_ylabel('Number')
plt.show()

In [None]:
#Q.1.3
# Same split drawn as a pie chart on a white figure background.
plt.figure(facecolor='w')
type_counts = df['type'].value_counts()
type_counts.plot(kind='pie')

# The pie shows how the data behaves, but it is not the clearest way to read it.

In [None]:
#Q.1.4 - The conclusion for the first question is that the number of TV Shows is less than half the number of Movies on NETFLIX

In [None]:
#QUESTION 2 - WHICH DIRECTOR HAS THE HIGHEST NUMBER OF MOVIES ON NETFLIX?

In [None]:
# Keep only the rows whose type is 'Movie' for the movie-specific questions.
is_movie = df['type'] == 'Movie'
movies = df.loc[is_movie]

In [None]:
# Count how many movie rows carry each `director` value, as a two-column frame.
# NOTE(review): value_counts works on the whole cell text, so a row that lists
# several directors in one string (if any exist) is counted as its own value.
x = movies['director'].value_counts().reset_index()
x.columns = ['Director', 'number_of_movies']

In [None]:
# Remove the placeholder entry that stands for movies with no director
# information. The row is selected by its label rather than by position 0, so
# the step does not rely on the placeholder being the most frequent value and
# can be re-run safely. The remaining rows keep their original index labels.
x = x[x['Director'] != 'No_information']

In [None]:
# Show the five directors with the most movies.
directors_ranked = x.sort_values('number_of_movies', ascending=False)
directors_ranked.head(5)

In [None]:
#Q.2.1
# Look the answer up from the counts instead of reading hard-coded index
# label 1, which is only correct when the placeholder row happened to be
# label 0. The placeholder is excluded here as well, so this cell gives the
# right answer whether or not it was already dropped. idxmax returns the first
# row holding the maximum count.
named_directors = x[x['Director'] != 'No_information']
top_director_label = named_directors['number_of_movies'].idxmax()
top_director = named_directors.loc[top_director_label, 'Director']
print(f'The director with the highest number of movies is {top_director}')

In [None]:
#QUESTION 3 - WHAT COUNTRY HAS THE HIGHEST NUMBER OF MOVIES?

In [None]:
# One-hot encode the countries listed for each movie, splitting each cell on
# comma + space.
movie_country = movies['country'].str.get_dummies(sep=', ')

In [None]:
# Total the indicator columns to get the number of movies per country, sorted
# from largest to smallest, and turn the result into a two-column frame.
country_totals = movie_country.sum().sort_values(ascending=False)
m_c = country_totals.reset_index()
m_c.columns = ['Country', 'Total_Movies']

In [None]:
# Bar chart of the five rows with the highest totals.
top_five = m_c.head(5)
top_five_ax = sns.barplot(x='Country', y='Total_Movies', data=top_five)
top_five_ax.set_title('Top 5 - Movie Maker Countries')
plt.show()

In [None]:
# Plot the countries ranked right after the top 5, leaving out the placeholder
# row. The placeholder is removed by label rather than by a hard-coded
# position, so the selection no longer depends on where 'No_information'
# happens to land in the ranking.
ranked_countries = m_c[m_c['Country'] != 'No_information']
# Positions 5..31 of the filtered ranking: the same 27 rows as the old 6:33
# slice whenever the placeholder sat at position 5.
next_countries = ranked_countries.iloc[5:32]
plt.figure(figsize = (20,10))
g1 = sns.barplot(data = next_countries, x = 'Country', y = 'Total_Movies')
# Restyle the tick labels seaborn already placed instead of overwriting them.
plt.xticks(rotation = 45, horizontalalignment = 'right')
# The count in the title comes from the rows actually plotted; the previous
# title said 26 while the slice drew 27 bars.
plt.title(f'Top {len(next_countries)} - Movie Maker Countries - Excluding TOP 5 and No Information Data')
plt.show()

In [None]:
#Q.3.1 - From the plot and the sorted frame, the author's conclusion is that
# the United States is the biggest movie producer, as expected given its
# tradition and very large entertainment market.
# The first row of m_c is the one with the highest total.
top_country = m_c['Country'].iloc[0]
top_country_total = m_c['Total_Movies'].iloc[0]
print(f'Based on the plot analysis and the dataframe sorted by values, we conclude that the biggest movie maker is {top_country} with a total of {top_country_total} movies')

In [None]:
#QUESTION 4 - WHAT IS THE DISTRIBUTION OF MOVIES PER RELEASE YEAR

In [None]:
# Count of movies for each release year.
plt.figure(figsize=(15, 7.5))
ano = sns.countplot(x='release_year', data=movies)
# Label the bars with the sorted distinct release years.
movie_years = sorted(movies['release_year'].unique())
ano.set_xticklabels(labels=movie_years, rotation=45, fontsize=6.5)
ano.set_title('Distribution of movies per release year')
ano.set_xlabel('Release Year (1942 - 2020 YTD Sep)')
ano.set_ylabel('Number Of Movies')
plt.show()


In [None]:
#Q.4.1 WE CAN OBSERVE THAT THE NUMBER OF MOVIES INCREASED EXPONENTIALLY OVER THE YEARS. THE GROWTH WAS TRIGGERED IN THE 90'S.

In [None]:
#QUESTION 5 - WHAT'S THE DISTRIBUTION OF THE NUMBER OF TV SHOWS OVER THE YEARS?

In [None]:
# Keep only the rows whose type is 'TV Show' for the TV-specific questions.
is_tv_show = df['type'] == 'TV Show'
tv = df.loc[is_tv_show]

In [None]:
# Count of TV shows for each release year.
plt.figure(figsize=(15, 7.5))
y1 = sns.countplot(x='release_year', data=tv)
y1.set_title('Distribution of TV Shows per release year')
y1.set_xlabel('Release Year (1925 - 2020 YTD Sep)')
y1.set_ylabel('Number of TV Show')
# Label the bars with the sorted distinct release years.
tv_years = sorted(tv['release_year'].unique())
y1.set_xticklabels(labels=tv_years, rotation=45)
plt.show()

In [None]:
#Q.5.1 WE SEE THAT, UNLIKE THE MOVIE DATASET, TV SHOWS STARTED THEIR EXPONENTIAL INCREASE AFTER 2000. THE TREND IS THE SAME,
#BUT THE NUMBERS ARE QUITE DIFFERENT. ALTHOUGH BOTH ARE PART OF THE ENTERTAINMENT BUSINESS, TV SHOWS ARE LONGER THAN MOVIES.
#THE LONGEST MOVIES RUN 4-5 HOURS, WHILE TV SHOWS ARE MEASURED IN SEASONS AND CAN STAY ON TV FOR DECADES, e.g. SEINFELD, FRIENDS, SMALLVILLE.

In [None]:
#QUESTION 6 - How do movie and TV show durations behave over the years?

In [None]:
# Box plot of movie duration (in minutes) for each release year.
first_movie_year = movies['release_year'].min()
plt.figure(figsize = (15,7.5))
mov_dur = sns.boxplot(data = movies, x = 'release_year', y = 'duration_number')
plt.xticks(fontsize = 8, rotation = 90)
# The y-axis is capped at 240, so any longer duration is not visible.
plt.ylim(0,240)
plt.title('Movies duration distribution along the release years')
plt.ylabel('Duration in minutes')
# The start year is read from the movie data. The previous hard-coded 1925
# contradicted the 1942 shown on the movie count plot and matched the TV show
# range instead.
plt.xlabel(f'Release Year ({first_movie_year} - 2020 YTD Sep)')
# One tick every 20 minutes, from 20 up to the 240 cap.
plt.yticks(ticks = list(range(20, 241, 20)))
plt.show()

In [None]:
# Distribution of movie durations: density histogram with a KDE curve.
# sns.distplot is deprecated in seaborn; histplot with stat='density' and
# kde=True draws the same kind of figure (default bin selection may differ).
plt.figure(figsize = (15,7.5))
sns.histplot(movies['duration_number'], kde = True, stat = 'density')
plt.xlabel('Duration in Minutes')
plt.title('Movie duration distribution')
plt.show()

In [None]:
# Print the most frequent and the median movie duration.
movie_durations = movies['duration_number']
movie_duration_mode = movie_durations.mode()[0]  # first of the modal values
movie_duration_median = movie_durations.median()
print(f'Movie duration dataset statistics:\nmode: {movie_duration_mode}\nmedian: {movie_duration_median}')

In [None]:
# Box plot of TV show duration (in seasons) for each release year.
plt.figure(figsize=(15, 7.5))
tv_dur_ax = sns.boxplot(x='release_year', y='duration_number', data=tv)
tv_dur_ax.set_xlabel('Release Year')
tv_dur_ax.set_ylabel('Duration in Seasons')
tv_dur_ax.set_title('TV Show duration distribution along the years')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Distribution of TV show durations in seasons: 10-bin density histogram with
# a KDE curve. sns.distplot is deprecated in seaborn; histplot with
# stat='density' and kde=True draws the same kind of figure.
plt.figure(figsize = (15,7.5))
sns.histplot(tv['duration_number'], kde = True, bins = 10, stat = 'density')
plt.xlabel('Duration in Seasons')
plt.title('Duration in Seasons')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Print the most frequent and the median TV show duration.
tv_durations = tv['duration_number']
tv_duration_mode = tv_durations.mode()[0]  # first of the modal values
tv_duration_median = tv_durations.median()
print(f'TV Show duration dataset statistics:\nmode: {tv_duration_mode}\nmedian: {tv_duration_median}')

In [None]:
#QUESTION 7. What's the distribution of movie genres on NETFLIX?

In [None]:
# Genre indicator columns for the movie rows only.
mov_gen = movies.loc[:, movie_genre]

In [None]:
# Keep only the genre columns that at least one movie is flagged with; a
# column whose total is 0 has no movies (in practice the TV-only genres).
genre_has_movies = mov_gen.sum(axis=0) != 0
mov_gen = mov_gen.loc[:, genre_has_movies]

In [None]:
# Horizontal bar chart of the number of movies per genre, with the largest
# genre drawn at the top.
plt.figure(figsize=(15, 7.5))
genre_totals = mov_gen.sum().sort_values(ascending=True)
genre_ax = genre_totals.plot(kind='barh', grid=True)
genre_ax.set_title('Number of movies per genre')
genre_ax.set_xlabel('Number of movies')
genre_ax.set_ylabel('Movie genre')
plt.show()