In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)


As always, let's start by importing the data!

In [None]:
# Load the Netflix titles CSV into the working DataFrame.
df = pd.read_csv(
    "../input/netflix-shows/netflix_titles.csv"
)

Let's explore the data first.

In [None]:
# Preview the first ten rows.
df.iloc[:10]

In [None]:
# Size of the data as (number of rows, number of columns)
len(df.index), len(df.columns)

In [None]:
# Clean the data: keep only the columns used in the analysis and drop the unnecessary ones.
# Drop the show_id column. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second run no longer raises
# KeyError because show_id is already gone.
df = df.drop(columns = ["show_id"], errors = "ignore")
df.head()

In [None]:
# show_id has been dropped. Next, break date_added down into month, day and year parts.
# NOTE(review): the four-name assignment only works if the split yields four pieces for at
# least one row (e.g. a value containing an extra space); for such rows the pieces would sit
# one column further right than for the others - confirm against the data.
date_pieces = df["date_added"].str.split(" ", expand = True)
df[["month_added", "day_added", "year_added", "null1"]] = date_pieces


In [None]:
# Remove the helper column null1 (in place) and strip the "," from day_added
del df["null1"]
df["day_added"] = df["day_added"].str.replace(",", "")
df.head(5)

In [None]:
# List the current columns before reordering them and dropping date_added.
# For a DataFrame, keys() returns the column index.
df.keys()

In [None]:
# Keep the columns in the order used by the analysis; date_added is left out
# because it has already been split into month/day/year columns.
column_order = [
    'type', 'title', 'country',
    'month_added', 'day_added', 'year_added',
    'release_year', 'rating', 'duration',
    'listed_in', 'description', 'director', 'cast',
]
df = df[column_order]

In [None]:
# Preview the reordered frame (head() defaults to five rows).
df.head()

In [None]:
# Basic EDA to understand the data first, starting with the Movie / TV Show split.
# All charts are drawn with plotly.

import plotly.graph_objects as go
import math

# Number of titles per type, kept as a one-column DataFrame indexed by the type label.
df_type = pd.DataFrame(df["type"].value_counts())
# Take the counts by position: the column is labelled "type" on pandas < 2.0 but
# "count" from pandas 2.0 on, so a label lookup is version-dependent.
type_counts = df_type.iloc[:, 0]

fig = go.Figure(data=go.Bar(x = type_counts.index, y = type_counts))
fig.show()

# Scalar ratio (no float() call on a Series), printed with two decimals.
movie_to_tv_ratio = type_counts["Movie"] / type_counts["TV Show"]
print(f'Netflix has {movie_to_tv_ratio:.2f} times as many movies as TV shows.')
# Original author's note: the ratio came out at 2.23 on their data.

In [None]:
# EDA for country: number of titles per distinct value of the country column.
country_counts = df["country"].value_counts()
df_country = country_counts.to_frame()
df_country.head(5)

In [None]:
# How many distinct country values are there? Printed, because a bare expression
# in the middle of a cell is never displayed.
print(df_country.shape)
# Only the top 15 countries are visualised.
# Sort on the counts column selected by position: from pandas 2.0 on that column is
# named "count" and "country" is the index name, so by=["country"] would refer to
# the index (country names) rather than to the counts.
df_country = df_country.sort_values(by = df_country.columns[0], ascending = False)
df_count15 = df_country.head(15)

In [None]:
from plotly.subplots import make_subplots

# EDA for the top 15 countries: a bar chart (left, wider) next to a line chart (right)
# of the same counts.
# Counts are taken by position because the column label differs between pandas
# versions ("country" before 2.0, "count" from 2.0 on).
top_country_counts = df_count15.iloc[:, 0]

fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])

fig.add_trace(go.Bar(x=df_count15.index, y=top_country_counts),
              row=1, col=1)

fig.add_trace(go.Scatter(x=df_count15.index, y=top_country_counts),
              row=1, col=2)

fig.show()

In [None]:
# How about the rating?
# EDA for rating: pie chart of the number of titles per rating.
df_rating = pd.DataFrame(df["rating"].value_counts())
# Counts selected by position so the cell does not depend on the column label
# ("rating" on pandas < 2.0, "count" from pandas 2.0 on).
rating_counts = df_rating.iloc[:, 0]

fig = go.Figure(data=[go.Pie(labels=rating_counts.index, values=rating_counts)])
fig.show()

In [None]:
# Explore the listed_in column: collect every category tag attached to a title.
import re

# Each entry is split on ", ", "&" and "@" (same pattern as before), the pieces are
# stripped of surrounding spaces, and empty pieces are discarded.
# Splitting entry by entry avoids the trailing "@" of the old merged string, which
# added one spurious empty tag to the list (and to the count printed below).
# Missing entries are skipped instead of raising a TypeError.
merged = [
    piece.strip()
    for entry in df["listed_in"].dropna()
    for piece in re.split(', |&|@', entry)
    if piece.strip()
]
print(f"in total we have {df.shape[0]} tv series and movies and overall it is defined by {len(merged)} which accounts for {len(merged)/ df.shape[0]} per show")

In [None]:
# Which categories of TV show or movie are listed on Netflix the most?
list_value = pd.DataFrame(merged).value_counts()
list_value = pd.DataFrame(list_value)
list_value.columns = ["list"]
# DataFrame.value_counts() indexes its result by a MultiIndex; take its single level
# to get plain category labels for the x axis (replaces the manual tuple unwrapping
# and the leftover debug expressions).
category_labels = list_value.index.get_level_values(0).tolist()
fig = go.Figure(data=go.Bar(x = category_labels, y = list_value["list"]))
fig.show()

In [None]:
# Top 10 Netflix categories by raw count (not normalised by the number of titles).
list_value.iloc[:10]

In [None]:
# Number of titles added per year (year_added), split by type.
# assign() builds a new frame, so adding the helper column does not write into a
# slice of df (no SettingWithCopyWarning).
df_release = df[["year_added","type"]].assign(dummy = 1)
table = pd.pivot_table(df_release, values = "dummy",
                      index = ["year_added"],columns = ["type"],
                      aggfunc = "sum", fill_value = 0)

date = np.arange(2008, 2022, 1)

# year_added holds strings, so pick the years 2008-2021 by label. This replaces the
# hard-coded positional slice [12:26], which only lined up with these years for one
# particular set of index entries. Years with no titles show up as 0.
year_labels = [str(year) for year in date]
table_by_year = table.reindex(year_labels, fill_value = 0)

fig = make_subplots(rows=1, cols=2, column_widths=[0.5, 0.5])

fig.add_trace(go.Bar(x=date, y=table_by_year["Movie"]),
              row=1, col=1)

fig.add_trace(go.Bar(x=date, y=table_by_year["TV Show"]),
              row=1, col=2)

fig.show()

print("Trace 0 is Movie and Trace 1 is TV Show. We can see that number of TV show is increasing whereas Movie has decrease from 2020")

### Now I have an interesting hypothesis: if a movie or TV show is {good}, then Netflix will {add it to their website fast}

Let's see whether my hypothesis is true!

Here we have two terms that we have to define:

Good -> a good rating on Rotten Tomatoes (needs to be pulled in from a different data set)
Add to their website fast -> year_added - release_year (we can make an additional column)

To be continued...

In [None]:
# Share of all titles (with a year_added value) that were added in 2016-2021.
added_counts = df.groupby(by = "year_added")["type"].count()
# Same one-column frame the cell produced before, kept under its original name.
a = pd.DataFrame(added_counts)
date = list(map(str,np.arange(2016,2022,1)))
# reindex() tolerates a year with no titles (counted as 0), where .loc with a list
# of labels would raise KeyError. Working on the Series gives a scalar directly, so
# no float() conversion of a pandas object is needed.
added_per = float(added_counts.reindex(date, fill_value = 0).sum() / added_counts.sum()) * 100
print(f' {added_per:.2f} % of movie and tv shows were added between 2016 - 2021 so we will only examine the movie and tv shows that were released after 2016')


In [None]:
# Titles released after 2015, with year_added converted to int and a "fast" column
# (years between release and being added to Netflix).
# .copy() makes df_newMovie an independent frame, so the column assignments below
# do not write into a slice of df.
df_newMovie = df[df['release_year'] > 2015].copy()
prev_size = df_newMovie.shape[0]
print(prev_size)

# Normalise year_added to stripped text and record its length (helper column kept
# from the original version of this cell).
df_newMovie["year_added"] = df_newMovie["year_added"].astype(str).str.strip()
df_newMovie["year_added_len"] = df_newMovie["year_added"].str.len()

# Keep only rows whose year_added is exactly four digits. A pure length check would
# also let four-character non-numeric text such as "None" through, which then breaks
# the int conversion below.
is_four_digit_year = df_newMovie["year_added"].str.match(r"^\d{4}$")
df_newMovie = df_newMovie[is_four_digit_year].copy()
new_size = df_newMovie.shape[0]
print(df_newMovie.shape)
print(f' {prev_size - new_size} row has been deleted')

df_newMovie["year_added"] = df_newMovie["year_added"].astype(int)
df_newMovie["fast"] = df_newMovie["year_added"] - df_newMovie["release_year"]



Okay, we finally have the "fast" column. Let's see what it looks like.

In [None]:
# Distribution of the release-to-added gap in years.
df_newMovie.loc[:, "fast"].value_counts()


In [None]:
# Load the additional title data set. The file is tab-separated, so the separator
# must be the tab character "\t" (the previous "/t" is a literal slash followed by t).
df_title = pd.read_csv("../input/titletsv/title.tsv", sep = "\t")