My blog on the same is available at - https://jpchii.github.io/2021/03/16/netflixdataanalysis.html

In [None]:
# importing the data analysis libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Analysing the columns of the dataset

In [None]:
# importing the csv
# The path is relative to the notebook's working directory.
shows_data = pd.read_csv("../input/netflix-shows/netflix_titles.csv")
# Bare expression: the notebook renders the frame as this cell's output.
shows_data

In [None]:
shows_data.columns

In [None]:
# dataypes
shows_data.dtypes

In [None]:
# index range
shows_data.index

In [None]:
shows_data.info()

In [None]:
shows_data.head()

In [None]:
shows_data["type"].unique()

In [None]:
shows_data["country"].str.contains("India")

In [None]:
shows_data["release_year"].unique()

## Detailed description on the dataset columns

* show_id - A unique id assigned to every available tv show or movie
* type - What type of content is it - `TV Show or Movie`
* title - Title of the content
* director - director of the content
* cast - cast(actors) of the content
* country - Country where the movie / show was produced
* date_added - Date on which the content was added to Netflix
* rating
* duration - Duration in minutes or seasons
* listed_in - Genre
* description - short description about the movie/show

### Analysis based on release year

In [None]:
np.arange(1940, 1980 + 1, 1)

In [None]:
# Split the titles into two release-year windows so each histogram stays readable.
# Series.between is inclusive on both ends: 1940..1980 and 1981..2020.
ry_40_80 = shows_data[shows_data["release_year"].between(1940, 1980)]
ry_81_20 = shows_data[shows_data["release_year"].between(1981, 2020)]
# Release-year columns of each window, used for the histograms.
ry48, ry82 = ry_40_80["release_year"], ry_81_20["release_year"]

In [None]:
# Content analysis based on release year, 1940 to 1980 (one bar per year).
# NOTE(review): ry_40_80 is rebound from the filtered frame to the full
# release_year column here; the rebinding is kept because later cells may read
# this name.
ry_40_80 = shows_data["release_year"]
# The last histogram bin is closed on both sides, so the edges must run one past
# the final year; with edges ending at max(ry48) the last two years share a bar.
bins = np.arange(min(ry48), max(ry48) + 2, 1)
fig, ax = plt.subplots(figsize=(30, 10))
# Plot the pre-filtered series so only 1940-1980 titles can land in the bins.
ax.hist(ry48, bins=bins, edgecolor='black', color='yellow')
ax.set_xticks(bins[:-1])  # one tick per year, at the left edge of its bar
plt.xticks(rotation=90)  # rotate the year labels on the x axis
ax.set(xlabel="Year of Release",
       ylabel="Number of Shows/Movies")
ax.set_title("1940 to 1980", fontsize=20)
plt.show()

In [None]:
# Content analysis based on release year, 1981 to 2020 (one bar per year).
# NOTE(review): rebinds ry_81_20 from the filtered frame to the full column;
# kept in case later cells rely on it.
ry_81_20 = shows_data["release_year"]
# The last histogram bin is closed on both sides, so the edges must run one past
# the final year; with edges ending at max(ry82) the last two years share a bar.
bins = np.arange(min(ry82), max(ry82) + 2, 1)
fig, ax = plt.subplots(figsize=(30, 10))
# Plot the 1981-2020 series (ry82); the cell previously passed ry_40_80 here.
ax.hist(ry82, bins=bins, edgecolor='black', color='yellow')
ax.set_xticks(bins[:-1])  # one tick per year, at the left edge of its bar
ax.set(xlabel="Year of Release",
       ylabel="Number of Shows/Movies")
ax.set_title("1981 to 2020", fontsize=20)
plt.xticks(rotation=90)  # rotate the year labels on the x axis
plt.show()

## Analysis of the number of countries (`country` feature) against `release_year`

In [None]:
# Pair each title's release year with its country string in a small working frame.
ry = pd.Series(shows_data["release_year"])
cu = pd.Series(shows_data["country"])
year_country_data = pd.concat(
    [ry.rename("release_year"), cu.rename("country")],
    axis=1,
)
year_country_data.head()

In [None]:
# Mark titles that have no country with a sentinel string.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which leaves the
# frame unchanged under copy-on-write.
year_country_data["country"] = year_country_data["country"].fillna("notavailable")

In [None]:
year_country_data.tail()

In [None]:
# Converting object to byte string
# NOTE(review): this bytes conversion is undone by the decode cell further down,
# so the round-trip looks redundant for a column read as text — confirm before
# removing, since the two cells depend on each other.
year_country_data["country"] = year_country_data["country"].astype('|S')
year_country_data.dtypes

In [None]:
year_country_data.head() # byte strings

In [None]:
# Using decode to remove the b'' from converted byte string
# Turns the byte strings produced by the astype('|S') cell back into text.
year_country_data["country"] = year_country_data["country"].str.decode('utf-8')

In [None]:
year_country_data.tail()

In [None]:
year_country_data[year_country_data["country"] == "notavailable"]

**Note:** *Out of 7784 shows/movies, 507 are not associated with any country*

In [None]:
# Removing content that is not associated with any country.
# .copy() makes the filtered result an independent frame, so adding a column to
# it in a later cell does not trigger SettingWithCopyWarning.
year_country_data = year_country_data[year_country_data["country"] != "notavailable"].copy()
year_country_data.info()
year_country_data.to_csv('ry-country-removed-missing.csv')

In [None]:
# Add the number of countries per title as an extra column: the country string
# is comma-separated, so the count is the length of the split list.
country_lists = year_country_data["country"].str.split(",")
year_country_data["No of Countries"] = country_lists.str.len()
year_country_data.head()

In [None]:
# Splitting the data at the midpoint of the release-year range.
# Series.between is inclusive on both ends, so 1980 and 2020 are kept;
# np.arange(1940, 1980) stops at 1979 (and np.arange(1981, 2020) at 2019),
# which silently dropped those two years.
ycd_40_80 = year_country_data[year_country_data["release_year"].between(1940, 1980)]
ycd_81_20 = year_country_data[year_country_data["release_year"].between(1981, 2020)]

In [None]:
# Confirming splitted data
ycd_40_80.info()
ycd_81_20.info()

In [None]:
# Bar charts of countries-per-title against release year: 1940-1980 on the top
# panel, 1981-2020 on the bottom panel, sharing one y scale.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(30, 20), sharey=True)
ax0, ax1 = axes
for panel, era in zip((ax0, ax1), (ycd_40_80, ycd_81_20)):
    panel.bar(era["release_year"], era["No of Countries"])
    panel.set_ylabel("No of Countries", fontsize=20)
# The x label goes on the bottom panel only.
ax1.set_xlabel("Year of Release", fontsize=20)
ax0.set_title("Year of Release v No of Countries", fontsize=30)
plt.show()

**Note:** *The graph shows that the peak number of countries per title rose from 5 in the first half (1940–1980) to 12 in the second half (1981–2020)*

There is no apparent effect of the number of movies released per year on the number of countries; the increase might be purely due to technological advancement in later years.

## Analysis based on the date added to netflix

In [None]:
# Re-read the raw CSV so this section starts from an unmodified frame.
shows_data = pd.read_csv("../input/netflix-shows/netflix_titles.csv")
shows_data.info()

In [None]:
shows_data["date_added"].isna().sum()

In [None]:
# Removing rows (titles) that have no date_added value.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not write into a slice (SettingWithCopyWarning).
shows_data = shows_data[shows_data["date_added"].notna()].copy()
# Sanity check: should now be 0.
shows_data["date_added"].isna().sum()

In [None]:
# Saving the nan removed csv
# Only the date_added column (plus the index) is written, not the whole frame.
shows_data["date_added"].to_csv("date-added-removed-nan.csv")

In [None]:
shows_data.head()

In [None]:
# Convert date_added to fixed-width byte strings.
# NOTE(review): the next cell decodes these straight back to text, so the
# round-trip looks redundant — confirm before removing; the two cells are coupled.
shows_data["date_added"] = shows_data["date_added"].astype("|S")
shows_data.info()

In [None]:
# Decode the byte strings from the previous cell back into text.
shows_data["date_added"] = shows_data["date_added"].str.decode('utf-8')
shows_data.info()

In [None]:
# Convert month names in date_added to zero-padded month numbers followed by a
# comma, e.g. "August 14, 2020" -> "08, 14, 2020".
MONTH_NUMBERS = {
    "January": "01,", "February": "02,", "March": "03,", "April": "04,",
    "May": "05,", "June": "06,", "July": "07,", "August": "08,",
    "September": "09,", "October": "10,", "November": "11,", "December": "12,",
}
# Strip surrounding whitespace first: a value with a leading or trailing space
# would not match the strict "%m/%d/%Y" format used when parsing later and would
# be coerced to NaT.
date_added_text = shows_data["date_added"].str.strip()
# No month name is a substring of another, so replacement order does not matter.
for month_name, month_number in MONTH_NUMBERS.items():
    date_added_text = date_added_text.str.replace(month_name, month_number, regex=False)
shows_data["date_added"] = date_added_text
shows_data["date_added"].head()

In [None]:
shows_data["date_added"]

In [None]:
# Replace each ", " separator with "/" so the value reads month/day/year.
shows_data["date_added"] = shows_data["date_added"].str.replace(", ", "/")
shows_data.head()

In [None]:
# converted date added string to datetime
# errors='coerce' turns any value that does not match "%m/%d/%Y" into NaT
# instead of raising.
shows_data["date_added"] = pd.to_datetime(shows_data["date_added"], format="%m/%d/%Y", errors='coerce')

In [None]:
# Break the parsed date into separate day / month / year columns.
for part in ("day", "month", "year"):
    shows_data[f"{part}_added"] = getattr(shows_data["date_added"].dt, part)
shows_data.head()

In [None]:
# Keep only rows whose date parsed successfully (day_added is not NaN).
has_day = shows_data["day_added"].notna()
shows_data = shows_data[has_day]
shows_data["day_added"].isna().sum()

In [None]:
# With the NaN rows removed, the date parts can be stored as whole numbers.
for date_part in ("day_added", "month_added", "year_added"):
    shows_data[date_part] = shows_data[date_part].astype('int64')

In [None]:
shows_data["year_added"].hist()

In [None]:
shows_data["year_added"].describe()

In [None]:
# Histogram of titles per year added to Netflix, one bar per year.
# The style has to be selected before the figure is created to affect it. The
# seaborn style sheets were renamed in newer matplotlib releases, so pick
# whichever name this installation provides and skip styling if neither exists.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    None,
)
if whitegrid_style is not None:
    plt.style.use(whitegrid_style)

yd = shows_data["year_added"]
# The last histogram bin is closed on both sides, so the edges run one past the
# final year; otherwise the last two years would share a bar.
bins = np.arange(np.min(yd), np.max(yd) + 2, 1)
fig, ax = plt.subplots(figsize=(30,10))
# edgecolor belongs on the bars; passed to plt.subplots it only sets the figure edge.
ax.hist(yd, bins=bins, edgecolor='black')
ax.set_xticks(bins[:-1])  # one tick per year, at the left edge of its bar
ax.set_xlabel("Year added", fontsize=20)
ax.set_ylabel("No of movies and shows", fontsize=20)
ax.set_title("No of movies/shows with respect to year added", fontsize=30)
plt.show()

In [None]:
shows_data["year_added"].unique(), shows_data["month_added"].unique()

In [None]:
# Histogram of titles per month added (all years pooled), one bar per month.
month_added = shows_data["month_added"]
# The last histogram bin is closed on both sides, so the edges run one past the
# largest month; with edges min..max the last two months would share a bar.
bins = np.arange(np.min(month_added), np.max(month_added) + 2, 1)
fig, ax = plt.subplots(figsize=(30,20))
ax.hist(month_added, bins=bins)
ax.set_xticks(bins[:-1])  # one tick per month, at the left edge of its bar
ax.set_xlabel("Month", fontsize=20)
ax.set_ylabel("Number of Movies/Shows added", fontsize=20)
ax.set_title("Monthwise distrubtion of content addition across years", fontsize=30)
plt.show();

In [None]:
shows_data["month_added"].describe()

In [None]:
shows_data.info()

## Type of content added analysis with respect to year added

In [None]:
shows_data["type"].unique()

In [None]:
# Encode the type feature as "0" (TV Show) / "1" (Movie).
# The values stay strings, not numbers. The text is replaced directly instead of
# going through a bytes ('|S') round-trip, which raises on non-ASCII text and
# added nothing here.
shows_data["type"] = (
    shows_data["type"]
    .astype(str)
    .str.replace("TV Show", "0", regex=False)
    .str.replace("Movie", "1", regex=False)
)

In [None]:
# Persist the frame, then preview it. head() goes last so the notebook renders
# it; as a non-final expression its result was discarded.
shows_data.to_csv("type-numerical-feature.csv")
shows_data.head()

In [None]:
shows_data["year_added"].unique()

In [None]:
# Working frame holding just the year a title was added and its encoded type.
shows_data_yd = pd.Series(shows_data["year_added"])
shows_data_type = pd.Series(shows_data["type"])
shows_data_type_year = pd.DataFrame(
    dict(year_added=shows_data_yd, type=shows_data_type)
)

In [None]:
# Number of titles per (year_added, type) pair as a bar chart.
# Each bar is one (year, type) group and its height is the group size, so the
# y axis is a count and the x axis carries both the year and the type code.
titles_per_year_type = shows_data_type_year.groupby(['year_added', 'type']).size()
titles_per_year_type.plot(kind="bar",
          figsize=(30,20),
          xlabel = "Year Added, Type (0 = TV Show, 1 = Movie)",
          ylabel = "Number of Movies/TV shows",
          title="Content classified distribution",
          fontsize=20);

## Analysis of number of `country`  and `No of Movies/Tv shows` feature with `year_added`

In [None]:
year_country_data.info()

In [None]:
# Combine year added, country count, type and title into one frame.
# The Series come from different frames and are aligned on their index, so a
# row missing from one source gets NaN in that column (handled in later cells).
yd_noc = pd.DataFrame({"year_added": shows_data["year_added"], 
                       "No Of countries": year_country_data["No of Countries"],
                       "type": shows_data_type_year["type"],
                       "title": shows_data["title"]
                      })
yd_noc.head()

In [None]:
yd_noc.isna().sum()

In [None]:
# Discard rows that have no year_added value.
yd_noc = yd_noc.dropna(subset=["year_added"])

In [None]:
# Filling no of countries nan values
# NaN here means the title has no row in year_country_data; it is recorded as
# 0 countries.
yd_noc["No Of countries"] = yd_noc["No Of countries"].fillna(0)

In [None]:
# Both columns are NaN-free at this point, so store them as whole numbers.
for int_column in ("year_added", "No Of countries"):
    yd_noc[int_column] = yd_noc[int_column].astype('int64')

In [None]:
yd_noc.info()

In [None]:
yd_noc.head()

In [None]:
# Bar chart of non-null counts per (year_added, No Of countries) group; count()
# yields one series per remaining column (type and title).
yd_noc.groupby(['year_added', 'No Of countries']).count().plot(kind="bar",
                                                              figsize=(30,20))

In [None]:
# Number of titles in each (year_added, No Of countries) group, as a flat frame
# with a "count" column.
types_by_group = yd_noc.groupby(['year_added', 'No Of countries'])["type"]
yd_noc_count = types_by_group.count().reset_index(name="count")

In [None]:
yd_noc_count.head()

In [None]:
# Scatter of group size against number of countries: one point per
# (year_added, No Of countries) group, coloured by the year added.
fig, ax = plt.subplots(figsize=(30,20))
scatter = ax.scatter(
        x = yd_noc_count["No Of countries"],
        y = yd_noc_count["count"],
        c = yd_noc_count["year_added"],
        cmap='summer',
        s=20**2)

# Dashed horizontal line at the mean group size
ax.axhline(y=yd_noc_count["count"].mean(),
            color='b',
            linestyle='--',
            label="Average")

# One x tick per country count
noc = yd_noc_count["No Of countries"]
plt.xticks(np.arange(np.min(noc), np.max(noc) + 1, 1))

# y ticks spaced one standard deviation of the group sizes apart
count = yd_noc_count["count"]
inc = np.std(yd_noc_count["count"])
plt.yticks(np.arange(np.min(count), np.max(count) + inc, inc))

# Annotate every point with the year(s) of the group(s) located at that (x, y)
for x,y in zip(noc,count):
    label_unf = yd_noc_count[(yd_noc_count["No Of countries"] == x) & (yd_noc_count["count"] == y)]
    label_year = label_unf["year_added"].values
    label = f"({label_year})"

    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='center')

# Legend maps the colour scale back to the year added
ax.legend(*scatter.legend_elements(), title="Year", loc=0, fontsize="xx-large")

# x is the number of countries, y is the number of titles in the group.
# The second call was set_xlabel, which overwrote the x label and left the
# y axis unlabelled.
ax.set_xlabel("No of Countries", fontsize=20)
ax.set_ylabel("No of Movies/TV shows", fontsize=20)
ax.set_title("No of Countries v No of Movies/TV shows", fontsize=25)

plt.show()

In [None]:
# Splitting the data for better plotting: groups of at most COUNT_SPLIT titles
# are drawn here, larger groups separately.
# The lower split uses <= so a group of exactly COUNT_SPLIT titles is not
# dropped from both halves (the original `<` / `>` pair excluded it).
COUNT_SPLIT = 677
yd_noc_count_1 = yd_noc_count[yd_noc_count["count"] <= COUNT_SPLIT]
yd_noc_count_2 = yd_noc_count[yd_noc_count["count"] > COUNT_SPLIT]

# Setting up the fig
fig, ax = plt.subplots(figsize=(30,20))
scatter = ax.scatter(
        x = yd_noc_count_1["No Of countries"],
        y = yd_noc_count_1["count"],
        c = yd_noc_count_1["year_added"],
        cmap='summer',
        s=20**2)

# Dashed horizontal line at the mean group size of this split
ax.axhline(y=yd_noc_count_1["count"].mean(),
            color='b',
            linestyle='--',
            label="Average")

# Legend maps the colour scale back to the year added
ax.legend(*scatter.legend_elements(), title="Year", loc=0, fontsize="xx-large")

# One x tick per country count
noc = yd_noc_count_1["No Of countries"]
plt.xticks(np.arange(np.min(noc), np.max(noc) + 1, 1))

# y ticks every 25 titles
count = yd_noc_count_1["count"]
plt.yticks(np.arange(np.min(count), np.max(count) + 25, 25))

# Annotate every point with the year(s) of the group(s) at that (x, y) and the count
for x,y in zip(noc,count):
    label_unf = yd_noc_count_1[(yd_noc_count_1["No Of countries"] == x) & (yd_noc_count_1["count"] == y)]
    label_year = label_unf["year_added"].values
    label = f"({label_year}, {y})"

    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='right')

# x is the number of countries, y is the number of titles in the group.
# The second call was set_xlabel, which overwrote the x label and left the
# y axis unlabelled.
ax.set_xlabel("No of Countries", fontsize=20)
ax.set_ylabel("No of Movies/TV shows", fontsize=20)
ax.set_title("No of Countries v No of Movies/TV shows", fontsize=25)

plt.show()

In [None]:
# Splitting the data for better plotting: only groups with more than 677 titles
yd_noc_count_2 = yd_noc_count[yd_noc_count["count"] > 677]

# Setting up the fig
fig, ax = plt.subplots(figsize=(20,10))
scatter = ax.scatter(
        x = yd_noc_count_2["No Of countries"],
        y = yd_noc_count_2["count"],
        c = yd_noc_count_2["year_added"],
        cmap='summer',
        s=20**2)

# Dashed horizontal line at the mean group size of this split
ax.axhline(y=yd_noc_count_2["count"].mean(),
            color='b',
            linestyle='--',
            label="Average")

# Legend maps the colour scale back to the year added
ax.legend(*scatter.legend_elements(), title="Year", loc=0, fontsize="xx-large")

# One x tick per country count
noc = yd_noc_count_2["No Of countries"]
plt.xticks(np.arange(np.min(noc), np.max(noc) + 1, 1))

# y ticks every 250 titles
count = yd_noc_count_2["count"]
plt.yticks(np.arange(np.min(count), np.max(count) + 250, 250))

# Annotate every point with the year(s) of the group(s) located at that (x, y)
for x,y in zip(noc,count):
    label_unf = yd_noc_count_2[(yd_noc_count_2["No Of countries"] == x) & (yd_noc_count_2["count"] == y)]
    label_year = label_unf["year_added"].values
    label = f"({label_year})"

    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='right')

# Axis labels and title, matching the other two scatter plots; this figure
# previously had none.
ax.set_xlabel("No of Countries", fontsize=20)
ax.set_ylabel("No of Movies/TV shows", fontsize=20)
ax.set_title("No of Countries v No of Movies/TV shows", fontsize=25)

plt.show()