## In this notebook you will find techniques for: 

#### 1. Gathering Data
#### 2. EDA (using visuals) 


### Getting the data ready 

In [None]:
# importing libraries
import os # to work with dirs and paths 
import numpy as np #data manipulation 
import pandas as pd #reading, storing data 

#importing visualization libraries 
import matplotlib.pyplot as plt 
import seaborn as sns # i feel plots from seaborn as more aesthetic 

In [None]:
# Show the current working directory, i.e. the folder that relative paths are resolved against
# (handy for getting oriented in a Kaggle-hosted notebook)
os.getcwd()

In [None]:
# Print the full path of every file found under the Kaggle input directory

input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for file_path in input_file_paths:
    print(file_path)

In [None]:
# Load the Netflix titles CSV from the input directory into a DataFrame
netflix_csv_path = "/kaggle/input/netflix-shows/netflix_titles.csv"
netflix_data = pd.read_csv(netflix_csv_path)

In [None]:
# Preview the first 10 rows of the dataframe to get a feel for the columns and values
display(netflix_data.head(10))

In [None]:
# Print a structural summary of the dataframe: number of rows, column names,
# non-null count per column and each column's data type
netflix_data.info()

#### You will notice that 9 of the 12 variables have no null values. For the remaining columns, we can either drop the rows containing nulls or keep them, depending on each column's null percentage.

In [None]:
# Percentage of missing values in each column, rounded to 2 decimals.
# `isnull` is only an alias of `isna`, so OR-ing the two masks was redundant.

(netflix_data.isna().sum() * 100 / len(netflix_data)).round(2)

In [None]:
# Summary statistics with pandas' default column selection (the numerical columns)
netflix_data.describe()

In [None]:
# Summary statistics for the object-dtype (string) columns, which play a role
# similar to factors in R.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
netflix_data.describe(include=object)

In [None]:
# Number of distinct values in each column, to understand the level (granularity) of the data
netflix_data.nunique()

## EDA

In [None]:
# Split the comma-separated `country` field into one column per listed country.
# The new columns are named "0", "1", "2", ... (as strings) because later cells
# look them up by those names.
# The column names are derived from the split result instead of a hard-coded list
# of 12, so the assignment does not raise a length-mismatch error when the dataset
# contains a different maximum number of countries per title.

country_parts = netflix_data["country"].str.split(",", expand=True)
country_parts.columns = [str(position) for position in range(country_parts.shape[1])]
netflix_data[list(country_parts.columns)] = country_parts

In [None]:
# Show two example titles whose split column "10" is populated,
# i.e. rows that list at least eleven countries
has_column_10 = netflix_data["10"].notna()
display(netflix_data.loc[has_column_10].head(2))

In [None]:
# distribution of shows within tv and movies
# (disabled quick pandas alternative; note that `kind` must be the string "bar", not a list)
# netflix_data["type"].value_counts().plot(kind="bar")

In [None]:
# Count titles per type (movie vs. TV show) and draw the counts as a bar chart
type_df = (
    netflix_data
    .groupby("type", as_index=False)
    .agg({"show_id": "count"})
)
sns.barplot(data=type_df, x="type", y="show_id")

In [None]:
# Number of titles per release year, split by type, drawn as grouped bars
trend_release_year = (
    netflix_data
    .groupby(["release_year", "type"], as_index=False)
    .agg({"show_id": "count"})
)

plt.figure(figsize=(20, 12))
sns.barplot(data=trend_release_year, x="release_year", y="show_id", hue="type")

In [None]:
# Derive the year each title was added to Netflix, then count titles per (year, type).
# Surrounding whitespace is stripped before parsing: pandas >= 2.0 infers one date
# format from the first value and raises ValueError on any entry that does not match
# it exactly, so a single value with a stray leading space would abort the cell.
# Missing dates stay NaN/NaT and are left out of the groupby counts.
date_added_clean = netflix_data["date_added"].str.strip()
netflix_data["year_added"] = pd.to_datetime(date_added_clean).dt.year
trend_mov_tv = netflix_data.groupby(["year_added", "type"], as_index=False).agg({"show_id": "count"})

In [None]:
# Grouped bar chart of titles added per year, one bar per type
plt.figure(figsize=(15, 8))
sns.barplot(
    data=trend_mov_tv,
    x="year_added",
    y="show_id",
    hue="type",
    color="purple",
)

### You will notice that the number of movies added in 2020 has gone down compared to 2019, whereas the number of TV shows shows an upward trend.

In [None]:
# Titles per first-listed country (split column "0");
# the displayed table keeps only countries with more than 2 titles, largest first
country_cnt = netflix_data.groupby("0", as_index=False).agg({"show_id": "count"})

frequent_first_countries = country_cnt.loc[country_cnt["show_id"] > 2]
frequent_first_countries.sort_values("show_id", ascending=False).reset_index(drop=True)

In [None]:
# Titles per second-listed country (split column "1").
# The country names are stripped of surrounding whitespace BEFORE grouping: splitting
# on "," leaves a leading space on these values, and cleaning only after the groupby
# could leave the same country on several rows (one per whitespace variant), which
# would then produce duplicate keys in a later merge on this column.
second_country = netflix_data[["1", "show_id"]].copy()
second_country["1"] = second_country["1"].str.strip()

country2_cnt = second_country.groupby("1", as_index=False).agg({"show_id": "count"})

# Display only countries with more than 2 titles, largest first
country2_cnt[country2_cnt["show_id"] > 2].sort_values("show_id", ascending=False).reset_index(drop=True)

In [None]:
# Outer-join the first-country and second-country counts on the country name, so that
# countries appearing in only one of the two tables are kept (with a missing count
# on the other side)
country12 = pd.merge(
    country_cnt,
    country2_cnt,
    how="outer",
    left_on="0",
    right_on="1",
)
country12

In [None]:
# After the outer merge, a country found in only one table has a missing count on the
# other side: treat that as 0 and restore the integer dtype.
# Only the two count columns are filled; a blanket fillna(0) would also overwrite the
# missing country names in columns "0"/"1" with a numeric 0.
count_columns = ["show_id_x", "show_id_y"]
country12[count_columns] = country12[count_columns].fillna(0).astype(int)

# Total titles per country across the first and second listed positions
country12["total_shows_count"] = country12["show_id_x"] + country12["show_id_y"]

In [None]:
# Countries with more than 2 titles in total, most frequent first
(
    country12
    .loc[lambda frame: frame["total_shows_count"] > 2]
    .sort_values("total_shows_count", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# List the distinct raw values stored in the `duration` column
netflix_data["duration"].unique()