# Importing Basic Libraries

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Loading and Reading the Dataset

In [None]:
# Load the Netflix titles CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df= pd.read_csv("../input/netflix-shows/netflix_titles.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the raw dataset.
df.shape

In [None]:
df.isnull().sum() # Number of missing values in each column

The `director`, `cast`, `country`, `date_added` and `rating` columns have missing values.

# Handling Missing Values

In the above data:

I chose to drop the 'director' and 'cast' columns completely, as they have a high volume of missing values and dropping them will not affect my visualizations.


In [None]:
# Remove the 'director' column from df in place.
df.drop(columns=["director"], inplace=True)

In [None]:
# Remove the 'cast' column from df in place.
df.drop(columns=["cast"], inplace=True)

In [None]:
# Preview the frame again to confirm the dropped columns are gone.
df.head()

I chose to fill all the missing values in the "country" column with its mode (the most frequent value).

In [None]:
# Most frequent value(s) of the 'country' column. mode() returns a Series
# (not a scalar) because several values can tie for most frequent.
country_mode = df["country"].mode()

In [None]:
# Display the computed mode so the fill value used below can be checked.
country_mode

In [None]:
# Fill missing countries with the most frequent value computed from the data
# itself instead of a hard-coded literal, so the cell stays correct if the
# dataset changes.
# The result is assigned back to the column: calling an inplace method on
# df['country'] is chained assignment, which newer pandas versions warn about
# and which no longer updates df once Copy-on-Write is enabled.
country_mode_values = df["country"].mode()
if not country_mode_values.empty:  # empty only when every value is missing
    df["country"] = df["country"].fillna(country_mode_values.iloc[0])

In [None]:
# Number of missing values left in 'country' after the fill above.
df["country"].isnull().sum()

I choose to drop the rows with missing values in "date_added" and "rating" columns because they are very few.

In [None]:
# Drop, in place, every row that still contains at least one missing value
# in any column.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-count missing values per column to verify the cleaning steps above.
df.isnull().sum()

Now our data is cleaned with no missing values

In [None]:
df.info() # Inspect the dtype and non-null count of each column

The Dtype of the column 'date_added' is object, converting it into datetime format.

In [None]:
# Parse 'date_added' into real datetimes. Surrounding whitespace is stripped
# first: a value with a stray leading/trailing space does not match the format
# pandas infers from the other rows, which can make the conversion fail.
df["date_added"] = pd.to_datetime(df["date_added"].str.strip())

# Derive the calendar parts used by the later analysis.
df["month_added"] = df["date_added"].dt.month
df["month_name_added"] = df["date_added"].dt.month_name()
df["year_added"] = df["date_added"].dt.year

# Dropping the column 'date_added' as we have separate columns for
# 'year_added' and 'month_added'.
df.drop(columns=["date_added"], inplace=True)

In [None]:
# Check the dtypes again now that the date-derived columns have been added.
df.info()

In [None]:
# Preview the frame with the new month/year columns.
df.head()

# Content type on Netflix:

In [None]:
# Compute the counts once and take BOTH the wedge sizes and the labels from
# the same Series. Sorting the values while reading the labels from a separate,
# unsorted value_counts() call would attach each label to the wrong wedge.
type_counts = df['type'].value_counts().sort_values()

# Pull out only the first (smallest) wedge, whatever the number of categories.
explode = [0.05 if i == 0 else 0 for i in range(len(type_counts))]

plt.figure(figsize=(10,5))
plt.pie(type_counts, labels=type_counts.index, explode=explode,
        autopct='%1.2f%%', colors=['Green','grey'])
plt.title('Content type on Netflix')
plt.show()

So 69.14% of the content on Netflix is Movies and 30.86% is TV Shows.

In [None]:
# List the distinct raw values of the 'country' column.
df["country"].unique()

# Top 20 countries producing the most content:

Some titles were produced in several countries, and we have to count those too. So we split those rows to get the individual countries.

In [None]:
from collections import Counter

country_data = df['country']

# Split every comma-separated entry into individual country names and trim
# all surrounding whitespace from each name. Missing values are skipped.
country_names = (
    name.strip()
    for entry in country_data.dropna()
    for name in entry.split(',')
)

# Count each country once per title it appears in. Empty names (produced by an
# entry that starts or ends with a comma) are ignored so that no blank
# "country" shows up in the ranking. Result: Series indexed by country name,
# sorted from most to least frequent.
country_count = pd.Series(
    dict(Counter(name for name in country_names if name))
).sort_values(ascending=False)

In [None]:
# Keep the first 20 entries of the (already sorted) country counts.
top20country = country_count.iloc[:20]

In [None]:
# Display the 20 selected countries with their counts.
top20country

In [None]:
# Bar chart of the 20 countries with the most titles.
plt.figure(figsize=(12, 6))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=top20country.index, y=top20country.values)
plt.xticks(rotation=90)  # keep the 20 country names from overlapping
plt.xlabel('Country')
plt.ylabel('Number of titles')
plt.title('Top 20 countries by number of titles')
plt.show()

# To Be Continued......
# Happy Learning