# Importing libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
%matplotlib inline

# Importing dataset

In [None]:
# Read the Netflix titles CSV, using the show_id column as the row index,
# then preview the first rows.
csv_path = "../input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(csv_path, index_col='show_id')
df.head()

# Some extra information about the dataset

In [None]:
# Display the dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

#### So most of the columns are of string datatype.

In [None]:
# Summarise the object-dtype (string) columns: count, unique, top and freq.
# The builtin `object` is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on access).
df.describe(include=[object])

### So at this point we have the following information about the data.
#### 1. Most titles added were movies.
#### 2. Most of the movies/shows were directed by Raul Campos and Jan Suter.
#### 3. The most common actor is David Attenborough.
#### 4. Most shows are from the USA.
#### 5. Most shows/movies were added on 1 Jan, 2020.
#### 6. Most shows/movies have a TV-MA rating.
#### 7. Most shows have one season.
#### 8. Most shows are documentaries.

# Checking for missing values

In [None]:
# Draw missingno's per-column bar chart to see which columns have gaps.
msno.bar(df)

#### As we can see, the director field has the most missing values, followed by cast.
#### Some of the dates on which a movie/show was added are also missing.

# Checking for duplicate values

In [None]:
# Flag rows that repeat an earlier row in every column, then count them.
# Summing the boolean mask gives the number of duplicated rows directly;
# summing df[dup] would instead add up (or concatenate) the column values
# of those rows, which is not a duplicate count.
dup = df.duplicated()
dup.sum()

#### So there are no duplicate values. 

# Dealing with missing values

#### Let's remove the rows with missing date_added values.

In [None]:
# Discard, in place, every row whose date_added value is missing.
required_columns = ['date_added']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Preview the first rows of the frame after the rows above were dropped.
df.head()

# Exploratory data analysis

#### Let's check which type of title (movie or TV show) is more common on Netflix.

In [None]:
# Tally how many titles fall under each value of the `type` column.
type_column = df['type']
result = type_column.value_counts()
result

In [None]:
# Vertical bar chart of the counts held in `result`, one bar per index label.
plt.style.use("fivethirtyeight")
plt.figure(figsize=(5, 5))
labels, counts = result.index, result.values
plt.bar(labels, counts)
plt.ylabel("No. of count")
plt.show()

#### Most of the titles are movies.

#### Which director has directed the most shows/movies? 

In [None]:
# Ten most frequent values of the `director` column, re-sorted ascending.
director_counts = df['director'].value_counts()
result = director_counts.iloc[:10].sort_values()
result

In [None]:
# Horizontal bar chart of (at most) the first ten entries of `result`.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old names, so use whichever name this installation has.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)
plt.figure(figsize=(10, 5))
plt.barh(result.index[:10], result.values[:10])
plt.title("Most titles by a director")
# One tick per integer from 0 to 18 so individual counts are readable.
plt.xticks(list(range(19)))
plt.show()

#### Raul Campos and Jan Suter have directed the largest number of shows/movies.

#### Which actor has acted in most shows/movies on netflix?

In [None]:
# Count how many titles each actor appears in and keep the ten most frequent,
# sorted ascending.
# `cast` is a comma-separated string, so splitting on ',' leaves a leading
# space on every name except the first. Strip the names so that "Name" and
# " Name" are tallied as the same person instead of as two separate keys.
actor_names = df['cast'].str.split(',').explode().str.strip()
result = actor_names.value_counts().head(10).sort_values()
result

In [None]:
# Horizontal bar chart of (at most) the first ten entries of `result`.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old names, so use whichever name this installation has.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)
plt.figure(figsize=(10, 5))
plt.barh(result.index[:10], result.values[:10])
plt.title("Most common actor")
plt.show()

#### Anupam Kher has acted in the largest number of shows/movies.

#### Which country has the largest number of shows/movies on Netflix?

In [None]:
# Ten most frequent values of the `country` column, re-sorted ascending.
country_counts = df['country'].value_counts()
result = country_counts.iloc[:10].sort_values()
result

In [None]:
# Horizontal bar chart of (at most) the first ten entries of `result`.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old names, so use whichever name this installation has.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)
plt.figure(figsize=(5, 5))
plt.barh(result.index[:10], result.values[:10])
plt.title("Countries with most shows on Netflix")
plt.show()

#### Most shows/movies are from USA.

#### Rating

In [None]:
# Frequency of each value in the `rating` column, ordered from least to most.
rating_counts = df['rating'].value_counts()
result = rating_counts.sort_values()
result

In [None]:
# Horizontal bar chart with one bar per entry of `result`.
# Matplotlib 3.6 renamed "seaborn-pastel" to "seaborn-v0_8-pastel" and 3.8
# removed the old name, so use whichever name this installation has.
style_name = (
    "seaborn-v0_8-pastel"
    if "seaborn-v0_8-pastel" in plt.style.available
    else "seaborn-pastel"
)
plt.style.use(style_name)
plt.figure(figsize=(5, 5))
plt.barh(result.index, result.values)
plt.title("Ratings")
plt.show()

#### Most shows are of TV-MA rating.

#### Show duration

In [None]:
# Four most frequent values of the `duration` column, most common first.
duration_counts = df['duration'].value_counts()
result = duration_counts.iloc[:4]
result

In [None]:
# Pie chart of the entries in `result`, labelled with their share in percent.
# Matplotlib 3.6 renamed "seaborn-pastel" to "seaborn-v0_8-pastel" and 3.8
# removed the old name, so use whichever name this installation has.
style_name = (
    "seaborn-v0_8-pastel"
    if "seaborn-v0_8-pastel" in plt.style.available
    else "seaborn-pastel"
)
plt.style.use(style_name)
plt.pie(
    result.values,
    labels=result.index,
    startangle=90,
    autopct='%1.1f%%',
    shadow=True,
)
plt.show()

#### Most TV shows have one season.

### Tags

In [None]:
# Count how often each genre tag occurs and keep the ten most frequent,
# sorted ascending.
# `listed_in` is a comma-separated string, so splitting on ',' leaves a
# leading space on every tag except the first. Strip the tags so that the
# same genre is not tallied under two different keys.
genre_tags = df['listed_in'].str.split(',').explode().str.strip()
result = genre_tags.value_counts().head(10).sort_values()
result

In [None]:
# Horizontal bar chart of (at most) the first ten entries of `result`.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old names, so use whichever name this installation has.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)
plt.figure(figsize=(5, 5))
plt.barh(result.index[:10], result.values[:10])
plt.show()

#### Most shows are tagged as International movies.