In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the Netflix titles CSV into `df`; every later cell reads from this frame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../input/hackathon1/netflix_titles.csv")
# Preview the first rows as the cell's displayed output.
df.head()

In [None]:
# Number of titles per value of the `type` column ("Movie" / "TV Show").
Y = df["type"].value_counts()

# Build the wedge labels from the counted index instead of hard-coding their
# order: value_counts() sorts by frequency, so a fixed label list would attach
# the wrong name to each wedge whenever that ordering differs from the guess.
TYPE_LABELS = {"Movie": "Movies"}
plt.pie(Y, labels=[TYPE_LABELS.get(kind, kind) for kind in Y.index])
plt.title("Movies vs TV show ")
plt.show()


In [None]:
# Number of titles per value of the `rating` column, most frequent first.
Y = df["rating"].value_counts()

plt.figure(figsize=(14, 10))
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and raises a TypeError for them from 0.12 onwards.
sns.barplot(x=Y.index, y=Y.values)
plt.xlabel("Ratings")
plt.ylabel("Number of movies/shows")
plt.title("Comparison of ratings")
plt.show()


In [None]:
# Ten most frequent values of the `country` column (value_counts() is already
# sorted by descending count, so head(10) is the top ten).
Y = df["country"].value_counts().head(10)

plt.figure(figsize=(14, 10))
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and raises a TypeError for them from 0.12 onwards.
sns.barplot(x=Y.index, y=Y.values)
plt.title("Top 10 countries with maximum productions")
plt.xlabel("Countries")
plt.ylabel("Number of movies/shows")
plt.show()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def relation_heatmap(df, title):
    """Plot a lower-triangle correlation heatmap of genre co-occurrence.

    Each ``listed_in`` entry is split on commas into genre labels, the labels
    are one-hot encoded per row with ``MultiLabelBinarizer``, and the pairwise
    correlation between the resulting indicator columns is drawn as a heatmap.

    Args:
        df: DataFrame with a ``listed_in`` column of comma-separated genre
            strings. Rows where ``listed_in`` is missing are skipped. The
            frame itself is not modified.
        title: Name of the subset being analysed; used in the printed summary
            and in the figure title.

    Returns:
        None. Prints the number of distinct genres and shows the figure.
    """
    # Work on a local Series rather than writing a 'genre' column back onto
    # the caller's frame, so the function has no hidden effect on its input
    # and can safely be given a filtered slice.
    genres = df['listed_in'].dropna().apply(
        lambda entry: [label.strip() for label in entry.split(',') if label.strip()]
    )

    mlb = MultiLabelBinarizer()
    if genres.empty:
        # Nothing to encode or correlate; report and stop instead of failing
        # while drawing an empty matrix.
        print(f"There are 0 types in the Netflix {title} Dataset")
        return

    indicators = pd.DataFrame(
        mlb.fit_transform(genres), columns=mlb.classes_, index=genres.index
    )
    # classes_ holds the distinct labels seen during fitting, so its length is
    # the number of genres.
    print(f"There are {len(mlb.classes_)} types in the Netflix {title} Dataset")

    corr = indicators.corr()
    # Mask the upper triangle and the diagonal: the matrix is symmetric, so
    # only the lower triangle carries information. The builtin `bool` is used
    # because the `np.bool` alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(15, 14))
    sns.heatmap(
        corr,
        mask=mask,
        cmap="coolwarm",
        vmax=.5,
        vmin=-.5,
        center=0,
        square=True,
        linewidths=.7,
        cbar_kws={"shrink": 0.6},
        ax=ax,
    )
    ax.set_title(f"Listed in genre analysis for {title}")
    plt.show()

In [None]:
# Restrict the analysis to TV shows so the heatmap matches its title; passing
# the full frame would produce the same plot as the 'Movies' call. A copy is
# passed so the helper never operates on a view of `df`.
relation_heatmap(df.loc[df["type"] == "TV Show"].copy(), 'TV Show')


In [None]:
# Restrict the analysis to movies so the heatmap matches its title; passing
# the full frame would produce the same plot as the 'TV Show' call. A copy is
# passed so the helper never operates on a view of `df`.
relation_heatmap(df.loc[df["type"] == "Movie"].copy(), 'Movies')

In [None]:
from collections import Counter

# Release years to keep: 2008 through 2019 inclusive.
years = [*range(2008, 2020)]

# Movies: count titles per release year, then keep only the years of interest.
movie_rows = df[df["type"] == "Movie"]
movies_counts = movie_rows["release_year"].value_counts()
index_years_mov = movies_counts.index.isin(years)
movies = movies_counts.loc[index_years_mov]

# TV shows: same pipeline on the other content type.
tv_rows = df[df["type"] == "TV Show"]
tv_counts = tv_rows["release_year"].value_counts()
index_years_tv = tv_counts.index.isin(years)
tv_shows = tv_counts.loc[index_years_tv]

In [None]:
# Yearly title counts for movies and TV shows on one set of axes. Both series
# are indexed by release year, which seaborn uses as the x coordinate.
fig, ax = plt.subplots(figsize=(14, 10))
sns.lineplot(data=movies, color="b", label="Movies / year", ax=ax)
sns.lineplot(data=tv_shows, color="c", label="TV Shows / year", ax=ax)
ax.set_title("Release of movies and tv shows every year")
# Label the axes after plotting so seaborn's automatic labels are overridden
# and the figure reads on its own, like the bar charts earlier in the notebook.
ax.set_xlabel("Release year")
ax.set_ylabel("Number of movies/shows")
plt.show()

In [None]:
# Flatten the `cast` column into one list of performer names. Rows without
# cast information are dropped and blank names are skipped *before* ranking:
# previously missing casts were turned into empty strings that could occupy
# one of the ten most_common() slots and were only filtered out afterwards,
# leaving fewer than ten actors.
casts = [
    name.strip()
    for cast_entry in df["cast"].dropna()
    for name in cast_entry.split(", ")
    if name.strip()
]
counter_list = Counter(casts).most_common(10)
most_common_actors = counter_list
# most_common() is ordered by descending count; reverse so the least frequent
# of the top ten comes first.
labels = [name for name, _ in most_common_actors][::-1]
values = [count for _, count in most_common_actors][::-1]

# Five most frequent values of the `director` column, ordered by ascending count.
most_common_directors = df["director"].value_counts().head(5).sort_values(ascending=True)
x = most_common_directors

In [None]:
# Title counts of the most frequent directors (`x` maps director -> count).
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x.index, x.values)
ax.set_title("Top directors")
# Axis labels added so the figure reads on its own, matching the bar charts
# earlier in the notebook.
ax.set_xlabel("Director")
ax.set_ylabel("Number of movies/shows")
plt.show()

In [None]:
# Appearance counts of the most frequent cast members (`labels` are the names,
# `values` the matching counts).
fig, ax = plt.subplots(figsize=(14, 10))
ax.scatter(labels, values)
ax.set_title("Top actors")
# Axis labels added so the figure reads on its own, matching the bar charts
# earlier in the notebook.
ax.set_xlabel("Actor")
ax.set_ylabel("Number of movies/shows")
# Full names are long; tilt them so adjacent tick labels do not overlap.
ax.tick_params(axis="x", rotation=45)
plt.show()