In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Netflix Datenanalyse

Analyse der Film- und Seriendaten von Netflix (Quelle: Flixable).

In [None]:
# Read the Netflix title catalogue from the Kaggle input folder and show the last rows.
csv_path = "../input/netflix-shows/netflix_titles.csv"
netflix_data = pd.read_csv(csv_path)
netflix_data.tail()

In [None]:
# Size of the loaded frame as (rows, columns).
netflix_data.shape

In [None]:
# Selecting one column by name yields a Series.
netflix_data["title"]

In [None]:
# Selecting with a list of names yields a DataFrame with just those columns.
netflix_data[["title", "description"]]

In [None]:
# Boolean mask: keep only the rows whose title is exactly "La casa de papel".
netflix_data[netflix_data["title"]=="La casa de papel"]

In [None]:
# Keep only the rows whose type is "TV Show".
netflix_data[netflix_data["type"]=="TV Show"]

In [None]:
# .loc with a row mask and a label slice: matching rows, columns from "type" to the last column.
netflix_data.loc[netflix_data["title"]=="La casa de papel", "type":]

In [None]:
# NOTE: only the last expression of a cell is rendered; the first three
# selections below are evaluated but not displayed.

# Every title released in 2020
netflix_data.loc[netflix_data["release_year"] == 2020, ["title", "description"]]

# Every movie released in 2020
netflix_data.loc[(netflix_data["release_year"] == 2020) & (netflix_data["type"] == "Movie"), 
                 ["title", "description"]]

# Titles released in 2020 whose country field is exactly "United States"
# (no filter on type, so this is not restricted to movies)
netflix_data.loc[(netflix_data["release_year"] == 2020) & (netflix_data["country"] == "United States"), 
                 ["title", "description"]]

# Titles released in 2020 whose country field differs from "United States"
# (again no filter on type)
netflix_data.loc[(netflix_data["release_year"] == 2020) & (netflix_data["country"] != "United States"), 
                 ["title", "description"]]

In [None]:
# Column overview: dtypes and non-null counts.
netflix_data.info()

In [None]:
# The "duration" column restricted to TV shows.
netflix_data.loc[netflix_data["type"]=="TV Show", "duration"]

In [None]:
# Distinct raw values of the "country" column.
netflix_data["country"].unique()

* `duration` ist ein Objekt (Text) und keine Zahl (int): bei Serien "Season(s)", bei Filmen "min"
* Länder und Genres (`listed_in`) sind keine Einzelwerte, sondern Listen (mehrere Länder, mehrere Genres)
* Titel sind in Originalsprache (Bsp.: Haus des Geldes = La casa de papel)
* Fehlende Werte bei director, cast, country, date_added, rating

In [None]:
import missingno

# Visual overview of where values are missing in each column.
missingno.matrix(netflix_data)

# Daten vorbereiten

In [None]:
# Replace gaps in the free-text columns with an explicit "missing" marker.
# The result is assigned back to the frame on purpose: calling
# fillna(inplace=True) on a selected column acts on an intermediate object,
# which pandas 2.2+ warns about and which has no effect on the frame once
# Copy-on-Write is enabled. Later cells call str methods on these columns
# and would fail on leftover NaN values.
text_columns = ["country", "director", "cast"]
netflix_data[text_columns] = netflix_data[text_columns].fillna("missing")

In [None]:
# Discard rows that lack a value in "date_added" or in "rating".
required_columns = ["date_added", "rating"]
netflix_data.dropna(subset=required_columns, inplace=True)

In [None]:
# Remaining number of missing values per column.
netflix_data.isna().sum()

In [None]:
# Frame size as (rows, columns) at this point in the notebook.
netflix_data.shape

# Visualisierungen

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart with the number of rows per value of "type".
sns.countplot(x="type", data=netflix_data)

In [None]:
# Keep only titles with release_year >= 2000, then count titles per year.
released_since_2000 = netflix_data["release_year"] >= 2000
netflix_data_2000 = netflix_data[released_since_2000]

plt.figure(figsize=(15, 8))
sns.countplot(data=netflix_data_2000, x="release_year")

In [None]:
# Titles per release year (2000 onwards), split by "type".
plt.figure(figsize=(15,8))
sns.countplot(x="release_year", data=netflix_data_2000, hue="type")

In [None]:
# Number of titles per value of "rating".
plt.figure(figsize=(15,8))
sns.countplot(x="rating", data=netflix_data)

In [None]:
# Distinct rating codes; these are the keys the audience mapping has to cover.
netflix_data["rating"].unique()

In [None]:
# Rating codes grouped by the audience they are assigned to.
ratings_by_audience = {
    "Kids": ["TV-Y", "TV-G", "G"],
    "Older Kids": ["TV-PG", "TV-Y7-FV", "TV-Y7", "PG"],
    "Teens": ["TV-14", "PG-13"],
    "Adults": ["TV-MA", "R", "NR", "UR", "NC-17"],
}

# Flatten the groups into the rating -> audience lookup used by replace().
mapping_audience = {
    rating: audience
    for audience, ratings in ratings_by_audience.items()
    for rating in ratings
}

# replace() leaves any rating that is not a key of the mapping unchanged.
netflix_data["target_audience"] = netflix_data["rating"].replace(mapping_audience)

netflix_data["target_audience"].unique()

In [None]:
# Number of titles per target audience.
plt.figure(figsize=(15,8))
sns.countplot(x="target_audience", data=netflix_data)

In [None]:
# Target audience counts, split by "type".
sns.countplot(x="target_audience", data=netflix_data, hue="type")

* Häufigkeit der Schauspieler
* Länder mit den meisten Filmen/Serien (USA, Indien, Japan)
* Aktivität einzelner Schauspieler nach Jahren
* Serien mit mehr als 3 Staffeln (viele mit nur einer Staffel, ...)
* Dauer von Filmen (nach Jahren) (werden länger)
* Genres nach Jahren (beliebteste Genres)

## Länder

In [None]:
# The country field can list several countries separated by commas;
# keep the text before the first comma as the main country.
netflix_data["country_main"] = netflix_data["country"].map(
    lambda countries: countries.partition(",")[0]
)

In [None]:
# Distinct main countries after the split.
netflix_data["country_main"].unique()

In [None]:
# Titles per main country; tick labels are rotated by 90 degrees.
plt.figure(figsize=(15,7))
sns.countplot(x="country_main", data=netflix_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The ten most frequent main countries with their counts.
country_counts = netflix_data["country_main"].value_counts()
country_counts.head(10)

In [None]:
# Limit the chart to the 15 most frequent main countries, ordered by frequency.
top_countries = netflix_data["country_main"].value_counts().head(15).index

plt.figure(figsize=(15, 7))
sns.countplot(data=netflix_data, x="country_main", hue="type", order=top_countries)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Two-way split: titles whose main country is the United States vs. everything else.
is_us_title = netflix_data["country_main"] == "United States"
netflix_data["rest_der_welt"] = np.where(is_us_title, "US", "Rest der Welt")

In [None]:
# First rows, including the columns derived so far.
netflix_data.head()

In [None]:
# Number of titles in the "US" and "Rest der Welt" groups.
sns.countplot(x="rest_der_welt", data=netflix_data)

In [None]:
# Target audience counts, split into US vs. rest of the world.
sns.countplot(x="target_audience", data=netflix_data, hue="rest_der_welt")

In [None]:
# Titles per release year (2000 onwards), split into US vs. rest of the world.
# The subset is rebuilt here from netflix_data so that it contains "rest_der_welt".
titles_since_2000 = netflix_data[netflix_data["release_year"] >= 2000]

plt.figure(figsize=(15, 7))
sns.countplot(data=titles_since_2000, x="release_year", hue="rest_der_welt")

## Duration / Länge

In [None]:
# Independent copy of the movie rows. Later cells add columns to movie_data;
# without .copy() those writes would target a filtered slice of netflix_data
# and trigger pandas' SettingWithCopyWarning.
movie_data = netflix_data[netflix_data["type"] == "Movie"].copy()

In [None]:
# Turn the movie runtime text into an integer by removing the " min" suffix.
# assign() builds a new frame rather than writing a column into movie_data,
# so no write ever targets a filtered slice of netflix_data
# (avoids SettingWithCopyWarning). regex=False makes the suffix a literal match.
movie_data = movie_data.assign(
    duration_int=movie_data["duration"]
    .str.replace(" min", "", regex=False)
    .astype(int)
)

In [None]:
# Check the dtypes of the movie frame, including the new duration_int column.
movie_data.info()

In [None]:
# Histogram of movie durations with 25 bins.
movie_data["duration_int"].hist(bins=25)

In [None]:
# Shortest movie duration.
movie_data["duration_int"].min()

In [None]:
# Mean movie duration per release year (Series indexed by release_year).
duration_year = movie_data.groupby(["release_year"])["duration_int"].mean()
duration_year

In [None]:
# Line chart of the per-year mean stored in duration_year.
sns.lineplot(x=duration_year.index, y=duration_year.values)

## To-dos

1. Genres analysieren (ähnlich bzw. gleich wie Länder)
2. Serien analysieren (ähnlich wie Duration bei Filmen)

## Duration Serien

In [None]:
# Independent copy of the TV-show rows. Later cells add columns to serien_data;
# without .copy() those writes would target a filtered slice of netflix_data
# and trigger pandas' SettingWithCopyWarning.
serien_data = netflix_data[netflix_data["type"] == "TV Show"].copy()

In [None]:
# Extract the season count from the duration text as an integer.
# Taking the first run of digits replaces the two-step removal of " Season"
# followed by every "s" character, and assign() builds a new frame rather than
# writing into a filtered slice of netflix_data (avoids SettingWithCopyWarning).
serien_data = serien_data.assign(
    duration_int=serien_data["duration"]
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)

In [None]:
# Largest number of seasons among the TV shows.
serien_data["duration_int"].max()

In [None]:
# Histogram of season counts with 16 bins.
serien_data["duration_int"].hist(bins=16)

In [None]:
# The 20 shows with the most seasons, drawn as a bar chart.
serien_by_seasons = serien_data.sort_values(by="duration_int", ascending=False)
top_20_serien = serien_by_seasons.head(20)

plt.figure(figsize=(15, 7))
sns.barplot(data=top_20_serien, x="title", y="duration_int")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean number of seasons per release year. The result gets its own name so it
# does not overwrite duration_year, which holds the per-year movie runtimes.
serien_duration_year = serien_data.groupby("release_year")["duration_int"].mean()
sns.lineplot(x=serien_duration_year.index, y=serien_duration_year.values)

## Genres

In [None]:
# listed_in can hold several comma-separated genres; keep the text before
# the first comma as the primary genre.
netflix_data["genre"] = netflix_data["listed_in"].map(
    lambda genres: genres.partition(",")[0]
)

In [None]:
# Labels of the ten most frequent primary genres.
genre_counts = netflix_data["genre"].value_counts()
genre_counts.head(10).index

In [None]:
# Bar chart of the ten most frequent primary genres, ordered by frequency.
top_genres = netflix_data["genre"].value_counts().head(10).index

plt.figure(figsize=(15, 7))
sns.countplot(data=netflix_data, x="genre", order=top_genres)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Top ten primary genres, split into US vs. rest of the world.
top_genres_by_count = netflix_data["genre"].value_counts().head(10).index

plt.figure(figsize=(15, 7))
sns.countplot(
    data=netflix_data,
    x="genre",
    hue="rest_der_welt",
    order=top_genres_by_count,
)
plt.xticks(rotation=45)
plt.show()

## Länge nach Genre

In [None]:
# Primary genre per movie (text before the first comma of listed_in).
# assign() builds a new frame rather than writing a column into movie_data,
# so no write ever targets a filtered slice of netflix_data
# (avoids SettingWithCopyWarning).
movie_data = movie_data.assign(
    genre=movie_data["listed_in"].str.split(",").str[0]
)

# Mean movie duration per primary genre, shown in ascending order.
duration_genre = movie_data.groupby("genre")["duration_int"].mean()
duration_genre.sort_values()

In [None]:
# Sort once and reuse the result for both axes.
duration_genre_sorted = duration_genre.sort_values()
sns.barplot(x=duration_genre_sorted.index, y=duration_genre_sorted.values)
plt.xticks(rotation=90)
plt.show()

## Längster und kürzester Film

In [None]:
# Shortest and longest movie duration.
movie_durations = movie_data["duration_int"]
duration_min, duration_max = movie_durations.min(), movie_durations.max()

print(duration_min, duration_max)

In [None]:
# Rows whose duration equals the maximum (more than one row if there are ties).
movie_data[movie_data["duration_int"] == duration_max]

In [None]:
# Rows whose duration equals the minimum (more than one row if there are ties).
movie_data[movie_data["duration_int"] == duration_min]

# Recommender

Implementing a content-based recommender system.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


def _vocabulary(vectorizer):
    """Return the fitted feature names of ``vectorizer`` as a list.

    ``get_feature_names()`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out()``. The new method is preferred and the old one
    is only used when the installed version does not provide it.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:  # older scikit-learn without the *_out method
        getter = vectorizer.get_feature_names
    return list(getter())


# Toy corpus comparing raw term counts with TF-IDF weights.
text = ["The sky is blue",
        "The sun is bright",
        "The sky is bright",
        "The sun is yellow"]
# One row label per document: "Dok 1", "Dok 2", ...
document_labels = [f"Dok {number}" for number in range(1, len(text) + 1)]

countVectorizer = CountVectorizer(stop_words="english")
tfidfVectorizer = TfidfVectorizer(stop_words="english")

count_wm = countVectorizer.fit_transform(text)
tfidf_wm = tfidfVectorizer.fit_transform(text)

count_tokens = _vocabulary(countVectorizer)
tfidf_tokens = _vocabulary(tfidfVectorizer)

df_count = pd.DataFrame(
    data=count_wm.toarray(), index=document_labels, columns=count_tokens
)
df_tfidf = pd.DataFrame(
    data=tfidf_wm.toarray(), index=document_labels, columns=tfidf_tokens
)

print("Count Vectorizer\n")
print(df_count)
print("TFIDF Vectorizer\n")
print(df_tfidf)

In [None]:
# Columns that feed the recommender, grouped by how they are cleaned later:
# "lists" hold comma-separated names, "text" holds free text.
features = ["listed_in", "cast", "director", "description", "title"] # "target_audience", "type"
lists = ["listed_in", "cast", "director"]
text = ["description", "title"]

# Work on an independent copy: the following cells overwrite and add columns,
# which on a plain column selection would raise SettingWithCopyWarning.
# (A leading netflix_data.head() was dropped: as a non-final expression it
# produced no output.)
netflix_recommender = netflix_data[features].copy()
netflix_recommender.head()

In [None]:
def clean_data_list(x):
    """Remove every space from ``x`` and return the result in lower case.

    With spaces removed, a multi-word name such as "Aaron Paul" becomes the
    single string "aaronpaul".
    """
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

def clean_data_text(x: str) -> str:
    """Return ``x`` converted to lower case; spaces are kept."""
    return str.lower(x)

In [None]:
# Pair each column group with its cleaner, then normalise column by column:
# list-like columns lose their spaces, free-text columns are only lower-cased.
cleaning_plan = ((lists, clean_data_list), (text, clean_data_text))
for column_group, cleaner in cleaning_plan:
    for feature in column_group:
        netflix_recommender[feature] = netflix_recommender[feature].apply(cleaner)

In [None]:
# Inspect the cleaned columns.
netflix_recommender.head()

In [None]:
# Size of the recommender frame as (rows, columns).
netflix_recommender.shape

In [None]:
# Concatenate director, cast, genres and description (in that order, joined
# by single spaces) into one text per title for the vectorizer.
text_sources = ("director", "cast", "listed_in", "description")
combined_text = netflix_recommender[text_sources[0]]
for source in text_sources[1:]:
    combined_text = combined_text + " " + netflix_recommender[source]
netflix_recommender["full_text"] = combined_text

In [None]:
# Show the combined text of the first row. Positional access is used because
# rows were dropped from netflix_data earlier, so the index label 0 is not
# guaranteed to exist at this point.
netflix_recommender["full_text"].iloc[0]

In [None]:
# Bag-of-words counts of the combined text, ignoring English stop words.
count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(netflix_recommender["full_text"])

In [None]:
# (number of titles, vocabulary size)
count_matrix.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all titles. Passing only one matrix
# compares its rows with each other, the same as passing it twice.
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Shape of the similarity matrix; it has one row per row of count_matrix.
cosine_sim.shape

In [None]:
# Display the similarity matrix.
cosine_sim

In [None]:
# Renumber the rows 0..n-1 so that a row label equals its position in
# cosine_sim. drop=True discards the old index instead of storing it as a
# column, which also keeps this cell safe to re-run (without it, repeated runs
# pile up "index"/"level_0" columns and eventually raise).
netflix_recommender = netflix_recommender.reset_index(drop=True)

# Lookup table: lower-cased title -> row position.
indices = pd.Series(netflix_recommender.index, index=netflix_recommender["title"])
indices

In [None]:
def get_recommendation(title, top_n=10):
    """Return the titles most similar to ``title``.

    Relies on three notebook globals: ``indices`` (lower-cased title -> row
    position), ``cosine_sim`` (pairwise similarity matrix) and
    ``netflix_data`` (source of the returned titles, addressed by position).

    Parameters
    ----------
    title : str
        Title to look up; matching is case-insensitive.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first; the queried row itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    key = title.lower()  # Breaking Bad -> breaking bad
    if key not in indices.index:
        raise KeyError(f"Title not found in the catalogue: {title!r}")

    match = indices[key]  # breaking bad -> 42
    # A title that occurs more than once makes the lookup return a Series
    # instead of a scalar; use its first occurrence.
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # (row position, similarity) pairs, most similar first. sorted() is
    # stable, so equal scores keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried row explicitly rather than assuming it ranks first:
    # another row with the same similarity may be sorted ahead of it.
    movie_indices = [position for position, _ in ranked if position != idx][:top_n]

    return netflix_data["title"].iloc[movie_indices]

In [None]:
# Example query; the title is lower-cased inside the function before the lookup.
get_recommendation("Breaking Bad")