### 1. Data Import

In [None]:
import os
import numpy as np
import pandas as pd


In [None]:
# Load the Netflix titles table from a path relative to the notebook
# (presumably the Kaggle input directory layout — adjust when running elsewhere).
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')
# Preview the first rows to check that the file was parsed as expected.
data.head()

### 2. Some Exploration

#### 2.1 What content is available in different countries?

In [None]:
# A missing country becomes an empty string so every row can be split.
data["country"] = data["country"].fillna("")
# One list of country names per title: split on commas and remove the
# whitespace that follows each comma.
country_list = [
    [name.lstrip() for name in raw.split(",")]
    for raw in data["country"]
]
data["country_list"] = country_list

In [None]:
import itertools

# Flatten the per-title country lists into one list; duplicates are kept so
# the entries can be counted later.
all_countries = list(itertools.chain.from_iterable(country_list))
# Unique country names in alphabetical order. The empty string stands for a
# missing country and is removed by value: a set has no defined order, so
# slicing off "the first element" could discard a real country instead.
all_unique_countries = sorted(country for country in set(all_countries) if country)

In [None]:
import collections
import matplotlib.pyplot as plt

# Number of titles per country, most frequent first (all countries, no
# hard-coded cut-off).
country_count = pd.DataFrame(collections.Counter(all_countries).most_common(),
                            columns=["country","count"])
# Remove the placeholder for titles without a country by value. Dropping a
# fixed row label only works while the blank entry happens to sit at that rank.
country_count = country_count[country_count["country"] != ""]

# Bar chart of the 20 most frequent countries.
plt.bar(country_count["country"][:20],country_count["count"][:20])
plt.xticks(rotation=90)
plt.show()

The bar graph above presents the number of available Netflix movies and TV shows in the top 20 countries.

In [None]:
# A missing genre string becomes empty so every row can be split.
data["listed_in"] = data["listed_in"].fillna("")
# One list of genre names per title: split on commas and remove the
# whitespace that follows each comma.
genre = [
    [label.lstrip() for label in raw.split(",")]
    for raw in data["listed_in"]
]
data["genre"] = genre

In [None]:
# Flatten the per-title genre lists; duplicates are kept for counting.
all_genre = [label for sublist in genre for label in sublist]
all_unique_genre = sorted(set(all_genre))
# Frequency table of the 42 most common genres, most frequent first.
genre_count = pd.DataFrame(
    collections.Counter(all_genre).most_common(42),
    columns=["genre","count"],
)

# Bar chart of the 15 most frequent genres.
plt.bar(genre_count["genre"].head(15), genre_count["count"].head(15))
plt.xticks(rotation=90)
plt.show()

The bar graph above presents the number of titles in each of the top 15 genres.

In [None]:
# Map each country to the genres of the titles listed for it. Titles without
# a country are collected under the empty-string key.
genre_sets = {}
# Pair the per-title lists directly instead of indexing up to a hard-coded
# row count, so the cell works for any dataset size. Sets make the
# de-duplication a single pass instead of rebuilding a list on every step.
for countries, genres in zip(country_list, genre):
    for country in countries:
        genre_sets.setdefault(country, set()).update(genres)
# Lists as before, sorted so the output is the same on every run.
genre_in_country = {country: sorted(genres) for country, genres in genre_sets.items()}

In [None]:
# Which genres are listed for titles whose country includes Egypt?
genre_in_country["Egypt"]

#### 2.2 Network analysis of actors and directors

In [None]:
# Cast strings of the titles that have one; titles without cast are skipped.
cast = [names for names in data["cast"].fillna("") if names != ""]
# One list of performer names per title, split on commas with the whitespace
# after each comma removed.
cast = [[person.lstrip() for person in names.split(",")] for names in cast]
# Every credit in one flat list (duplicates kept), and the distinct names.
all_cast = [person for people in cast for person in people]
all_unique_cast = list(set(all_cast))

In [None]:
# Number of titles per performer, most frequent first.
castdf = pd.DataFrame(
    collections.Counter(all_cast).most_common(),
    columns=["actor/actress","count"],
)
# Keep only performers credited in at least three titles.
castdf_3 = castdf[castdf["count"] >= 3]

In [None]:
# castdf is ordered by count, so these are the most frequently credited performers.
castdf_3.head()

Top 5 actors and the number of their works.

In [None]:
from nltk import bigrams

# Pairs of names that stand next to each other in a title's cast list.
# NOTE(review): bigrams only pairs neighbours in the listed order, and (a, b)
# is counted separately from (b, a) — confirm this is the intended notion of
# "worked together".
cast_bigram = [pair for group in cast for pair in bigrams(group)]
bicast_count = collections.Counter(cast_bigram)
# The 30 most frequent pairs and how often each occurs.
bicast = pd.DataFrame(bicast_count.most_common(30),columns=["groups","count"])

In [None]:
# network visualization 
d = bicast.set_index('groups').T.to_dict('records')

import networkx as nx
G = nx.Graph()
for k, v in d[0].items():
    G.add_edge(k[0], k[1], weight=(v * 10))
               

spring_3D = nx.spring_layout(G,dim=3, seed=18)
label = list(spring_3D.keys())
x_nodes = [spring_3D[i][0] for i in label]# x-coordinates of nodes
y_nodes = [spring_3D[i][1] for i in label]# y-coordinates
z_nodes = [spring_3D[i][2] for i in label]# z-coordinates
edge_list = G.edges()
x_edges=[]
y_edges=[]
z_edges=[]

#need to fill these with all of the coordiates
for edge in edge_list:
    #format: [beginning,ending,None]
    x_coords = [spring_3D[edge[0]][0],spring_3D[edge[1]][0],None]
    x_edges += x_coords

    y_coords = [spring_3D[edge[0]][1],spring_3D[edge[1]][1],None]
    y_edges += y_coords

    z_coords = [spring_3D[edge[0]][2],spring_3D[edge[1]][2],None]
    z_edges += z_coords

import networkx as nx 
import plotly.graph_objects as go
import pandas as pd

%matplotlib inline

trace_edges = go.Scatter3d(x=x_edges,
                        y=y_edges,
                        z=z_edges,
                        mode='lines',
                        line=dict(color='black', width=2),
                        hoverinfo='none')

trace_nodes = go.Scatter3d(x=x_nodes,
                         y=y_nodes,
                        z=z_nodes,
                        mode='markers+text',
                        marker=dict(symbol='circle',
                                    size=10,
                                    line=dict(color='black', width=0.5)),
                        text=label,
                        hoverinfo='text')

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title='')

layout = go.Layout(title="cast network",
                width=650,
                height=625,
                showlegend=False,
                scene=dict(xaxis=dict(axis),
                        yaxis=dict(axis),
                        zaxis=dict(axis),
                        ),
                margin=dict(t=100),
                hovermode='closest')

vizdata = [trace_edges, trace_nodes]
fig = go.Figure(data=vizdata, layout=layout)

fig.show()

Clustering exists.

In [None]:
# Director and cast columns for the titles where both are known. The explicit
# copy makes dir_cast an independent frame, so assigning new column values to
# it does not trigger pandas' SettingWithCopyWarning.
dir_cast = data[["director","cast"]].dropna().copy()

In [None]:
# Replace the comma-separated strings by lists of names, removing the
# whitespace that follows each comma.
dir_cast["director"] = [
    [person.lstrip() for person in names.split(",")]
    for names in dir_cast["director"]
]
dir_cast["cast"] = [
    [person.lstrip() for person in names.split(",")]
    for names in dir_cast["cast"]
]

In [None]:
direct = list(dir_cast["director"])
ca = list(dir_cast["cast"])
# Every [director, actor] combination within a title. The two lists are
# walked in parallel instead of indexing up to a hard-coded row count, so the
# cell works for any number of rows.
dircast_pair = [
    [director, act]
    for directors, actors in zip(direct, ca)
    for director in directors
    for act in actors
]

In [None]:
# Every director credit in one flat list (duplicates kept), and the distinct names.
all_director = [name for names in direct for name in names]
unique_director = list(set(all_director))
# The ten directors with the most credits.
director_count = pd.DataFrame(
    collections.Counter(all_director).most_common(10),
    columns= ["director","count"],
)
director_count

Top 10 directors and the number of their works.

In [None]:
# [director, actor] pairs restricted to three selected directors.
top3 = [
    pair for pair in dircast_pair
    if pair[0] in ["Jan Suter","Raúl Campos","Marcus Raboy"]
]
# Each two-element pair yields exactly one (director, actor) tuple.
top3_bigram = [gram for pair in top3 for gram in bigrams(pair)]
# How often each director-actor tuple occurs, most frequent first.
top3df = pd.DataFrame(
    collections.Counter(top3_bigram).most_common(),
    columns=["pair","count"],
)

In [None]:
# 2D network visualization
# Single-element list holding a {(director, actor): count} mapping.
d = top3df.set_index('pair').T.to_dict('records')
G = nx.Graph()

# One edge per director-actor pair, weighted by how often the pair occurs.
for k, v in d[0].items():
    G.add_edge(k[0], k[1], weight=(v * 10))

fig, ax = plt.subplots(figsize=(10, 8))
# Fixed seed keeps the layout reproducible.
pos = nx.spring_layout(G, k=2,seed=1)
nx.draw_networkx(G, pos,
                 font_size=16,
                 width=3,
                 edge_color='grey',
                 node_color='purple',
                 with_labels = False,
                 ax=ax)

# Built-in labels are off; draw each name in a box slightly offset from its node.
for key, value in pos.items():
    x, y = value[0]+.135, value[1]+.045
    ax.text(x, y,
            s=key,
            bbox=dict(facecolor='red', alpha=0.25),
            horizontalalignment='center', fontsize=13)
# The figure size is already set by plt.subplots above. The extra
# plt.figure(...) calls that used to follow only opened additional empty
# figures (one of them 500x500 inches), so they are gone.
plt.show()

The 2D plot is not very clear, so use a 3D plot instead.

In [None]:
# 3d
d = top3df.set_index('pair').T.to_dict('records')
G = nx.Graph()

for k, v in d[0].items():
    G.add_edge(k[0], k[1], weight=(v * 10))              

spring_3D = nx.spring_layout(G,dim=3, seed=18)
label = list(spring_3D.keys())
x_nodes = [spring_3D[i][0] for i in label]# x-coordinates of nodes
y_nodes = [spring_3D[i][1] for i in label]# y-coordinates
z_nodes = [spring_3D[i][2] for i in label]# z-coordinates
edge_list = G.edges()
x_edges=[]
y_edges=[]
z_edges=[]

for edge in edge_list:
    #format: [beginning,ending,None]
    x_coords = [spring_3D[edge[0]][0],spring_3D[edge[1]][0],None]
    x_edges += x_coords

    y_coords = [spring_3D[edge[0]][1],spring_3D[edge[1]][1],None]
    y_edges += y_coords

    z_coords = [spring_3D[edge[0]][2],spring_3D[edge[1]][2],None]
    z_edges += z_coords

%matplotlib inline

trace_edges = go.Scatter3d(x=x_edges,
                        y=y_edges,
                        z=z_edges,
                        mode='lines',
                        line=dict(color='black', width=2),
                        hoverinfo='none')

trace_nodes = go.Scatter3d(x=x_nodes,
                         y=y_nodes,
                        z=z_nodes,
                        mode='markers+text',
                        marker=dict(symbol='circle',
                                    size=10,
                                    line=dict(color='black', width=0.5)),
                        text=label,
                        hoverinfo='text')

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title='')

layout = go.Layout(title="top3 director-actor network",
                width=650,
                height=625,
                showlegend=False,
                scene=dict(xaxis=dict(axis),
                        yaxis=dict(axis),
                        zaxis=dict(axis),
                        ),
                margin=dict(t=100),
                hovermode='closest')

vizdata = [trace_edges, trace_nodes]
fig = go.Figure(data=vizdata, layout=layout)

fig.show()

Jan Suter and Raúl Campos are probably partners.

### 3. A simple recommendation engine

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A missing description becomes an empty document so every row can be vectorized.
data["description"] = data["description"].fillna('')
# TF-IDF weights over the descriptions, with English stop words removed.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(data["description"])

In [None]:
# using cosine similarity for similarity between two descriptions
# linear_kernel computes plain dot products between the rows.
# NOTE(review): these equal cosine similarities only while the TF-IDF rows are
# L2-normalised (the TfidfVectorizer default) — confirm if its settings change.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [None]:
# Look-up table from title to the row index of `data`.
indices = pd.Series(data.index,index=data["title"])
# If a title occurs more than once keep only its first row, so that
# indices[title] is always a single value rather than a Series.
indices = indices[~indices.index.duplicated(keep="first")]

In [None]:
# Directors per title as a list of names; a missing director yields [""].
data["director"] = data["director"].fillna("")
data["director list"] = [
    [person.lstrip() for person in names.split(",")]
    for names in data["director"]
]

# Cast per title as a list of names; a missing cast yields [""].
data["cast"] = data["cast"].fillna("")
data["top3 cast"] = [
    [person.lstrip() for person in names.split(",")]
    for names in data["cast"]
]

In [None]:
def get_top3(x):
    """Return at most the first three elements of x.

    A sequence of three or fewer elements is returned unchanged.
    """
    return x if len(x) <= 3 else x[:3]
# Keep at most three cast members and three directors per title.
data["top3 cast"] = [get_top3(names) for names in data["top3 cast"]]
data["top3 dir"] = [get_top3(names) for names in data["director list"]]

In [None]:
def lower_no_space(x):
    """Lower-case every string in x and remove its spaces.

    The placeholder list [""] is returned unchanged.
    """
    if x != [""]:
        return [text.replace(" ","").lower() for text in x]
    return x

# Columns whose names are collapsed into single lower-case tokens, so that a
# multi-word name stays one token.
features = ["top3 dir","top3 cast","genre"]
for column in features:
    data[column] = [lower_no_space(names) for names in data[column]]

In [None]:
def join_feature(df):
    """Join a row's genre, director and cast tokens into one space-separated string.

    df is a single row (or any mapping) with the keys "genre", "top3 dir" and
    "top3 cast", each holding a list of strings.
    """
    parts = (" ".join(df[column]) for column in ("genre", "top3 dir", "top3 cast"))
    return " ".join(parts)
# One combined feature string per title, built row by row with join_feature.
data['rec_feature'] = data.apply(join_feature,axis=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Token counts over the combined feature strings, with English stop words removed.
count = CountVectorizer(stop_words = "english")
count_matrix = count.fit_transform(data["rec_feature"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the count matrix.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
def recommender(title,sim1 = cosine_sim,sim2=cosine_sim2,n=5):
    """
    Recommend the Netflix shows most similar to a given title.

    A candidate's score is the sum of its entries in two similarity matrices.
    With the defaults these are the description-based matrix (sim1) and the
    matrix built from genres, directors and top-3 cast (sim2).

    Parameters
    ----------
    title : str
        Title exactly as it appears in data["title"].
    sim1, sim2 : 2-D array-like
        Square similarity matrices whose rows follow the row order of `data`.
    n : int
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Titles of the n best-scoring other shows, best first.

    Raises
    ------
    KeyError
        If title is not present in the `indices` look-up table.
    """
    if title not in indices.index:
        raise KeyError(f"title not found: {title!r}")
    index = indices[title]
    # A title that occurs several times yields a Series; use its first row.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Use the matrices passed in. Reading the module-level matrices here, as
    # before, made the sim1/sim2 arguments have no effect.
    total_scores = np.asarray(sim1[index]) + np.asarray(sim2[index])
    # Stable sort on the negated scores: best first, ties in row order.
    ranking = np.argsort(-total_scores, kind="stable")
    # Exclude the queried title by its row rather than assuming it ranks first.
    top_n_index = [row for row in ranking if row != index][:n]

    return data["title"].iloc[top_n_index]
    
    

In [None]:
# Example: the default five recommendations for "13 Reasons Why".
recommender("13 Reasons Why")