In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into `data`.
data=pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Descriptive statistics for the loaded frame.
data.describe()

In [None]:
# Parse the "date_added" strings into timestamps and derive year/month/day.
# Surrounding whitespace is stripped first: pandas >= 2.0 infers a single
# format from the first value and raises on entries that deviate from it
# (for example by a leading space).
# The dtype guard keeps this cell re-runnable once the column already holds
# datetimes, where the `.str` accessor would not be available.
date_added = data['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    date_added = date_added.str.strip()
data["date_added"] = pd.to_datetime(date_added)
data['year'] = data['date_added'].dt.year
data['month'] = data['date_added'].dt.month
data['day'] = data['date_added'].dt.day

In [None]:
def _split_names(value):
    """Split a comma-separated cell into a list of trimmed, non-empty names.

    Parameters
    ----------
    value : str or missing
        Raw cell content such as ``"A, B, C"``.

    Returns
    -------
    list of str
        The trimmed names in their original order. Missing values (NaN/None)
        give an empty list. Tokens that are empty after trimming -- produced
        by leading, trailing or doubled commas -- are dropped so they cannot
        become a blank-named entry downstream.
    """
    if pd.isna(value):
        return []
    trimmed = (part.strip() for part in value.split(','))
    return [name for name in trimmed if name]


# (new list column, raw comma-separated source column), in creation order.
for list_column, source_column in [
    ('directors', 'director'),
    ('actors', 'cast'),
    ('categories', 'listed_in'),
    ('countries', 'country'),
]:
    data[list_column] = data[source_column].apply(_split_names)
data.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans
# Fixed seed so the mini-batch k-means cluster assignment is reproducible
# from one "Restart & Run All" to the next.
RANDOM_STATE = 42

text_content = data['description']
# TF-IDF representation of the descriptions: English stop words removed,
# terms present in more than 30% of the documents ignored, rows L2-normalised.
vector = TfidfVectorizer(
    max_df=0.3,
    min_df=1,
    stop_words='english',
    lowercase=True,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)
tfidf = vector.fit_transform(text_content)

kmeans = MiniBatchKMeans(n_clusters=200, random_state=RANDOM_STATE)
kmeans.fit(tfidf)
# For every cluster, feature indices ordered by descending centroid weight.
centers = kmeans.cluster_centers_.argsort()[:,::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.  list() keeps
# `terms` a plain list in both cases.
if hasattr(vector, 'get_feature_names_out'):
    terms = list(vector.get_feature_names_out())
else:
    terms = vector.get_feature_names()
# Same documents the vectorizer was fitted on, transformed with the fitted vocabulary.
request_transform = vector.transform(data['description'])
data['cluster'] = kmeans.predict(request_transform)
data['cluster'].value_counts().head()

In [None]:
# Print the TF-IDF matrix obtained by transforming the descriptions.
print(request_transform)

In [None]:
# Print the cluster id predicted for each row.
print(data['cluster'])

In [None]:
def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the positions of the rows most similar to row ``index``.

    Similarity is the linear kernel (dot product) between row ``index`` and
    every row of ``tfidf_matrix``.

    Parameters
    ----------
    tfidf_matrix : matrix supporting row slicing
        One row per document.
    index : int
        Position of the query row; it is excluded from the result.
    top_n : int, default 5
        Upper bound of the slice taken from the ranked positions.

    Returns
    -------
    list
        Row positions ordered from most to least similar.
    """
    query_row = tfidf_matrix[index:index+1]
    scores = linear_kernel(query_row, tfidf_matrix).flatten()
    # argsort is ascending, so reverse it to rank the best matches first.
    ranked_positions = scores.argsort()[::-1]
    neighbours = []
    for position in ranked_positions:
        if position != index:
            neighbours.append(position)
    return neighbours[0:top_n]

In [None]:
# Build the graph: one node per title, person, category and country,
# with labelled edges between a title and each of its attributes.
# NOTE: nodes are keyed by their name, so a string that occurs in more than
# one role shares a single node, and the most recent add_node call sets its label.
G = nx.Graph(label='NETFLIX')

# (list column on `data`, node label, edge label), in insertion order.
relations = [
    ('actors', 'PERSON', 'ACTED_IN'),
    ('directors', 'PERSON', 'DIRECTED'),
    ('categories', 'CAT', 'CAT_IN'),
    ('countries', 'COUNTRY', 'COUNTRY_IN'),
]
for _, row in data.iterrows():
    title = row['title']
    G.add_node(title, key=row['show_id'], label='MOVIE', mtype=row['type'], rating=row['rating'])
    for column, node_label, edge_label in relations:
        for name in row[column]:
            G.add_node(name, label=node_label)
            G.add_edge(title, name, label=edge_label)

# Link each title to its five most similar descriptions.
# find_similar takes and returns row *positions* in the TF-IDF matrix, so
# titles are enumerated and looked up by position rather than by index
# label; the two only coincide while `data` has a default RangeIndex.
all_titles = data['title'].to_numpy()
for position, title in enumerate(all_titles):
    for similar_position in find_similar(tfidf, position, top_n=5):
        G.add_edge(title, all_titles[similar_position], label='SIMILAR_TO')
    

In [None]:
# Total number of nodes in the graph G.
G.number_of_nodes()

In [None]:
# Total number of edges in the graph G.
G.number_of_edges()

In [None]:
def get_all_adj_nodes(list_in):
    """Return the given nodes together with all of their direct neighbours in ``G``.

    Parameters
    ----------
    list_in : iterable
        Nodes of the module-level graph ``G``.

    Returns
    -------
    list
        Each seed node and each neighbour exactly once; the order is that of
        the underlying set and therefore unspecified.
    """
    collected = set()
    for seed in list_in:
        collected.add(seed)
        collected.update(G.neighbors(seed))
    return list(collected)
def draw_sub_graph(sub_graph):
    """Draw the subgraph of ``G`` induced by ``sub_graph``, coloured by node label.

    Parameters
    ----------
    sub_graph : iterable
        Nodes of the module-level graph ``G`` to include in the drawing.

    Notes
    -----
    Colours by ``label`` attribute: MOVIE -> blue, PERSON -> red, CAT -> green,
    COUNTRY -> yellow, SIMILAR_TO -> orange.  A node with any other label, or
    with no label at all, is drawn gray.
    """
    label_colors = {
        'MOVIE': 'blue',
        'PERSON': 'red',
        'CAT': 'green',
        'COUNTRY': 'yellow',
        'SIMILAR_TO': 'orange',
    }
    default_color = 'gray'

    subgraph = G.subgraph(sub_graph)
    # Exactly one colour per node, in node order.  The colour list handed to
    # nx.draw has to line up with the nodes one-to-one, so an unrecognised
    # label gets the default colour instead of being skipped.
    colors = [
        label_colors.get(G.nodes[node].get('label'), default_color)
        for node in subgraph.nodes()
    ]

    nx.draw(subgraph, with_labels=True, font_weight='bold', node_color=colors)
    plt.show()

In [None]:
# Draw the neighbourhood of two example titles.
list_in=["Ocean's Twelve","Ocean's Thirteen"]
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and the
# old name was removed in a later release; use whichever this install provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = [14,14]
sub_graph = get_all_adj_nodes(list_in)
draw_sub_graph(sub_graph)

In [None]:
import math as math
def get_recommendation(root):
    """Rank the MOVIE nodes that share at least one neighbour with ``root``.

    Every node two hops away from ``root`` in the module-level graph ``G``
    whose ``label`` is ``"MOVIE"`` is a candidate.  Each neighbour it shares
    with ``root`` contributes ``1 / log(degree(neighbour))`` to its score
    (an Adamic-Adar style weighting), so highly connected neighbours count
    for less than rare ones.

    Parameters
    ----------
    root : hashable
        Node of ``G`` to recommend for.

    Returns
    -------
    pandas.Series
        Scores indexed by candidate node, sorted in descending order.
    """
    shared_by_movie = {}
    for neighbour in G.neighbors(root):
        for candidate in G.neighbors(neighbour):
            # Skip the query itself and anything that is not a movie node.
            if candidate != root and G.nodes[candidate]['label'] == "MOVIE":
                shared_by_movie.setdefault(candidate, []).append(neighbour)

    candidates = []
    scores = []
    for candidate, shared in shared_by_movie.items():
        score = 0.0
        for neighbour in shared:
            score += 1 / math.log(G.degree(neighbour))
        candidates.append(candidate)
        scores.append(score)

    ranking = pd.Series(data=np.array(scores), index=candidates)
    return ranking.sort_values(ascending=False)

In [None]:
# Titles to query.  Each header below is generated from the title that was
# actually looked up, so the printed label cannot disagree with the query.
query_titles = [
    "Naruto Shippuden : Blood Prison",
    "Another Miss Oh",
    "The Devil Inside",
    "Stranger Things",
]
# Compute every recommendation before printing anything.
recommendations = [get_recommendation(query_title) for query_title in query_titles]
# Keep the individual result variables available under their original names.
result, result2, result3, result4 = recommendations

banner = "*"*40
for query_title, recommendation in zip(query_titles, recommendations):
    print(banner+"\n Recommendation for '"+query_title+"'\n"+banner)
    print(recommendation.head())

In [None]:
# Show the first 100 titles, selected explicitly by position.
data['title'].iloc[:100]