# In this notebook I'll try to create a recommendation engine using K-means with TF-IDF and NetworkX (for graphs)

**This notebook is inspired by the work of Mr. Yann Claudel, specifically his Netflix movie notebook — a great notebook that I highly recommend:**
[https://www.kaggle.com/yclaudel/recommendation-engine-with-networkx](https://www.kaggle.com/yclaudel/recommendation-engine-with-networkx)

# **Importation**

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math as math
import time 

In [None]:
# Read the raw articles export and discard the 'Unnamed: 0' column.
df = pd.read_csv(
    "../input/internet-articles-data-with-users-engagement/articles_data.csv"
).drop(columns='Unnamed: 0')
print(df.shape)
df.head()

In [None]:
# Show the distinct values of both source columns so they can be compared.
for source_column in ('source_id', 'source_name'):
    print(df[source_column].unique())

As we can see, the two columns `source_id` and `source_name` carry almost the same information, so we'll use only `source_name`. There is also a `source_name` value equal to 460.0, which is odd; let's check the rows containing that value.

In [None]:
# Flag the rows whose source_name is the stray literal "460.0" and show them.
bogus_source = df['source_name'] == "460.0"
print(df[bogus_source])
# The printed row is reported to hold only NaN values, so it is discarded.
df = df[~bogus_source]
print("Row Dropped")
df.shape

## *In this work we're creating a recommendation engine, so we won't be interested in all the columns — for example, the last ones about Facebook sharing are not needed. The first thing to do is to build a TF-IDF clustering by description.*

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We have 24 rows without a description; we'll need to clean that up:

In [None]:
# Per-column missing counts, restricted to the rows that have no description.
no_description = df['description'].isna()
print(df[no_description].isna().sum())
# Keep only rows that have a title, then snapshot the frame for later reuse.
df = df[df['title'].notna()]
df_2 = df.copy()

As you can see, 18 of them are also without content. We'll drop these 18 because we can't do anything to fix them if we don't even know the content of the article.

In [None]:
# Restart from the snapshot so this cell can be re-run on its own.
df = df_2.copy()
has_desc = df['description'].notna()
# Rows with no description but with content: kept so a description can be derived.
empty_desc = df[~has_desc & df['content'].notna()]
# Rows lacking both description and content are left out; the kept
# description-less rows are appended after the rows that have a description.
df = pd.concat([df[has_desc], empty_desc], axis=0)
df.isna().sum()

## For the last 6 empty descriptions we're going to use `summarize` from the gensim library to summarize the content and save it as the description

In [None]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

In [None]:
# Split the frame in two: rows missing a description, and all the others.
lacks_description = df['description'].isna()
empty_desc, df = df[lacks_description], df[~lacks_description]


In [None]:
# Fill each missing description with a gensim summary of the article content.
# `assign` builds a new frame instead of writing a column into `empty_desc`,
# which is itself a selection taken from `df`; that write is the pattern
# pandas flags with SettingWithCopyWarning.
# `or text` keeps the raw content whenever `summarize` returns an empty string
# (presumably for very short content; confirm against the gensim docs).
empty_desc = empty_desc.assign(
    description=empty_desc['content'].apply(
        lambda text: summarize(text, ratio=0.5) or text
    )
)
# Append the rows with their new descriptions back onto the main frame.
df = pd.concat([df,empty_desc],axis=0)

## Now that our description values are well cleaned, it's time to start our K-means using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer #The Vector creator
from sklearn.metrics.pairwise import linear_kernel #Cosine similarity
from sklearn.cluster import MiniBatchKMeans #Kmeans Clustering Batch

In [None]:
# Build the TF-IDF matrix from the article descriptions.
vector = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    max_df=0.5,
    min_df=1,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)
cluster_content = df['description']
tfidf = vector.fit_transform(cluster_content)

In [None]:
k = 200
# A fixed seed (arbitrary value) makes the clustering reproducible across runs.
kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
kmeans.fit(tfidf)
# For every cluster: feature indices ordered by descending centroid weight.
centers = kmeans.cluster_centers_.argsort()[:, ::-1]
# `get_feature_names` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out`; use whichever this installation provides.
# `terms` stays a plain list in both cases.
if hasattr(vector, "get_feature_names_out"):
    terms = list(vector.get_feature_names_out())
else:
    terms = vector.get_feature_names()

### Adding our data (descriptions) and predicting their clusters:

In [None]:
# Vectorise the descriptions with the fitted TF-IDF model, then assign each
# row to its nearest k-means centroid.
request_transform = vector.transform(df['description'])
cluster_labels = kmeans.predict(request_transform)
df['cluster'] = cluster_labels
# Sizes of the largest clusters.
df['cluster'].value_counts().head()

In [None]:
# Preview the first rows, now including the new `cluster` column.
df.head(n=5)

I think I won't be using this clustering column because it's so unbalanced.

## Now we're Going to use Cosine Similarity to compute the similarity between docs

In [None]:
def find_similar(matrix,index,top_n=5):
    """Return the row positions of the documents most similar to one document.

    Parameters
    ----------
    matrix : the document-term matrix; row ``index`` is the query document.
    index : row position of the query document inside ``matrix``.
    top_n : maximum number of positions to return.

    Returns
    -------
    list
        Row positions ordered by descending ``linear_kernel`` score against
        the query row, with the query row itself left out.
    """
    query_row = matrix[index:index+1]
    scores = linear_kernel(query_row, matrix).flatten()
    # argsort is ascending, so reverse it to get the best matches first.
    ranked = scores.argsort()[::-1]
    neighbours = []
    for candidate in ranked:
        if candidate != index:
            neighbours.append(candidate)
    return neighbours[0:top_n]

# **Now let's Create our graph**

#### Nodes Are : 

* Title 
* Person (Author)
* Press (Source_name)
* Cluster ( Description ) 
* Sim

#### Edges are :

* Wrote : relation between title and person
* CAT : Relation between title and Press
* Description : relation between cluster and an article
* Similarity : relation between articles with similar descriptions

In [None]:
G = nx.Graph(label="Article")
start_time = time.time()
# `pos` is the row's position in `df`, and therefore its row in `tfidf`, which
# was fitted on df['description'] in this same order. `i` is the index label.
# After the earlier filtering and concatenation the labels no longer equal the
# positions, so the similarity lookup must use `pos`, not `i`.
for pos,(i,rowi) in enumerate(df.iterrows()) :
    # Only rows whose index label is at most 3000 become article nodes.
    if (i > 3000) :
        continue
    if (i%1000 == 0) : 
        print("Iter  {} --- {} secondes --".format(i,time.time()-start_time))
    G.add_node(rowi['title'],key=i,label="Article")
    # Skip missing authors; otherwise author-less articles would be attached
    # to a NaN "Person" node and look as if they shared an author.
    if pd.notna(rowi['author']):
        G.add_node(rowi['author'],label="Person")
        G.add_edge(rowi['title'],rowi['author'],label="Wrote")
    G.add_node(rowi['source_name'],label="Press")
    G.add_edge(rowi['title'],rowi['source_name'],label="CAT")
    #Similarity Node :
    indices = find_similar(tfidf, pos, top_n = 5)
    # The similarity node is named after the first 15 characters of the title.
    snode="Sim("+rowi['title'][:15].strip()+")"        
    G.add_node(snode,label="SIMILAR")
    G.add_edge(rowi['title'], snode, label="SIMILARITY")
    for element in indices:
        # `element` is a position in `tfidf`, so resolve the title with iloc.
        G.add_edge(snode, df['title'].iloc[element], label="SIMILARITY")
print(" finish -- {} seconds --".format(time.time() - start_time))   

## Function to draw our graph. No need to understand the details, just the general idea; you can copy it and use it in other projects, but change the node labels.

In [None]:
def get_all_adj_nodes(list_in):
    """Collect the given nodes together with all their direct neighbours in ``G``.

    ``list_in`` is an iterable of nodes present in the module-level graph ``G``.
    Returns a list of the unique nodes gathered (the seeds and their neighbours).
    """
    collected = set()
    for seed in list_in:
        collected.add(seed)
        collected.update(G.neighbors(seed))
    return list(collected)

def draw_sub_graph(sub_graph):
    """Draw the subgraph of ``G`` induced by ``sub_graph``, coloured by node label.

    Parameters
    ----------
    sub_graph : iterable of nodes of the module-level graph ``G``.

    Nodes are coloured blue (Article), red (Person), green (Press) or
    yellow (SIMILAR). A node with no ``label`` attribute, or with any other
    label, is drawn in gray. Nodes created implicitly by ``add_edge`` carry no
    attributes, and the colour list must have one entry per node for
    ``nx.draw``, so every node needs a fallback colour.
    """
    label_colors = {
        "Article": 'blue',
        "Person": 'red',
        "Press": 'green',
        "SIMILAR": 'yellow',
    }
    subgraph = G.subgraph(sub_graph)
    # Exactly one colour per node, in the order the nodes will be drawn.
    colors = [
        label_colors.get(G.nodes[e].get('label'), 'gray')
        for e in subgraph.nodes()
    ]

    nx.draw(subgraph, with_labels=True, font_weight='bold',node_color=colors)
    plt.show()

## Checking our graph with two examples (only two here so we can visualize it)

In [None]:
# Seed the drawing with the titles of the rows whose index labels are 1 and 2.
list_in = [df.loc[row_label, 'title'] for row_label in (1, 2)]
sub_graph = get_all_adj_nodes(list_in)
draw_sub_graph(sub_graph)

## The next function gets the neighbor nodes in our graph and computes a weight (a degree of similarity according to the graph), then sorts the neighbors by this weight

In [None]:
def get_recommendation(root):
    """Rank the articles that share at least one neighbour with ``root`` in ``G``.

    Parameters
    ----------
    root : a node of the module-level graph ``G``.

    Returns
    -------
    pandas.Series
        Indexed by article node, sorted in descending order. Each value is the
        sum of ``1 / log(degree(n))`` over the neighbours ``n`` that the
        article shares with ``root``, so a neighbour with a high degree
        contributes less than one with a low degree.
    """
    # Article node -> list of the neighbours it has in common with root.
    commons_dict = {}
    for e in G.neighbors(root):
        for e2 in G.neighbors(e):
            if e2 == root:
                continue
            # Nodes created implicitly by add_edge have no 'label' attribute.
            # `.get` skips them without a blanket `except` hiding other errors.
            if G.nodes[e2].get('label') == "Article":
                commons_dict.setdefault(e2, []).append(e)
    articles = []
    weight = []
    for key, values in commons_dict.items():
        w = 0.0
        for e in values:
            # e is adjacent to both root and a different node, so its degree
            # is at least 2 and the logarithm is strictly positive.
            w = w + 1/math.log(G.degree(e))
        articles.append(key)
        weight.append(w)

    result = pd.Series(data=np.array(weight), index=articles)
    return result.sort_values(ascending=False)

In [None]:
# Recommend articles for the row whose index label is 40.
root_title = df['title'].loc[40]
result = get_recommendation(root_title)
banner = "*"*40
print(banner+"\n Recommendation for :"+str(root_title)+"\n"+banner)
print(result.head())