<a href="https://colab.research.google.com/github/shahchhatru/AI_colab_notebooks/blob/main/Node2VecGNN_chapter4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Improving Embeddings with Biased Random Walks in Node2Vec


In [1]:
## We will use the artitecture to build a Movie RecSys



In [6]:
#Zachery karate Club datasets
import networkx as nx
import random
random.seed(0)
import numpy as np
np.random.seed(0)


G=nx.erdos_renyi_graph(10,0.3,seed=1,directed=False)


In [19]:
def next_node(previous,current,p,q):
  neighbors=list(G.neighbors(current))
  alphas=[]
  for neighbor in neighbors:
    if neighbor == previous:
      alpha=1/p
    elif G.has_edge(neighbor,previous):
      alpha=1
    else:
      alpha =1/q
    alphas.append(alpha)

  probs=[alpha/sum(alphas) for alpha in alphas]
  next=np.random.choice(neighbors,size=1,p=probs)
  return next[0]


In [20]:
next_node(None,0,1,1)

9

In [24]:
def random_walk(start,length,p,q):
  walk=[start]
  for i in range(length):
    current=walk[-1]
    # print("current",current)
    previous=walk[-2] if len(walk) > 1 else None
    # print("previous",previous)
    next =next_node(previous,current,p,q)
    # print("next",next)
    walk.append(next)
    # print(walk)

  return [str(x) for x in walk]

In [26]:
random_walk(0,8,p=1,q=1)

['0', '4', '0', '4', '6', '4', '9', '4', '5']

In [29]:
!pip install gensim




In [30]:
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [31]:
G=nx.karate_club_graph()

In [32]:
labels=[]
for node in G.nodes:
  label=G.nodes[node]['club']
  labels.append(1 if label=='Officer' else 0)



In [34]:
labels,len(labels)

([0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 34)

In [35]:
walks =[]
for node in G.nodes:
  for _ in range(80):
    walks.append(random_walk(node,10,3,2))

In [36]:
node2vec = Word2Vec(walks,hs=1,sg=1,vector_size=100,window=10,workers=2,min_count=1,seed=0)



In [38]:
node2vec.train(walks,total_examples=node2vec.corpus_count,epochs=30,report_delay=1)



(186224, 897600)

In [39]:
train_mask = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]
train_mask_str = [str(x) for x in train_mask]
test_mask = [0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
23, 25, 26, 27, 28, 29, 30, 31, 32, 33]
test_mask_str = [str(x) for x in test_mask]
labels = np.array(labels)

In [40]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [42]:
#. The Random Forest classifier is trained on the training data:

In [41]:
clf = RandomForestClassifier(random_state=0)
clf.fit(node2vec.wv[train_mask_str], labels[train_mask])

In [43]:
# we evaluate the accuracy

y_pred = clf.predict(node2vec.wv[test_mask_str])
acc = accuracy_score(y_pred, labels[test_mask])
print(f'Node2Vec accuracy = {acc*100:.2f}%')

Node2Vec accuracy = 90.91%


## Let's build a movie Recommendation

One of the most popular applications of GNNs is RecSys. If you think about the foundation of Word2Vec
(and, thus, DeepWalk and Node2Vec), the goal is to produce vectors with the ability to measure their
similarity. Encode movies instead of words, and you can suddenly ask for movies that are the most
similar to a given input title. It sounds a lot like a RecSys, right?
But how to encode movies? We want to create (biased) random walks of movies, but this requires a
graph dataset where similar movies are connected to each other. This is not easy to find.
Another approach is to look at user ratings. There are different techniques to build a graph based on
ratings: bipartite graphs, edges based on pointwise mutual information, and so on. In this section, we’ll
implement a simple and intuitive approach: movies that are liked by the same users are connected.
We’ll then use this graph to learn movie embeddings using Node2Vec:



In [45]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

url='https://files.grouplens.org/datasets/movielens/ml-100k.zip'

with urlopen(url) as zurl:
 with ZipFile(BytesIO(zurl.read())) as zfile:
  zfile.extractall('.')

In [46]:
import pandas as pd
ratings = pd.read_csv('ml-100k/u.data', sep='\t',names=['user_id', 'movie_id', 'rating', 'unix_timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [47]:
movies = pd.read_csv('ml-100k/u.item', sep='|',
usecols=range(2), names=['movie_id', 'title'],
encoding='latin-1')

In [48]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [49]:
ratings = ratings[ratings.rating >= 4]
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


In [67]:
from collections import defaultdict
pairs = defaultdict(int)

In [68]:
for group in ratings.groupby("user_id"):
  user_movies = list(group[1]["movie_id"])
  for i in range(len(user_movies)):
    for j in range(i+1, len(user_movies)):
      pairs[(user_movies[i], user_movies[j])]+=1

In [69]:
pairs

defaultdict(int,
            {(61, 33): 4,
             (61, 160): 6,
             (61, 20): 4,
             (61, 202): 4,
             (61, 171): 6,
             (61, 265): 8,
             (61, 47): 5,
             (61, 222): 2,
             (61, 253): 1,
             (61, 113): 3,
             (61, 227): 3,
             (61, 90): 2,
             (61, 64): 10,
             (61, 228): 8,
             (61, 121): 4,
             (61, 114): 2,
             (61, 132): 9,
             (61, 134): 10,
             (61, 98): 13,
             (61, 186): 9,
             (61, 221): 5,
             (61, 84): 1,
             (61, 60): 18,
             (61, 177): 7,
             (61, 174): 12,
             (61, 82): 5,
             (61, 56): 12,
             (61, 80): 1,
             (61, 229): 2,
             (61, 235): 2,
             (61, 6): 2,
             (61, 206): 2,
             (61, 76): 1,
             (61, 72): 2,
             (61, 185): 11,
             (61, 96): 8,
             (61, 25

In [71]:
G=nx.Graph()
for pair in pairs:
  movie1,movie2=pair
  score=pairs[pair]
  if score >= 20:
    print(pair)
    G.add_edge(movie1, movie2, weight=score)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(83, 151)
(83, 318)
(83, 735)
(385, 282)
(54, 64)
(54, 181)
(58, 237)
(58, 203)
(58, 151)
(58, 181)
(125, 202)
(125, 471)
(125, 237)
(125, 64)
(125, 294)
(125, 151)
(125, 71)
(125, 174)
(161, 64)
(161, 318)
(161, 735)
(202, 237)
(202, 402)
(202, 419)
(202, 294)
(202, 318)
(202, 71)
(202, 282)
(202, 735)
(471, 294)
(471, 151)
(237, 203)
(237, 64)
(237, 294)
(237, 181)
(237, 282)
(237, 735)
(237, 411)
(501, 181)
(402, 66)
(402, 64)
(402, 181)
(419, 203)
(419, 135)
(419, 151)
(419, 318)
(419, 181)
(419, 136)
(66, 64)
(66, 318)
(66, 181)
(66, 174)
(203, 151)
(203, 282)
(135, 151)
(135, 735)
(64, 136)
(239, 318)
(523, 151)
(294, 151)
(294, 132)
(294, 181)
(151, 735)
(318, 136)
(50, 969)
(50, 73)
(50, 2)
(50, 136)
(50, 735)
(71, 181)
(969, 174)
(132, 136)
(132, 735)
(181, 73)
(181, 2)
(181, 136)
(181, 735)
(181, 591)
(174, 73)
(174, 2)
(174, 735)
(137, 151)
(137, 318)
(137, 210)
(137, 98)
(137, 86)
(137, 248)
(203, 328)
(203, 1

In [72]:
G.number_of_nodes()

410

In [73]:
G.number_of_edges()

14936

In [74]:
!pip install node2vec
from node2vec import Node2Vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    Uninstalling networkx-3.2.1:
      Successfully uninstalled networkx-3.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed networkx-2.8.8 node2vec-0.4.6


In [75]:
node2vec = Node2Vec(G, dimensions=64, walk_length=20,
num_walks=200, p=2, q=1, workers=1)

Computing transition probabilities:   0%|          | 0/410 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:58<00:00,  3.41it/s]


In [76]:
model = node2vec.fit(window=10, min_count=1,
batch_words=4)

In [77]:
def recommend(movie):
    movie_id = str(movies[movies.title == movie].movie_id.values[0])
    for id in model.wv.most_similar(movie_id)[:5]:
        title = movies[movies.movie_id == int(id[0])].title.values[0]
        print(f'{title}: {id[1]:.2f}')

recommend('Star Wars (1977)')

Return of the Jedi (1983): 0.61
Raiders of the Lost Ark (1981): 0.55
Godfather, The (1972): 0.49
Indiana Jones and the Last Crusade (1989): 0.46
White Squall (1996): 0.44
