In [44]:
import networkx as nx
import random
random.seed(0)
import numpy as np
np.random.seed(0)

G = nx.erdos_renyi_graph(10, 0.3, seed=1, directed=False)

In [8]:
def next_node(previous, current, p, q):
    neighbors = list(G.neighbors(current))
    alphas = []
    for neighbor in neighbors:
        if neighbor == previous:  # previous = i node, depth first search
            alpha = 1/p
        elif G.has_edge(neighbor, previous):  #
            alpha = 1
        else:                     #
            alpha = 1/q
        alphas.append(alpha)
    probs = [alpha / sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors, size=1, p=probs)[0]
    return next

def random_walk(start, length, p, q):
    walk = [start]
    for i in range(length):
        current = walk[-1]
        previous = walk[-2] if len(walk) > 1 else None
        next = next_node(previous, current, p, q)
        walk.append(next)
    return [str(x) for x in walk]

In [7]:
random_walk(0, 8, p=1, q=1)

['0', '1', '6', '1', '9', '1', '2', '1', '9']

In [4]:
random_walk(0, 8, p=1, q=10)

['0', '9', '1', '9', '1', '9', '1', '0', '1']

In [5]:
random_walk(0, 8, p=10, q=1)

['0', '1', '9', '4', '7', '8', '7', '4', '6']

In [9]:
#패키지 설치
!pip install fastapi
!pip install kaleido
!pip install python-multipart
!pip install uvicorn



In [10]:
!pip install gensim==3.8.0

Collecting gensim==3.8.0
  Downloading gensim-3.8.0.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25l[?25hdone
  Created wheel for gensim: filename=gensim-3.8.0-cp310-cp310-linux_x86_64.whl size=24680956 sha256=8c28aadbb54b4d59c3c2781fa5ed7e5befaee6902fa6470b48b63622fc33824e
  Stored in directory: /root/.cache/pip/wheels/01/ab/bf/07d4e3462e9f8d17d98c2fb5e870d7e9934c8482a32fc3ef41
Successfully built gensim
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.2
    Uninstalling gensim-4.3.2:
      Successfully uninstalled gensim-4.3.2
Successfully installed gensim-3.8.0


In [11]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.0
    Uninstalling gensim-3.8.0:
      Successfully uninstalled gensim-3.8.0
Successfully installed gensim-4.3.2


In [49]:
!pip install gensim



In [12]:
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [13]:
import networkx as nx
G = nx.karate_club_graph()

In [14]:
labels = []
for node in G.nodes:
 label = G.nodes[node]['club']
 labels.append(1 if label == 'Officer' else 0)

In [15]:
walks = []
for node in G.nodes:
 for _ in range(80):
  walks.append(random_walk(node, 10, 3, 2)) #

In [51]:
walks

[['0', '8', '30', '1', '17', '0', '11', '0', '13', '2', '3'],
 ['0', '13', '0', '11', '0', '4', '0', '6', '4', '6', '4'],
 ['0', '31', '0', '4', '6', '5', '0', '8', '2', '0', '2'],
 ['0', '12', '0', '3', '2', '27', '2', '28', '2', '32', '20'],
 ['0', '31', '28', '33', '8', '2', '0', '5', '6', '0', '6'],
 ['0', '2', '13', '2', '1', '7', '0', '10', '5', '0', '12'],
 ['0', '3', '12', '0', '3', '7', '0', '17', '0', '12', '3'],
 ['0', '13', '33', '15', '32', '23', '29', '26', '33', '20', '33'],
 ['0', '13', '1', '19', '0', '21', '1', '21', '0', '13', '2'],
 ['0', '31', '28', '31', '32', '2', '7', '2', '1', '13', '2'],
 ['0', '3', '2', '9', '33', '26', '29', '33', '23', '32', '20'],
 ['0', '7', '3', '12', '0', '2', '28', '33', '22', '32', '33'],
 ['0', '3', '7', '0', '19', '33', '26', '29', '23', '32', '22'],
 ['0', '13', '3', '13', '3', '0', '6', '5', '0', '10', '0'],
 ['0', '4', '0', '17', '0', '5', '16', '6', '0', '3', '7'],
 ['0', '11', '0', '21', '1', '13', '2', '13', '1', '3', '1'],
 [

In [16]:
node2vec = Word2Vec(walks,
                    hs=1, # Hierarchical softmax
                    sg=1, # Skip-gram
                    vector_size=100,
                    window=10,
                    workers=2,
                    min_count=1,
                    seed=0)



In [17]:
node2vec.train(walks, total_examples=node2vec.corpus_count, epochs=30, report_delay=1)



(186238, 897600)

In [18]:
train_mask = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]
train_mask_str = [str(x) for x in train_mask]
test_mask = [0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
23, 25, 26, 27, 28, 29, 30, 31, 32, 33]
test_mask_str = [str(x) for x in test_mask]
labels = np.array(labels)

In [19]:
clf = RandomForestClassifier(random_state=0)
clf.fit(node2vec.wv[train_mask_str], labels[train_mask])

In [20]:
y_pred = clf.predict(node2vec.wv[test_mask_str])
acc = accuracy_score(y_pred, labels[test_mask])
print(f'Node2Vec accuracy = {acc*100:.2f}%')

Node2Vec accuracy = 100.00%


In [21]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')

In [22]:
import pandas as pd
ratings = pd.read_csv('/content/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'])
ratings


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [23]:
movies = pd.read_csv('/content/ml-100k/u.item', sep='|', usecols=range(2), names=['movie_id', 'title'], encoding='latin-1')

In [31]:
ratings = ratings[ratings.rating >= 4]
ratings


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


In [32]:
from collections import defaultdict
pairs = defaultdict(int)


In [33]:
for group in ratings.groupby("user_id"):
  user_movies = list(group[1]["movie_id"])
  for i in range(len(user_movies)):
    for j in range(i+1, len(user_movies)):
        pairs[(user_movies[i], user_movies[j])] += 1


In [34]:
import networkx as nx
G = nx.Graph()
for pair in pairs:
    movie1, movie2 = pair
    score = pairs[pair]

In [35]:
if score >= 20:
  G.add_edge(movie1, movie2, weight=score)

In [36]:
!pip install node2vec
from node2vec import Node2Vec



In [37]:
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)

Computing transition probabilities: 0it [00:00, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:00<00:00, 161599.08it/s]


In [38]:
model = node2vec.fit(window=10, min_count=1, batch_words=4)

RuntimeError: you must first build vocabulary before training the model

In [None]:
def recommend(movie):
    movie_id = str(movies[movies.title == movie].movie_id.values[0])
    for id in model.wv.most_similar(movie_id)[:5]:
      title = movies[movies.movie_id == int(id[0])].title.values[0]
    print(f'{title}: {id[1]:.2f}')
recommend('Star Wars (1977)')