In [1]:
from collections import defaultdict
from itertools import combinations
from pathlib import Path
from gensim.models import Word2Vec
import networkx as nx
import pandas as pd
import numpy as np
from matplotlib import pyplot

In [2]:
# Load the article table. The positional index is copied into a "node_id"
# column, and the "; "-delimited "lists" string is split into a Python list.
articles = pd.read_csv(Path("data") / "articles.csv").assign(
    node_id=lambda frame: frame.index,
    lists=lambda frame: frame["lists"].str.split("; "),
)

# Development aid (keep disabled for the final run): work on a small,
# reproducible sample instead of the full table.
# articles = articles.sample(n=10, random_state=42)

# The test and train tables are read unmodified.
test_data = pd.read_csv(Path("data") / "test_data.csv")
train_data = pd.read_csv(Path("data") / "train_data.csv")

In [3]:
# Undirected graph with one node per article, keyed by its node_id.
medium_graph = nx.Graph()
medium_graph.add_nodes_from(articles["node_id"].to_list())

# Invert the article -> lists relation into: list name -> set of member node ids.
# The two columns are zipped directly instead of using DataFrame.iterrows(),
# which materialises a Series for every row and coerces it to a common dtype.
list_to_nodes = defaultdict(set)
for node_id, list_names in zip(articles["node_id"], articles["lists"]):
    # A missing "lists" cell is still NaN (a float) after .str.split(); such
    # an article belongs to no list, so skip it instead of iterating a float.
    if not isinstance(list_names, list):
        continue
    for list_name in list_names:
        list_to_nodes[list_name].add(node_id)

# Articles that appear in the same list are pairwise connected (one clique
# per list); nx.Graph ignores duplicate edges from overlapping lists.
for node_ids in list_to_nodes.values():
    medium_graph.add_edges_from(combinations(node_ids, 2))

Next, we generate random walks over the graph; these walks are the training corpus from which the node embeddings are learned.

In [13]:
# Uniform random walks over a graph, returned as lists of node-id strings.
def random_walks(
    graph: "nx.Graph",
    num_walks: int,
    walk_length: int,
    seed: "int | None" = None,
) -> "list[list[str]]":
    """Generate uniform random walks starting from every node of ``graph``.

    For each node (in ``graph.nodes()`` order) ``num_walks`` walks are
    produced. Every step moves to a neighbour of the current node chosen
    uniformly at random. If the current node has no neighbours, the walk's
    start node is appended instead, so an isolated node yields a walk that
    repeats itself.

    Args:
        graph: Graph exposing ``nodes()`` and ``neighbors(node)``.
        num_walks: Number of walks to start from each node.
        walk_length: Number of nodes per walk. Every walk contains at least
            its start node, so values below 1 behave like 1.
        seed: Optional seed for a private ``numpy.random.RandomState``.
            With ``None`` (the default) the draws come from NumPy's global
            generator, so a caller-side ``np.random.seed(...)`` still applies.

    Returns:
        A list with ``num_walks`` walks per node. Each walk is a list of
        ``str(node)`` values, all of the same length.
    """
    # Both objects expose the same randint(high) call.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Build each neighbour list once instead of on every single step.
    adjacency = {node: list(graph.neighbors(node)) for node in graph.nodes()}

    result = []
    for node in graph.nodes():
        for _ in range(num_walks):
            walk = [node]
            for _ in range(walk_length - 1):
                candidates = adjacency[walk[-1]]
                if candidates:
                    # Uniform choice among the current node's neighbours.
                    walk.append(candidates[rng.randint(len(candidates))])
                else:
                    # Dead end: fall back to the start node so that all
                    # walks keep the same length.
                    walk.append(node)
            # Node ids are stringified so the walks can be used as tokens.
            result.append([str(step) for step in walk])

    return result


In [15]:
print(medium_graph)

# random_walks draws from NumPy's global generator; seeding it here makes a
# fresh "Restart & Run All" produce the same walks every time.
np.random.seed(42)

# Number of nodes in each walk.
depth = 100
walks = random_walks(medium_graph, num_walks=5, walk_length=depth)



Graph with 27718 nodes and 2014162 edges


Now we train a Word2Vec model on these walks to learn an embedding vector for each node.

In [17]:
# Train Word2Vec on the walks: each walk is one sentence and each node-id
# string is one word, so the learned word vectors are the node embeddings.
model = Word2Vec(
    sentences=walks,
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=128,  # dimensionality of each embedding vector
    window=5,         # context window size
    min_count=0,      # no frequency cut-off: every token that occurs is kept
    epochs=10,        # number of passes over the corpus
    workers=4,        # number of worker threads
)

# Persist the trained model to the working directory.
model.save("word2vec_model.model")