In [6]:
import json
import networkx as nx
import os

In [None]:
# Combined dependency graph; the loop at the end of this cell merges each
# per-file subgraph into it.
graph = nx.Graph()

def build_graph_from_json_file(filepath):
    """Build a star-shaped graph for a single release file.

    The package name is the file's base name up to the first '.'. One edge
    is added from that package to the "package" value of every entry in the
    JSON document's "dependencies" list.

    Args:
        filepath: Path to a JSON file whose top-level object may contain a
            "dependencies" list of objects, each with a "package" key.

    Returns:
        An ``nx.Graph`` holding the package node and one edge per
        dependency. A file with no (or a null) "dependencies" entry yields
        a graph with just the package node.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a dependency entry has no "package" key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding when decoding it.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Named differently from the notebook-level `graph` to avoid shadowing it.
    package_graph = nx.Graph()
    package_name = os.path.basename(filepath).split('.')[0]
    # Added explicitly so packages without dependencies still get a node.
    package_graph.add_node(package_name)

    # A missing or null "dependencies" entry is treated as "no dependencies".
    for dependency in data.get("dependencies") or []:
        # add_edge creates the dependency node if it does not exist yet.
        package_graph.add_edge(package_name, dependency["package"])

    return package_graph

# Merge every release file into the shared `graph`.
# - The listing is sorted so node insertion order does not depend on the
#   filesystem's directory order.
# - Nodes and edges are added in place: nx.compose() returns a brand-new graph
#   on every call, so composing inside the loop re-copies the whole
#   accumulated graph once per file.
RELEASES_DIR = '../doku/releases'
for json_file in sorted(os.listdir(RELEASES_DIR)):
    file_path = os.path.join(RELEASES_DIR, json_file)
    subgraph = build_graph_from_json_file(file_path)
    graph.add_nodes_from(subgraph.nodes(data=True))
    graph.add_edges_from(subgraph.edges(data=True))

In [57]:
# Print the graph's one-line summary (node and edge counts).
print(graph)

Graph with 14577 nodes and 24936 edges


In [14]:
from node2vec import Node2Vec

In [60]:
# Set up node2vec on the dependency graph with 64-dimensional embeddings.
# Per the progress output below, this step computes transition probabilities
# and generates the random walks (about 7.5 minutes per worker in this run).
# NOTE(review): no random seed is passed, so the walks — and the embeddings and
# similarities shown further down — presumably differ between runs; confirm
# whether reproducibility is required.
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)

Computing transition probabilities:   0%|          | 0/14577 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [07:29<00:00,  8.99s/it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [07:29<00:00,  8.98s/it]
Generating walks (CPU: 3): 100%|██████████| 50/50 [07:27<00:00,  8.96s/it]
Generating walks (CPU: 4): 100%|██████████| 50/50 [07:29<00:00,  8.98s/it]


In [61]:
# Fit the embedding model; the result exposes the learned vectors via `model.wv`.
# NOTE(review): window/min_count/batch_words are presumably forwarded to
# gensim's Word2Vec — confirm against the node2vec documentation.
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [62]:
# Inspect the learned embedding for the 'phoenix' node; the output below is a
# 64-element float32 vector, matching dimensions=64.
model.wv['phoenix']

array([ 0.03047263, -0.20463394, -0.0277075 , -0.03843872, -0.01339973,
       -0.2515752 ,  0.24339251, -0.1508195 , -0.5072062 ,  0.56847715,
        0.21083727,  0.18546042, -0.17352657, -0.42935583,  0.1894547 ,
        0.05550739, -0.30858368,  0.02045539, -0.37210184,  0.04115997,
        0.45696118,  0.07119154, -0.12255816, -0.2584199 , -0.02092515,
        0.09527466, -0.47741055, -0.7302886 ,  0.40823767, -0.24472226,
       -0.19681983, -0.03465886,  0.2884595 , -0.3982887 , -0.27729315,
       -0.11579079,  0.20039761,  0.3759723 , -0.30424842, -0.05449288,
        0.0329129 ,  0.42536002, -0.12495282, -0.11004556,  0.14884827,
        0.09128992,  0.09284161, -0.20403126,  0.28083038,  0.33326608,
        0.18805982, -0.15879045,  0.28900728, -0.11530187,  0.21019106,
        0.73954374,  0.19320759,  0.06515587, -0.06764857,  0.27834067,
       -0.2270348 , -0.16133091, -0.41235122, -0.13329707], dtype=float32)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
# Cosine similarity between the 'jason' and 'benchee' embeddings. Each vector
# is wrapped in a list to form a one-row 2-D input, so the result is a 1x1 array.
cosine_similarity([model.wv['jason']], [model.wv['benchee']])

array([[0.28636217]], dtype=float32)

In [64]:
# Cosine similarity between the 'phoenix' and 'phoenix_ecto' embeddings
# (one-row 2-D inputs, 1x1 array result).
cosine_similarity([model.wv['phoenix']], [model.wv['phoenix_ecto']])

array([[0.6414313]], dtype=float32)

In [83]:
# Cosine similarity between the 'tailwind' and 'phoenix_live_view' embeddings
# (one-row 2-D inputs, 1x1 array result).
cosine_similarity([model.wv['tailwind']], [model.wv['phoenix_live_view']])

array([[0.62679625]], dtype=float32)

In [50]:
# Count the distinct package names (one per release file), the distinct
# dependency names referenced by those files, and the size of their union.
RELEASES_DIR = '../doku/releases'
all_packages = set()
all_dependencies = set()

for json_file in os.listdir(RELEASES_DIR):
    file_path = os.path.join(RELEASES_DIR, json_file)
    # Package name = file name up to the first '.'.
    package_name = os.path.basename(file_path).split('.')[0]
    all_packages.add(package_name)

    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding when decoding it.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # A missing or null "dependencies" entry is treated as "no dependencies".
    all_dependencies.update(
        dependency["package"] for dependency in data.get("dependencies") or []
    )

all_unique_names = all_packages | all_dependencies

print(f"Total unique package names: {len(all_packages)}")
print(f"Total unique dependencies: {len(all_dependencies)}")
print(f"Total unique packages + dependencies: {len(all_unique_names)}")

Total unique package names: 14573
Total unique dependencies: 3000
Total unique packages + dependencies: 14577
