In [1]:
import networkx as nx
import pandas as pd
import random
from tqdm import tqdm

In [2]:
# Load the graph from the local file data/ca-CondMat.txt with networkx's
# adjacency-list reader.
# NOTE(review): read_adjlist keeps node labels as strings unless `nodetype` is
# passed — confirm that is the intended label type.
G = nx.read_adjlist("data/ca-CondMat.txt")

In [31]:
# Pick the highest-degree node of G as the hub. G.degree yields
# (node, degree) tuples; on ties the first one encountered wins.
hub_entry = max(G.degree, key=lambda entry: entry[1])
node = hub_entry[0]

In [39]:
# Every node within two hops of the hub `node` (the hub itself included,
# at distance 0). The call returns a {node: distance} mapping; only the
# keys are kept.
hub_distances = nx.single_source_shortest_path_length(G, node, cutoff=2)
nodes = list(hub_distances)

In [40]:
# Restrict G to the nodes collected in `nodes`; G0 is the graph used for the
# rest of the experiment.
G0 = G.subgraph(nodes)

In [41]:
# Show the size of G0 as a (number of nodes, number of edges) tuple.
G0.number_of_nodes(), G0.number_of_edges()

(3403, 19054)

In [42]:
# Hold out 30% of G0's edges. They are removed from G0 further down and later
# act as the positive class when candidate pairs are labelled.
#
# random.sample requires a sequence: an EdgeView is set-like, which is
# deprecated on Python 3.9/3.10 and raises TypeError on Python >= 3.11, so the
# edges are materialised into a list first. The RNG is seeded so the held-out
# set (and everything derived from it) is the same on every run.
random.seed(42)
data = random.sample(list(G0.edges), round(G0.number_of_edges() * 0.3))

In [43]:
# Rebuild G0 as a standalone nx.Graph. The following cell removes edges from
# it — presumably this copy is needed because G.subgraph() returns a read-only
# view (TODO confirm against the networkx docs).
G0 = nx.Graph(G0)

In [44]:
# Delete the sampled edges in `data` from G0 (mutates G0 in place). All
# features computed afterwards therefore come from the graph without them.
G0.remove_edges_from(data)

In [46]:
# Collect every unordered pair of nodes that are exactly two hops apart in G0.
# These are the candidate links to classify.
#
# `pairs` keeps the discovery order; `seen_pairs` mirrors it as a set so the
# "already recorded in the other orientation?" check is O(1) instead of a
# linear scan over an ever-growing list.
# The loop variable is deliberately not called `node`, so the hub selected
# earlier in the notebook is not overwritten.
pairs = []
seen_pairs = set()
for source in tqdm(G0.nodes):
    # {reachable node: hop distance} for everything within 2 hops of `source`.
    kneighbors = nx.single_source_shortest_path_length(G0, source, cutoff=2)
    for target, distance in kneighbors.items():
        if distance == 2 and (target, source) not in seen_pairs:
            pairs.append((source, target))
            seen_pairs.add((source, target))

100%|██████████| 3403/3403 [12:14<00:00,  4.63it/s]


In [47]:
# Per-pair features, all measured on G0:
#   degree_i / degree_j  - degree of the first / second node of the pair
#   common_neighbors     - number of neighbours the two nodes share
degree_i = [G0.degree(first) for first, _ in pairs]
degree_j = [G0.degree(second) for _, second in pairs]
common_neighbors = [
    sum(1 for _ in nx.common_neighbors(G0, first, second))
    for first, second in pairs
]

# One row per candidate pair, indexed by the (i, j) tuple itself.
features = pd.DataFrame(
    {
        'degree_i': degree_i,
        'degree_j': degree_j,
        'common_neighbors': common_neighbors,
    },
    columns=['degree_i', 'degree_j', 'common_neighbors'],
    index=pairs,
)

In [48]:
# Display the feature table (one row per candidate pair, indexed by the pair).
features

Unnamed: 0,degree_i,degree_j,common_neighbors
"(107793, 35010)",23,32,3
"(107793, 91051)",23,20,1
"(107793, 74122)",23,16,1
"(107793, 35142)",23,12,1
"(107793, 74551)",23,9,1
...,...,...,...
"(87782, 34676)",7,8,1
"(89688, 92533)",20,15,1
"(47954, 42154)",7,5,1
"(92533, 56124)",15,10,1


In [49]:
# Label each candidate pair: 1 if its edge is one of the held-out edges in
# `data`, otherwise 0.
#
# Edges are undirected, so each held-out edge is stored as a frozenset. That
# matches a pair in either orientation and makes every lookup O(1) instead of
# two linear scans of the `data` list per row. The index is iterated directly
# because the row values are not needed.
removed_edges = {frozenset(edge) for edge in data}
label = [1 if frozenset(pair) in removed_edges else 0 for pair in features.index]
features['label'] = label

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [51]:
# Split the candidate pairs 70/30 into train and test sets.
# random_state fixes the shuffle so the reported metrics are reproducible;
# stratify keeps the proportion of positive pairs the same in both splits,
# which matters because positives are a small minority of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    features[['degree_i', 'degree_j', 'common_neighbors']].values,
    features['label'],
    test_size=0.3,
    random_state=42,
    stratify=features['label'],
)

In [52]:
# Fit a logistic-regression classifier, constructed with no arguments
# (library defaults), on the training split.
clf = LogisticRegression().fit(X_train, y_train)

In [53]:
# Predict a label for every held-out test row.
y_pred = clf.predict(X_test)

In [54]:
# Print the classification report comparing the true test labels with the
# predictions. print() is used because classification_report returns text.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     48964
           1       0.71      0.28      0.40      1479

    accuracy                           0.98     50443
   macro avg       0.84      0.64      0.69     50443
weighted avg       0.97      0.98      0.97     50443

