In [20]:
import networkx as nx
import pandas as pd
import random
from tqdm import tqdm

In [35]:
# Load the network from an adjacency-list text file. No nodetype is passed,
# so node labels are kept as strings (see the quoted IDs in the `data` output).
G = nx.read_adjlist("data/ca-CondMat.txt")

In [39]:
# Connected components of G as node sets, ordered from largest to smallest.
Gcc = list(nx.connected_components(G))
Gcc.sort(key=len, reverse=True)

# NOTE(review): index 1 selects the second-largest component, not the giant
# component (Gcc[0]) — confirm the small component is the intended working graph.
G0 = G.subgraph(Gcc[1])

In [42]:
# Hold out 30% of G0's edges; these become the positive examples later on.
#
# random.sample() needs a real sequence: handing it the set-like EdgeView is
# deprecated in Python 3.9 and raises TypeError from 3.11 onwards, so the
# edges are materialised into a list first. Sorting gives that list a stable
# order, and the dedicated seeded generator makes the hold-out set identical
# on every Restart & Run All without touching the global `random` state.
edge_pool = sorted(G0.edges)
data = random.Random(42).sample(edge_pool, round(len(edge_pool) * 0.3))

In [44]:
# Rebuild G0 as a standalone nx.Graph. Up to here G0 is the result of
# G.subgraph(...), which networkx documents as a read-only view of G; the
# copy is what allows edges to be removed from it afterwards.
G0 = nx.Graph(G0)

In [45]:
# Delete the sampled edges from G0 in place; `data` still holds them as the
# held-out edge list.
G0.remove_edges_from(data)

In [36]:
# Count the connected components of the full graph G (not the G0 subgraph).
nx.number_connected_components(G)

567

In [48]:
# Empty placeholder frame with the two endpoint columns and no rows.
# `features` is rebound to the real per-pair feature table in a later cell.
features = pd.DataFrame(data=None, columns=['source', 'target'])

In [61]:
# Collect every unordered pair of nodes that sit exactly two hops apart in G0.
# These are the candidate links the classifier will score.
pairs = []
# Companion set for O(1) "already recorded in the other direction?" checks.
# Scanning the `pairs` list for each candidate was quadratic in the number of
# pairs; the resulting list and its order are unchanged.
seen_pairs = set()
for node in tqdm(G0.nodes):
    # Maps each node within two hops of `node` to its hop distance.
    kneighbors = nx.single_source_shortest_path_length(G0, node, cutoff=2)
    for n, hops in kneighbors.items():
        if hops == 2 and (n, node) not in seen_pairs:
            pairs.append((node, n))
            seen_pairs.add((node, n))

100%|██████████| 21/21 [00:00<00:00, 10512.04it/s]


In [62]:
# Per-pair features, all measured on G0 (the graph with held-out edges removed):
# the degree of each endpoint and the number of neighbours the two share.
degree_i = [G0.degree(src) for src, _ in pairs]
degree_j = [G0.degree(dst) for _, dst in pairs]
common_neighbors = [
    len(list(nx.common_neighbors(G0, src, dst))) for src, dst in pairs
]

# One row per candidate pair, indexed by the (node, node) tuple itself.
features = pd.DataFrame(
    data=dict(
        degree_i=degree_i,
        degree_j=degree_j,
        common_neighbors=common_neighbors,
    ),
    index=pairs,
    columns=['degree_i', 'degree_j', 'common_neighbors'],
)

In [63]:
# Show the per-pair feature table (index = node pair, columns = the three features).
features

Unnamed: 0,degree_i,degree_j,common_neighbors
"(80795, 73439)",13,10,8
"(80795, 38381)",13,12,11
"(80795, 104883)",13,14,11
"(80795, 51144)",13,11,9
"(80795, 23192)",13,13,11
...,...,...,...
"(80713, 50833)",14,14,11
"(99884, 23192)",13,13,9
"(99884, 43874)",13,2,1
"(73927, 50833)",12,14,9


In [64]:
# Show the held-out edges that were sampled and removed from G0.
data

[('80795', '73439'),
 ('73439', '38381'),
 ('6356', '80713'),
 ('103064', '73927'),
 ('51144', '103064'),
 ('38381', '77217'),
 ('80795', '104883'),
 ('52775', '77217'),
 ('103584', '103064'),
 ('63926', '103064'),
 ('48646', '77217'),
 ('6356', '52775'),
 ('10711', '73290'),
 ('80795', '51144'),
 ('103584', '63926'),
 ('80713', '50833'),
 ('103584', '73927'),
 ('103584', '10711'),
 ('77217', '80713'),
 ('6356', '103584'),
 ('73439', '50833'),
 ('48646', '103584'),
 ('48646', '103064'),
 ('63926', '104883'),
 ('59094', '80713'),
 ('51144', '50833'),
 ('73439', '59094'),
 ('38381', '52775'),
 ('80795', '38381'),
 ('63926', '51144'),
 ('104883', '23192'),
 ('77217', '103064'),
 ('52775', '73927'),
 ('52775', '103064'),
 ('38381', '51144'),
 ('73439', '103064'),
 ('103584', '77217'),
 ('10711', '23192'),
 ('73439', '99884'),
 ('10711', '73927'),
 ('52775', '99884'),
 ('38381', '23192'),
 ('59094', '99884'),
 ('99884', '23192'),
 ('73439', '10711'),
 ('80795', '23192'),
 ('73927', '50833')

In [65]:
# Label each candidate pair: 1 if it is one of the held-out edges (in either
# orientation), 0 otherwise.
#
# The held-out edges go into a set so each lookup is O(1) instead of a scan of
# the `data` list, and only the index is iterated because the row values were
# never used.
held_out_edges = set(data)
label = [
    1 if (u, v) in held_out_edges or (v, u) in held_out_edges else 0
    for u, v in features.index
]
features['label'] = label

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [70]:
# 70/30 train/test split over the candidate pairs.
# random_state pins the shuffle so the split, and therefore the reported
# metrics, are the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features[['degree_i', 'degree_j', 'common_neighbors']].values,
    features['label'],
    test_size=0.3,
    random_state=42,
)

In [72]:
# Logistic-regression link predictor with default hyper-parameters,
# trained on the training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [76]:
# Predicted labels (1 = held-out edge, 0 = no edge) for the test split.
y_pred = clf.predict(X_test)

In [78]:
# Per-class precision / recall / F1 and support on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00        17

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

