In [1]:
import networkx as nx
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


In [2]:
G = nx.read_gml("../results/alzheimers_network.gml")


In [3]:
edges = list(G.edges())


In [4]:
non_edges = list(nx.non_edges(G))

# Sample same number as edges (balance)
np.random.seed(42)
non_edges = np.random.choice(
    len(non_edges),
    size=len(edges),
    replace=False
)

neg_edges = [list(nx.non_edges(G))[i] for i in non_edges]


In [5]:
def compute_features(G, edge_list):
    features = []

    for u, v in edge_list:
        cn = len(list(nx.common_neighbors(G, u, v)))
        jc = next(nx.jaccard_coefficient(G, [(u, v)]))[2]
        pa = next(nx.preferential_attachment(G, [(u, v)]))[2]

        features.append([cn, jc, pa])

    return np.array(features)


In [6]:
X_pos = compute_features(G, edges)
X_neg = compute_features(G, neg_edges)

X = np.vstack([X_pos, X_neg])
y = np.array([1]*len(X_pos) + [0]*len(X_neg))


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [8]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred = lr.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred)


0.9906880882830249

In [9]:
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict_proba(X_test)[:,1]

roc_auc_score(y_test, y_pred_rf)


0.9882278481012658

In [10]:
candidate_edges = neg_edges[:1000]   # subset for speed
X_candidates = compute_features(G, candidate_edges)

scores = rf.predict_proba(X_candidates)[:,1]

predicted_links = pd.DataFrame({
    "gene1": [e[0] for e in candidate_edges],
    "gene2": [e[1] for e in candidate_edges],
    "score": scores
}).sort_values("score", ascending=False)

predicted_links.head(10)


Unnamed: 0,gene1,gene2,score
821,10023809050,10025909172,1.0
505,10023812264,10023833115,1.0
976,10023823479,10023814510,1.0
48,10023820660,10025909172,1.0
362,10023828170,10023816224,0.975
908,10025929556,10023812264,0.965
880,10025910121,10023815266,0.950833
464,10023827916,10025907055,0.93
458,10023824376,10025909172,0.925
662,10025906620,10023810409,0.910417


In [11]:
predicted_links.to_csv("../results/predicted_gene_interactions.csv", index=False)


KeyError: 'gene_1'