In [1]:
import networkx as nx
import numpy as np
import random
import scipy.stats
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

### Convert gml to edgelist.csv. Optional.

In [2]:
def gml_to_edgelist(f):
    g = nx.read_gml(f)
    nx.write_edgelist(g, 'power.csv', delimiter=',')


In [3]:
def load_data(path):
    g = nx.read_edgelist(path, delimiter=',')
    return g

def generate_non_edge_list(g):
    n = len(g.edges()) # Negative edge list size is same as positive list size
    non_edges = []
    for u in g.nodes():
        for v in g.nodes():
            if u == v: continue
            if g.has_edge(u, v): continue
            non_edges.append((u, v))
    neg_sample = random.sample(non_edges, n)
    return neg_sample

def generate_class_labels(g, edges):
    y = []
    for edge in edges:
        if g.has_edge(edge[0], edge[1]):
            y.append(1)
        else:
            y.append(0)
    return y

### Implement features

In [4]:
def adamic_adar(g, X):
    preds = nx.adamic_adar_index(g, X)
    lst = []
    for u, v, p in preds:
        lst.append(p)

    max_p = max(lst)
    return [x/max_p for x in lst]

def jaccard(g, X):
    preds = nx.jaccard_coefficient(g, X)
    lst = []
    for u, v, p in preds:
        lst.append(p)

    max_p = max(lst)
    return [x/max_p for x in lst]

def common_neighbors(g, X):
    lst = []
    for x in X:
        cn = nx.common_neighbors(g, x[0], x[1])
        lst.append(len(list(cn)))
    max_p = float(max(lst))

    return [x/max_p for x in lst]

In [5]:
def concat_features(X, features):
    lst = [[] for x in range(len(X))]
    for feature in features:
        for i in range(len(X)):
            feature_value = feature[i]
            lst[i].append(feature_value)
    return lst

### Load data

In [6]:
g = load_data('karate.csv')

### List of positive edges

In [7]:
EDGES_POSITIVE = list(g.edges())

### List of negative edges

In [8]:
EDGES_NEGATIVE = generate_non_edge_list(g)

In [9]:
EDGES = EDGES_POSITIVE + EDGES_NEGATIVE
random.shuffle(EDGES)

Y = generate_class_labels(g, EDGES)

feature1 = adamic_adar(g, EDGES)
feature2 = jaccard(g, EDGES)
feature3 = common_neighbors(g, EDGES)

features = [feature1, feature2, feature3]
feature_values = concat_features(EDGES, features)

In [10]:
print ("Total nodes", len(g.nodes()))
print ("Total edges", len(g.edges()))

Total nodes 34
Total edges 78


## Training

### Multilayer perceptron

In [11]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1)


In [12]:
random_search = RandomizedSearchCV(clf, param_distributions={
    'learning_rate': ["constant", "invscaling", "adaptive"],
    'hidden_layer_sizes': scipy.stats.randint(4, 12),
    'solver': ['lbfgs', 'sgd', 'adam'],
    'activation': ["relu", "logistic", "tanh"]})

random_search.fit(feature_values, Y)



RandomizedSearchCV(cv=None, error_score='raise',
          estimator=MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'learning_rate': ['constant', 'invscaling', 'adaptive'], 'hidden_layer_sizes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde7fb61400>, 'solver': ['lbfgs', 'sgd', 'adam'], 'activation': ['relu', 'logistic', 'tanh']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [13]:
### Validation

In [14]:
pred = cross_val_predict(random_search, feature_values, Y, cv=6)





In [15]:
print ("Accuracy:", accuracy_score(Y, pred))
precision, recall, fscore, support = precision_recall_fscore_support(Y, pred, average='binary')
print ("Precision:", precision)
print ("Recall:", recall)
print ("f-score:", fscore)

Accuracy: 0.692307692308
Precision: 0.734375
Recall: 0.602564102564
f-score: 0.661971830986


### SVM

In [16]:
clf = svm.SVC(kernel='rbf', random_state=0, gamma=1, C=1)
random_search = RandomizedSearchCV(clf, param_distributions={
    'kernel': ["rbf"],
    'gamma': np.logspace(-9, 3, 13),
    'C': np.logspace(-2, 10, 13)})
random_search.fit(feature_values, Y)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'kernel': ['rbf'], 'gamma': array([  1.00000e-09,   1.00000e-08,   1.00000e-07,   1.00000e-06,
         1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03]), 'C': array([  1.00000e-02,...0000e+05,
         1.00000e+06,   1.00000e+07,   1.00000e+08,   1.00000e+09,
         1.00000e+10])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [17]:
pred = cross_val_predict(random_search, feature_values, Y, cv=6)

In [18]:
print ("Accuracy:", accuracy_score(Y, pred))
precision, recall, fscore, support = precision_recall_fscore_support(Y, pred, average='binary')
print ("Precision:", precision)
print ("Recall:", recall)
print ("f-score:", fscore)

Accuracy: 0.769230769231
Precision: 0.783783783784
Recall: 0.74358974359
f-score: 0.763157894737


### Random Forest

In [19]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)


In [20]:
param_dist = {"max_depth": [3, None],
              "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
print (param_dist)
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(feature_values, Y)

{'max_depth': [3, None], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'max_depth': [3, None], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [21]:
pred = cross_val_predict(random_search, feature_values, Y, cv=6)

In [22]:
print ("Accuracy:", accuracy_score(Y, pred))
precision, recall, fscore, support = precision_recall_fscore_support(Y, pred, average='binary')
print ("Precision:", precision)
print ("Recall:", recall)
print ("f-score:", fscore)

Accuracy: 0.730769230769
Precision: 0.757142857143
Recall: 0.679487179487
f-score: 0.716216216216
