In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import itertools
import csv
#import skopt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from time import time

from node2vec import Node2Vec

# Counting all nodes

In [2]:
nodes = dict()

with open("train-mod.txt") as file :
    end = file.seek(0, 2)
    file.seek(0)
    while file.tell() != end:
        line = file.readline().split()
        edges = list(itertools.combinations(line,2))
        for i in edges:
            if nodes.get(i) == None:
                node1 = i[0]
                node2 = i[1]
                if nodes.get((node2,node1)) == None:
                    nodes[i] = 1
                else:
                    nodes[(node2,node1)] += 1
            else:
                nodes[i] +=1

print(len(nodes))

16087


In [3]:
with open("weighted_graph.csv", "w", newline="") as a_file:

    writer = csv.writer(a_file)
    for key, value in nodes.items():
        writer.writerow([key[0], key[1], value])

    a_file.close()

In [4]:
g = nx.read_weighted_edgelist('weighted_graph.csv', delimiter=',', nodetype=int)

In [5]:
print(nx.info(g))
#print(g.get_edge_data(1655,3650))

#plt.show()

#adj_G = nx.to_numpy_matrix(G)

Name: 
Type: Graph
Number of nodes: 3816
Number of edges: 16087
Average degree:   8.4313


In [83]:
#adj_G.shape

# Saving all graph edges

In [6]:
graph_edges = list(nodes.keys())

In [7]:
with open("edges_graph_all.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    for edge in graph_edges:
        writer.writerow([edge[0], edge[1], 1])

# Positive edges

In [8]:
num_pos_edges = 1600
num_neg_edges = 16087
#edges_pos_all = list(nodes.keys())

In [9]:
initial_node_count = len(g.nodes)

df_graph_all = pd.read_csv("edges_graph_all.csv")
df_pos_temp = pd.read_csv("edges_graph_all.csv")

removable_edges_indices = []

ncc = nx.number_connected_components(g)
number_of_nodes = len(g.nodes)

## Finding positive edges that retain graph structure

In [10]:
# for each node pair we will be removing a node pair and creating a new graph,
# and check if the number of connected components and the number of nodes
# are the same as the original graph
for i in tqdm(df_pos_temp.index.values):
  
      # remove a node pair and build a new graph
   G1 = nx.from_pandas_edgelist(df_pos_temp.drop(index= i), "Source", "Target",
                                create_using=nx.Graph())
  
      # If the number of connected components remain same as the original
      # graph we won't remove the edge
   if (nx.number_connected_components(G1) == ncc) and (len(G1.nodes) == number_of_nodes):
       removable_edges_indices.append(i)
 
       # drop the edge, so that for the next iteration the next G1
       # is created without this edge
       df_pos_temp = df_pos_temp.drop(index = i)

100%|██████████| 16087/16087 [18:35<00:00, 14.43it/s]


In [11]:
rem_pos_edges = df_graph_all.loc[removable_edges_indices]
rem_pos_edges.to_csv("edges_all_rem_pos.csv", index=False)

pos_edges_selected = random.sample(removable_edges_indices, num_pos_edges)

df_pos_valid = df_graph_all.loc[pos_edges_selected]

df_pos_valid.to_csv("edges_pos_valid2k.csv", index=False)

In [None]:
df_pos_valid

# Generating all negative edges (random sampling)

In [12]:
i = 0
num_neg_edges = 16087
edges_neg = []
while i < num_neg_edges:
    edge = random.sample(g.nodes(), 2)
    #print(str(edge[0]))
    #print(str(edge[1]))
    try:
        edge_exists = g.has_edge(edge[0],edge[1])
        #print(str(edge_exists))
        if edge_exists == False:
            #print(str(i))
            edges_neg.append([edge[0],edge[1]])
            i = i+1
    except Exception as e:
        #print(e)
        pass

In [13]:
len(edges_neg)

16087

In [14]:
with open("edges_neg_all.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    for edge in edges_neg:
        writer.writerow([edge[0], edge[1], 0])

In [15]:
df_neg_all = pd.read_csv("edges_neg_all.csv")

In [None]:
df_neg_all.head()

In [17]:
neg_indices = list(np.arange(16087))

In [18]:
neg_edges_selected = random.sample(neg_indices, num_pos_edges)

In [19]:
len(neg_edges_selected)

1600

In [20]:
df_neg_valid = df_neg_all.loc[neg_edges_selected]

df_neg_valid.to_csv("edges_neg_valid2k.csv", index=False)

df_neg_train = df_neg_all.drop(index=df_neg_valid.index.values)

df_neg_train.to_csv("edges_neg_train.csv", index=False)

In [21]:
len(df_neg_valid)

1600

# Creating sub-graph

In [22]:
df_pos_train = df_graph_all.drop(index=df_pos_valid.index.values)

In [23]:
df_pos_train.to_csv("edges_pos_train.csv", index=False)

In [24]:
G_new = nx.from_pandas_edgelist(df_pos_train, "Source", "Target",
                               create_using=nx.Graph())

print(nx.info(G_new))

Name: 
Type: Graph
Number of nodes: 3816
Number of edges: 14487
Average degree:   7.5928


# Train/Test split

In [53]:
train_pos = pd.read_csv('edges_pos_train.csv')
train_neg = pd.read_csv('edges_neg_train.csv')
valid_pos = pd.read_csv('edges_pos_valid2k.csv')
valid_neg = pd.read_csv('edges_neg_valid2k.csv')

In [None]:
data = train_pos.append(train_neg)
#data.reset_index(drop=True)
data = data.append(valid_pos)
data = data.append(valid_neg)
data.reset_index(drop=True)

# Node2Vec

In [27]:
from node2vec import Node2Vec

# Generate walks
node2vec = Node2Vec(G_new, dimensions=30, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 3816/3816 [00:04<00:00, 844.55it/s] 
Generating walks (CPU: 1): 100%|██████████| 50/50 [04:48<00:00,  5.77s/it]


In [28]:
X_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(data['Source'], data['Target'])]

  X_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(data['Source'], data['Target'])]


# Node similarity features

In [102]:
def generate_features(sample_list, test = False):
    features = []
    i = 0
    for sample in sample_list:
        #print(sample)
        source = sample[0]
        target = sample[1]
        if test == False:
            label = sample[2]
        else:
            label = -1
        
        feature = []
        try:
            i = i+1
            #print(i)
            
            #p = nx.common_neighbors(g, source, target)
            #feature.append(len(p))
            
            #p = nx.simrank_similarity(g, source, target)
            #feature.append(p)
            
            #preds = nx.resource_allocation_index(g, [(source, target)])
            #for u, v, p in preds:
            #    feature.append(p)

            preds = nx.jaccard_coefficient(G_new, [(source, target)])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.adamic_adar_index(G_new, [(source, target)])
            for u, v, p in preds:
                feature.append(p)

            #preds = nx.preferential_attachment(g, [(source, target)])
            #for u, v, p in preds:
            #    feature.append(p)
            
            feature.append(label)  # append label
            
        except Exception as e:
            #print(e)
            pass
        features.append(feature)
    print("features: "+str(len(features)))
    return features

In [None]:
data

In [103]:
graph_features = generate_features(data.to_numpy())

features: 32174


In [63]:
len(graph_features)

32174

In [104]:
def write_train_to_csv(features):
    with open("data_graphmetrics.csv","w",newline="") as csvfile:
        writer=csv.writer(csvfile)
        writer.writerow(["JC","AA","Label"])
        writer.writerows(features)
        
write_train_to_csv(graph_features)

In [105]:
data_metrics = pd.read_csv('data_graphmetrics.csv')
FEATURE_SIZE=2

X_feat = data_metrics.iloc[:,:FEATURE_SIZE].values

In [106]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_feat_scaled = sc.fit_transform(X_feat)

In [107]:
X_n2v_arr = np.array(X_n2v)
all_feats = np.concatenate((X_feat_scaled,X_n2v_arr),axis=1)
y = data['Label'].to_list()

In [80]:
len(y)

32174

# Training classifier

In [108]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(all_feats, y, 
                                                test_size = 0.1, 
                                                shuffle = False, stratify = None)

In [109]:
x_train.shape

(28956, 32)

In [85]:
import lightgbm as lgb
import optuna
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
%%time
def objective(trial):
    #train_x, test_x, train_y, test_y = train_test_split(all_feats, data['label'], 
    #                                            test_size = 0.2, 
    #                                            random_state = 35)
    dtrain = lgb.Dataset(x_train, label=y_train)
 
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': 'false',
        'boosting': trial.suggest_categorical('boosting', ['gbdt', 'dart']),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'max_bin': trial.suggest_int('max_bin', 25, 255),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.5)
    }
 
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(x_test)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy
 
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)
 
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
import lightgbm as lgbm

train_data = lgbm.Dataset(x_train, y_train)
test_data = lgbm.Dataset(x_test, y_test)

# define parameters
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'false',
    'boosting': 'dart', 'lambda_l1': 9.195584693502481e-05, 'lambda_l2': 2.9179094500377866e-05, 'num_leaves': 81, 'feature_fraction': 0.4429293323344315, 'bagging_fraction': 0.5910964246323879, 'bagging_freq': 7, 'min_child_samples': 11, 'subsample': 0.19049703012014824, 'max_depth': 15, 'max_bin': 226, 'learning_rate': 0.06381081570044855
    }

# train lightGBM model
model = lgbm.train(parameters,
                   train_data,
                   valid_sets=test_data,
                   num_boost_round=1000,
                   early_stopping_rounds=20)

In [112]:
print('Saving model...')
# save model to file
model.save_model('n2v_metrics_LGBM_optuna_V1_300.txt')

Saving model...


<lightgbm.basic.Booster at 0x1373ef0b940>

# Testing on test data

In [None]:
testCols=['Id', 'source_node', 'destination_node'] 
df_test_public = pd.read_csv('test-public.csv')
ids = df_test_public['Id'].values
df_test_public.columns = testCols
df_test_public = df_test_public.drop('Id', axis = 1)
df_test_public

# Test data Feature generation

In [114]:
x_testing_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(df_test_public['source_node'], df_test_public['destination_node'])]

  x_testing_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(df_test_public['source_node'], df_test_public['destination_node'])]


In [115]:
test_features = generate_features(df_test_public.to_numpy(), test=True)

features: 2000


In [116]:
len(test_features)

2000

In [117]:
def write_test_to_csv(features):
    with open("test.csv","w",newline="") as csvfile:
        writer=csv.writer(csvfile)
        writer.writerow(["JC","AA","Label"])
        writer.writerows(features)

In [118]:
write_test_to_csv(test_features)

In [119]:
dataset_test = pd.read_csv('test.csv')
FEATURE_SIZE=2

X_test_feat = dataset_test.iloc[:,:FEATURE_SIZE].values

In [120]:
X_test_feat_scaled = sc.fit_transform(X_test_feat)

In [121]:
X_test_n2v_arr = np.array(x_testing_n2v)
all_test_feats = np.concatenate((X_test_feat_scaled,X_test_n2v_arr),axis=1)

# Test Prediction

In [122]:
y_pred = model.predict(all_test_feats)

In [123]:
output = pd.DataFrame({'Id': ids, 'Predicted': y_pred})
output.to_csv("n2v_feats_LGBM_opt_300_Nan1.csv", index=False)