In [None]:
import csv
import os
import pandas as pd
from collections import Counter
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.3f' % x)


from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.utils import resample

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB
import joblib
import utilities

import community as community_louvain

# Predicting Year : 2012

### Load Weighted Temporal Dataset

In [None]:
# Column order of the prepared weighted-temporal (WT) frame: node pair first,
# then the features, with the label last so `iloc[:, 2:-1]` selects exactly
# the feature columns.
_WT_COLUMNS = ['e1','e2','edge_wtf','W','CN','AA','HP','HD','JC','LHN','RA','SA','SO','avg_deg','label']

# Suffix the similarity scores with "_WT" so they do not collide with the
# identically named columns of the unweighted dataset when both are merged.
_WT_RENAMES = {"CN": "CN_WT", "AA": "AA_WT", "HP": "HP_WT", "HD": "HD_WT", "JC": "JC_WT", "LHN": "LHN_WT",
               "RA": "RA_WT", "SA": "SA_WT", "SO": "SO_WT"}


def _load_wt_frame(csv_path):
    """Read one WT CSV and return it in the prepared column layout.

    Adds `avg_deg` (mean of `deg_e1` and `deg_e2`), keeps only the columns in
    `_WT_COLUMNS` (which leaves out the two raw degree columns) and renames
    the similarity scores with the `_WT` suffix.
    """
    with open(csv_path, newline='') as csvfile:
        frame = pd.read_csv(csvfile)
    frame['avg_deg'] = (frame['deg_e1'] + frame['deg_e2']) / 2
    return frame[_WT_COLUMNS].rename(columns=_WT_RENAMES)


# Train and test go through the same helper so their preprocessing cannot drift apart.
training_WT_df = _load_wt_frame("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2012/train/dataframe.csv")
testing_WT_df = _load_wt_frame("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2012/test/dataframe.csv")

# Features are everything between the (e1, e2) pair and the trailing label.
X_WT_train = training_WT_df.iloc[:, 2:-1]
Y_train = training_WT_df['label']

X_WT_test = testing_WT_df.iloc[:, 2:-1]
Y_test = testing_WT_df['label']

X_WT_test.head()

In [None]:
# Show the (rows, columns) shape of the weighted-temporal training features.
X_WT_train.shape

### Load Unweighted non-Temporal Dataset

In [None]:
# Column order of the prepared unweighted, non-temporal frame: node pair
# first, label last, so `iloc[:, 2:-1]` selects exactly the feature columns.
_NON_COLUMNS = ['e1','e2','CN','AA','HP','HD','JC','LHN','RA','SA','SO','label']


def _load_unweighted_frame(csv_path):
    """Read one unweighted CSV and keep only the columns in `_NON_COLUMNS`.

    The column selection already leaves out `deg_e1` / `deg_e2`, so no
    separate drop step is needed.
    """
    with open(csv_path, newline='') as csvfile:
        frame = pd.read_csv(csvfile)
    return frame[_NON_COLUMNS]


# Train and test go through the same helper so their preprocessing cannot drift apart.
training_non_df = _load_unweighted_frame("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2012/train/dataframe.csv")
testing_non_df = _load_unweighted_frame("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2012/test/dataframe.csv")

# Only the features are taken from this dataset; the labels come from the
# weighted-temporal frames.
X_non_train = training_non_df.iloc[:, 2:-1]
X_non_test = testing_non_df.iloc[:, 2:-1]

X_non_test.head()

In [None]:
# Shape check of the raw (unprocessed) unweighted test CSV.
# Read into its own name so the column-selected `testing_non_df` is not
# replaced by the raw frame.
with open("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2012/test/dataframe.csv", newline='') as csvfile:
    raw_testing_non_df = pd.read_csv(csvfile)
raw_testing_non_df.shape

In [None]:
# Shape check of the raw (unprocessed) weighted-temporal test CSV.
# Read into its own name: `testing_WT_df` is used again when the predicted
# graph is assembled and must keep its prepared column layout.
with open("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2012/test/dataframe.csv", newline='') as csvfile:
    raw_testing_WT_df = pd.read_csv(csvfile)
raw_testing_WT_df.shape

In [None]:
# Shape check of the raw unweighted, non-temporal training CSV.
# This file used to be bound to `training_WT_df`, which both mislabelled the
# data and overwrote the prepared weighted-temporal frame; it now has its own name.
with open("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2012/train/dataframe.csv", newline='') as csvfile:
    raw_training_non_df = pd.read_csv(csvfile)
raw_training_non_df.shape

In [None]:
# Shape check of the raw weighted-temporal training CSV.
# This file used to be bound to `training_non_df`, which both mislabelled the
# data and overwrote the prepared unweighted frame; it now has its own name.
with open("/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2012/train/dataframe.csv", newline='') as csvfile:
    raw_training_WT_df = pd.read_csv(csvfile)
raw_training_WT_df.shape

In [None]:
# Build graph `G` for the 2012 unweighted, non-temporal test split from its
# node-link JSON file.
# NOTE(review): this import sits mid-notebook; `json_graph` is also needed by
# the later predicted-graph cell, so it belongs in the top import cell.
from networkx.readwrite import json_graph
G_addr = "/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2012/test/Graph.json"
# `utilities.read_json` is a project helper; presumably returns the parsed JSON document.
g = utilities.read_json(G_addr)
G = json_graph.node_link_graph(g)

In [None]:
# Build graph `H` for the 2012 weighted-temporal test split from its
# node-link JSON file (requires `json_graph` to be imported already).
H_addr = "/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2012/test/Graph.json"
# `utilities.read_json` is a project helper; presumably returns the parsed JSON document.
h = utilities.read_json(H_addr)
H = json_graph.node_link_graph(h)

In [None]:
# Node count of the unweighted, non-temporal test graph.
G.number_of_nodes()

In [None]:
# Node count of the weighted-temporal test graph.
H.number_of_nodes()

In [None]:
# Edge count of the unweighted, non-temporal test graph.
G.number_of_edges()

In [None]:
# Edge count of the weighted-temporal test graph.
H.number_of_edges()

In [None]:
# Load the disconnect list of the unweighted, non-temporal dataset and show
# how many entries it has.
# NOTE(review): the path points at the 2013 folder although every other path
# in this notebook uses 2012 — confirm this is intended.
disconnection_addr = "/home/c6/Desktop/OpenWPM/jsons/Prediction_new/non_temporal_unweighted/2013/test/disconnect.json"
disconnection = utilities.read_json(disconnection_addr)
len(disconnection)

In [None]:
# Load the disconnect list of the weighted-temporal dataset and show how many
# entries it has. Reuses (and overwrites) `disconnection_addr` / `disconnection`.
# NOTE(review): the path points at the 2013 folder although every other path
# in this notebook uses 2012 — confirm this is intended.
disconnection_addr = "/home/c6/Desktop/OpenWPM/jsons/Prediction_new/WT/2013/test/disconnect.json"
disconnection = utilities.read_json(disconnection_addr)
len(disconnection)

### Merge Datasets

In [None]:
# Put the weighted-temporal and the unweighted feature sets side by side.
training_frames = [X_WT_train, X_non_train]
testing_frames = [X_WT_test, X_non_test]

# pd.concat(axis=1) aligns rows on the index and silently pads with NaN when
# the indexes differ. Fail fast instead of training on misaligned rows.
for _split, (_wt_part, _non_part) in (("train", training_frames), ("test", testing_frames)):
    if not _wt_part.index.equals(_non_part.index):
        raise ValueError(
            "%s split: weighted-temporal (%d rows) and unweighted (%d rows) "
            "feature frames do not share the same index"
            % (_split, len(_wt_part), len(_non_part)))

X_train = pd.concat(training_frames, axis=1)
X_test = pd.concat(testing_frames, axis=1)

In [None]:
# Shape of the full training frame (merged features plus label).
# `training_df` itself is only created in the upsampling cell further down,
# so referencing it here fails on a fresh kernel; build the same frame on the fly.
pd.concat([X_train, Y_train], axis=1).shape

In [None]:
# Show the (rows, columns) shape of the merged training feature matrix.
X_train.shape

In [None]:
# Class balance of the training labels. Anything that is not exactly 1 is
# counted as negative, so the two counts always add up to len(Y_train).
positive_samples = int((Y_train == 1).sum())
negative_sample = len(Y_train) - positive_samples

print("Positive Samples: ", positive_samples)
print("Negative_samples: ", negative_sample)

### upsample minority class

In [None]:
# Re-attach the label so features and target are resampled together.
training_df_cat = [X_train, Y_train]
training_df = pd.concat(training_df_cat, axis=1)

# Separate majority (label 0) and minority (label 1) classes
df_majority = training_df[training_df.label == 0]
df_minority = training_df[training_df.label == 1]

# Upsample the minority class to the size of the majority class. The target
# size is taken from the data rather than hard-coded, so the cell stays
# correct if the input changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # match majority class
                                 random_state=123)             # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# NOTE: X_train / Y_train are overwritten here, so re-running this cell
# resamples the already-upsampled data; re-run the merge cell first.
X_train = df_upsampled.iloc[:, 0:-1]
Y_train = df_upsampled['label']

# Display new class counts (last expression, so the notebook renders it).
df_upsampled.label.value_counts()

* define evaluation function

In [None]:
def evaluate_model(predictions, actual):
    """Summarise binary-classification quality as a two-column DataFrame.

    Parameters
    ----------
    predictions : array-like of 0/1
        Predicted labels.
    actual : array-like of 0/1
        Ground-truth labels, same length as `predictions`. May be a pandas
        Series with any index; values are compared by position.

    Returns
    -------
    pandas.DataFrame
        Columns "Measure" and "Source" with one row each for Accuracy,
        Precision, Recall, FPR, FNR and F1. FPR / FNR are NaN when the
        corresponding denominator is zero (no negative / no positive cases).
    """
    # Work on plain arrays: indexing a Series with `actual[i]` is label-based
    # and gives wrong rows (or a KeyError) for a non-default index.
    y_pred = np.asarray(predictions).ravel()
    y_true = np.asarray(actual).ravel()

    TP = int(np.sum((y_pred == 1) & (y_true == 1)))
    FP = int(np.sum((y_pred == 1) & (y_true != 1)))
    TN = int(np.sum((y_pred == 0) & (y_true == 0)))
    FN = int(np.sum((y_pred == 0) & (y_true != 0)))

    # The rates are undefined when a class is absent; report NaN instead of
    # raising ZeroDivisionError.
    fpr = FP / (FP + TN) if (FP + TN) else float("nan")
    fnr = FN / (FN + TP) if (FN + TP) else float("nan")

    return pd.DataFrame({
        "Measure": ["Accuracy", "Precision", "Recall", "FPR", "FNR", "F1"],
        "Source": [accuracy_score(y_true, y_pred),
                   precision_score(y_true, y_pred),
                   recall_score(y_true, y_pred),
                   fpr,
                   fnr,
                   f1_score(y_true, y_pred)]
    })

* Find important features

In [None]:
def feature_importance(columns, classifier):
    """Plot the classifier's feature importances as a descending bar chart.

    Parameters
    ----------
    columns : sequence of str
        Feature names, in the same order as the columns the classifier was
        fitted on.
    classifier : fitted estimator
        Must expose `feature_importances_`.
    """
    display("Feature Importance")
    df = pd.DataFrame({"Features": columns,
                      "Importance": classifier.feature_importances_})
    df = df.sort_values("Importance", ascending=False)
    df.plot(kind='bar', x='Features', y='Importance', legend=None)
    plt.tight_layout()
    # Must be called; a bare `plt.show` is only an attribute access and does nothing.
    plt.show()

### Choose an ML algorithm

### 1. Random Forest

In [None]:
# Random forest: 100 trees, depth capped at 20, entropy split criterion and
# positives weighted 4x. random_state is fixed (same seed as the upsampling
# step) so training is reproducible across kernel restarts.
classifier = RandomForestClassifier(n_estimators=100, max_depth=20, criterion="entropy",
                                    class_weight={0: 1, 1: 4}, random_state=123)

### Train and predict

In [None]:
# Fit the classifier on the current training features and labels.
classifier.fit(X_train, Y_train)

In [None]:
# Predict a label for every row of the merged test features.
prediction = classifier.predict(X_test)

In [None]:
# Sanity check: shape of the prediction array.
prediction.shape

In [None]:
# Sanity check: container type returned by `classifier.predict`.
type(prediction)

In [None]:
# Sanity check: container type of the ground-truth test labels.
type(Y_test)

In [None]:
# Distinct predicted labels; a single value here means the model predicts only one class.
np.unique(prediction)

In [None]:
# Render the metric table for the test-set predictions against the true labels.
display(evaluate_model(prediction, Y_test))

In [None]:
# Take the feature names from the frame the model was fitted on.
# `feature_importances_` follows the column order of X_train (weighted-temporal
# block first, then the unweighted block); a hand-written list in a different
# order attaches the importances to the wrong feature names.
columns = list(X_train.columns)
feature_importance(columns, classifier)

### Save the trained model

In [None]:
# Persist the fitted classifier to a file in the current working directory.
joblib.dump(classifier, "./random_forest.joblib")

### build the predicted Graph

In [None]:
# Reference graph for 2012, rebuilt from its node-link JSON file.
real_graph_2012 = json_graph.node_link_graph(
            utilities.read_json("/home/c6/Desktop/OpenWPM/jsons/AST/CDX_results/Graphs/2012/Graph.json"))

# One row per candidate edge: node pair, the features used, and the predicted label.
# - The node pair is selected by column name, so the result does not depend on
#   the column order of whichever `testing_WT_df` is currently in the kernel.
# - The prediction Series reuses X_test's index so concat(axis=1) lines the
#   rows up exactly; it is left unnamed, so its column label stays 0.
concat_pred = [testing_WT_df[['e1', 'e2']], X_test, pd.Series(prediction, index=X_test.index)]
predicted_graph_2012 = pd.concat(concat_pred, axis=1)


In [None]:
# Display the assembled prediction table (node pair, features, predicted label).
predicted_graph_2012