# Machine Learning (One vs All)

## Modules and functions

Import all the modules:

In [1]:
import pandas as pd

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use("ggplot")

from notebooks import *

## Loading

In [2]:
# Load the honeypot label dictionary; the classification cell reads its
# "index_to_name" list to iterate over the label names.
# NOTE(review): load_dictionary comes from the star-import of `notebooks`;
# presumably it unpickles the file, so only point it at trusted files.
honey_badger_labels = load_dictionary("honey_badger_labels.pickle")

In [3]:
# Read the filtered dataset into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
df_file_path = "dataset-filtered.csv"
df = pd.read_csv(df_file_path, low_memory=False)

In [4]:
# Sanity check: report the number of rows and columns that were loaded
# (print_dimensions is a helper from the star-import of `notebooks`).
print_dimensions(df)

The dataset has 158863 rows and 434 columns


## Prepare data

In [5]:
# Unpack the experiment inputs from the DataFrame. As the recorded output
# shows, the helper prints the value range of every scaled column and the
# shapes of the addresses, the feature matrix (X) and both label arrays.
# NOTE(review): `scaler` is presumably the fitted scaler object and is not
# used again in the visible cells — confirm in `notebooks`.
addresses, X, y_binary, y_multi, scaler, feature_names = extract_experiment_data(df)

Scaled columns:
contract_compiler_runs: [0, 330000000]
contract_num_source_code_lines: [1, 11409]
internal_transaction_count: [0, 1536860]
normal_transaction_block_span: [0, 6415588]
normal_transaction_count: [1, 10412943]
normal_transaction_block_delta_mean: [0, 3693174]
normal_transaction_block_delta_std: [0, 2248615]
normal_transaction_count_per_block_mean: [1, 49]
normal_transaction_count_per_block_std: [0, 74]
normal_transaction_gas_mean: [23112, 8003887]
normal_transaction_gas_std: [0, 3985463]
normal_transaction_gas_used_mean: [21421, 7906343]
normal_transaction_gas_used_std: [0, 3717662]
normal_transaction_time_delta_mean: [0, 57482326]
normal_transaction_time_delta_std: [0, 34437373]
normal_transaction_value_mean: [0, 204365823294587692646400]
normal_transaction_value_std: [0, 1285685747689323210211328]

Extracted values:
addresses (158863,)
features (158863, 430)
labels_binary (158863,)
labels_multi (158863,)


In [6]:
# List every feature name, one per line, in the order given by feature_names.
# NOTE(review): presumably one name per feature column (430 per the shapes
# printed above), so this cell produces a very long output.
for feature_name in feature_names:
    print(feature_name)

contract_compiler_runs
contract_num_source_code_lines
internal_transaction_count
normal_transaction_block_span
normal_transaction_count
normal_transaction_block_delta_mean
normal_transaction_block_delta_std
normal_transaction_count_per_block_mean
normal_transaction_count_per_block_std
normal_transaction_gas_mean
normal_transaction_gas_std
normal_transaction_gas_used_mean
normal_transaction_gas_used_std
normal_transaction_time_delta_mean
normal_transaction_time_delta_std
normal_transaction_value_mean
normal_transaction_value_std
contract_has_library
has_internal_transactions
normal_transaction_before_creation_ratio
normal_transaction_block_ratio
normal_transaction_from_other_ratio
normal_transaction_other_sender_ratio
contract_compiler_minor_version_0
contract_compiler_minor_version_1
contract_compiler_minor_version_2
contract_compiler_minor_version_3
contract_compiler_patch_0
contract_compiler_patch_1
contract_compiler_patch_2
contract_compiler_patch_3
contract_compiler_patch_4
contrac

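## Classification

Each honeypot type is held out in turn: the model is trained on all contracts that do not carry that label and is then tested only on the contracts that do.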

In [7]:
# Positive-class weight that create_xgb_model passes to XGBClassifier as
# scale_pos_weight.
# NOTE(review): presumably the negative/positive ratio of y_binary, to offset
# the class imbalance — confirm in `notebooks`.
xgb_scale_pos_weight = compute_scale_pos_weight(y_binary)

In [8]:
def create_xgb_model(n_estimators=25, max_depth=3, n_jobs=10):
    """Build a fresh, unfitted XGBoost classifier for one experiment run.

    The positive-class weight is read from the notebook-level
    ``xgb_scale_pos_weight`` when the function is called, so the cell that
    computes it must have been executed first.

    Args:
        n_estimators: Number of boosting rounds (default 25, as before).
        max_depth: Maximum depth of each tree (default 3, as before).
        n_jobs: Number of parallel threads XGBoost may use (default 10, as
            before).

    Returns:
        A new ``XGBClassifier`` configured with the arguments above.
    """
    return XGBClassifier(n_jobs=n_jobs,
                         scale_pos_weight=xgb_scale_pos_weight,
                         n_estimators=n_estimators,
                         max_depth=max_depth)

In [9]:
# Hold-one-label-out evaluation: for each honeypot label, train on every
# contract that does NOT carry that label and test only on the ones that do.
# NOTE(review): num_labels also counts index 0, which the loop skips, so the
# progress counter ends at "8/9" in the recorded output; kept as-is so the
# printed format does not change.
num_labels = len(honey_badger_labels["index_to_name"])

for label_id, label_value in enumerate(honey_badger_labels["index_to_name"][1:], start=1):
    print("Label {:d}/{:d} = {}".format(label_id, num_labels, label_value))

    train_index = y_multi != label_id
    test_index = y_multi == label_id

    # Without any held-out sample there is nothing to test and the recall
    # below would be 0/0, so skip the label before paying for training.
    if not test_index.any():
        print("test  skipped: no samples with this label")
        print()
        continue

    xgb_model = create_xgb_model()
    xgb_model.fit(X[train_index], y_binary[train_index])

    train_metrics = compute_metrics(y_binary[train_index], xgb_model.predict(X[train_index]))
    print_metrics("train", train_metrics)

    # Every held-out sample is treated as a positive (assumes all labels >= 1
    # map to y_binary == 1 — TODO confirm), so predicted 1s are true positives
    # and predicted 0s are false negatives.
    test_pred = xgb_model.predict(X[test_index])
    test_size = test_pred.shape[0]
    test_tp = test_pred.sum()
    test_fn = test_size - test_tp
    test_recall = test_tp / test_size
    print("test  Recall {:.03f} FN {:d} TP {:d}".format(test_recall, test_fn, test_tp))
    print()

Label 1/9 = Balance Disorder
train ROC AUC 0.981 TN  156456 FP  2112 FN     7 TP   268
test  Recall 0.850 FN 3 TP 17

Label 2/9 = Hidden State Update
train ROC AUC 0.984 TN  157575 FP   993 FN     4 TP   156
test  Recall 0.911 FN 12 TP 123

Label 3/9 = Hidden Transfer
train ROC AUC 0.982 TN  156654 FP  1914 FN     7 TP   275
test  Recall 0.923 FN 1 TP 12

Label 4/9 = Inheritance Disorder
train ROC AUC 0.985 TN  156804 FP  1764 FN     5 TP   247
test  Recall 0.884 FN 5 TP 38

Label 5/9 = Skip Empty String Literal
train ROC AUC 0.987 TN  156206 FP  2362 FN     3 TP   282
test  Recall 0.900 FN 1 TP 9

Label 6/9 = Straw Man Contract
train ROC AUC 0.984 TN  156479 FP  2089 FN     5 TP   259
test  Recall 0.903 FN 3 TP 28

Label 7/9 = Type Deduction Overflow
train ROC AUC 0.984 TN  156607 FP  1961 FN     6 TP   285
test  Recall 1.000 FN 0 TP 4

Label 8/9 = Uninitialised Struct
train ROC AUC 0.988 TN  156687 FP  1881 FN     3 TP   253
test  Recall 0.949 FN 2 TP 37

