# Baseline models on resampled data

In [None]:
import os
import pickle

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils.evaluation import evaluate_model
from utils.plots import plot_case
from utils.report import report_by_sections

In [None]:
train_data = pd.read_csv("data/train_data_b1.csv", parse_dates=["block_time"])

In [None]:
train_data.shape

In [None]:
# Candidate model inputs: every training column whose name ends with either
# "_log_to_median_ratio" or "_bal".
features = [
    column
    for column in train_data.columns
    if column.endswith(("_log_to_median_ratio", "_bal"))
]

In [None]:
len(features)


In [None]:
import os
os.getcwd()

In [None]:
# Case metadata table. Commented-out earlier variants of this cell read
# "Validated_V0_Desriptions.csv", used a "V3/" path prefix and kept extra
# columns ("Minimum Date" / "Minimum Time", "Attack Link").
cases = pd.read_csv("V4_2.csv").rename(
    columns={"File Names": "file_name", "Exploit Types": "exploit_type"}
)
# Turn the bare case name into "Balance_V4/<name>.csv".
cases["file_name"] = "Balance_V4/" + (cases["file_name"] + ".csv").astype(str)
# Keep only the three columns listed here.
cases = cases[["file_name", "exploit_type", "train/test splits"]]

In [None]:
cases["exploit_type"].value_counts()

In [None]:
# File names of all cases whose exploit type is exactly "Phishing".
is_phishing_case = cases["exploit_type"] == "Phishing"
phishing_files = cases.loc[is_phishing_case, "file_name"].tolist()

Mark labelled transactions that the model should not alert on: those with a total amount under $1 and those belonging to phishing cases.

In [None]:
def should_not_alert(df, min_amount_usd=1, excluded_files=None):
    """Flag labelled transactions that the model should not be asked to alert on.

    A row is flagged when its ``label`` is 1 and it is either below the
    amount threshold or belongs to one of the excluded case files.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``label``, ``amount_usd_tx_sum`` and
        ``file_name``.
    min_amount_usd : float, default 1
        Rows with ``amount_usd_tx_sum`` strictly below this value are flagged.
    excluded_files : iterable of str, optional
        Case file names whose labelled rows are flagged. When omitted, the
        module-level ``phishing_files`` list is used (the original behaviour).

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df``.
    """
    if excluded_files is None:
        # Fall back to the notebook-level list so existing calls are unchanged.
        excluded_files = phishing_files

    is_labelled = df["label"] == 1
    below_threshold = df["amount_usd_tx_sum"] < min_amount_usd
    in_excluded_file = df["file_name"].isin(excluded_files)
    return is_labelled & (below_threshold | in_excluded_file)

In [None]:
train_data.shape

In [None]:
train_data["no_alert"] = should_not_alert(train_data)

In [None]:
train_data["no_alert"].unique()

In [None]:
# Training rows that are not flagged `no_alert`, restricted to the identifier
# columns, the candidate features and the label; missing values become 0.
model_train_data = train_data.loc[
    ~train_data["no_alert"],
    ["file_name", "transaction_id", "block_time", *features, "label"],
].fillna(0)

In [None]:
model_train_data.shape

In [None]:
validation_data = pd.read_csv("data/validation_data_b1.csv", parse_dates=["block_time"])

In [None]:
validation_data['file_name'].unique().shape# , validation_data["no_alert"].unique()

In [None]:
#fake_data = pd.read_csv("data/fake_data.csv", parse_dates=["block_time"])

In [None]:
validation_data["no_alert"] = should_not_alert(validation_data)

In [None]:
validation_data["no_alert"].unique()

## 1 Create train sample with balanced tx count between cases

In [None]:
tx_count = train_data.groupby("file_name")["transaction_id"].nunique()

Exclude cases with fewer than 20 unique transactions

In [None]:
tx_count[tx_count < 20]

In [None]:
# Case files with fewer than 20 unique transactions.
has_few_transactions = tx_count < 20
files_to_exclude = tx_count[has_few_transactions].index.tolist()
files_to_exclude

In [None]:
files_to_exclude += phishing_files

In [None]:
file_names = train_data["file_name"].drop_duplicates()

In [None]:
# Drop columns whose values are an exact duplicate of an earlier column.
# The identifier and label columns are never dropped, even if a feature
# happens to carry identical values, because later cells index them by name.
protected_columns = ["file_name", "transaction_id", "block_time", "label"]
is_duplicate = model_train_data.T.duplicated(keep="first").to_numpy()
is_duplicate = is_duplicate & ~model_train_data.columns.isin(protected_columns)
model_train_data = model_train_data.loc[:, ~is_duplicate]

# Keep `features` in sync with the surviving columns; otherwise selecting
# `features` from frames derived from `model_train_data` raises KeyError.
features = [name for name in features if name in model_train_data.columns]


In [None]:
# Build a per-case balanced training sample: for every case file that is not
# excluded, draw (with replacement) a fixed number of label == 1 rows and a
# fixed number of label == 0 rows.
target_per_file = 50  # label == 1 rows drawn per case
ref_per_file = 300  # label == 0 rows drawn per case
target_fer_file = target_per_file  # alias for the original (misspelled) name

excluded_files = set(files_to_exclude)
samples = []
np.random.seed(1234)  # DataFrame.sample falls back to NumPy's global RNG
for case_file in file_names:
    if case_file in excluded_files:
        continue
    case_rows = model_train_data.loc[model_train_data["file_name"] == case_file, :]
    # Positives first, then negatives: same draw order as the original loop.
    for label_value, n_rows in ((1, target_per_file), (0, ref_per_file)):
        label_rows = case_rows.loc[case_rows["label"] == label_value, :]
        if label_rows.empty:
            # Sampling from an empty frame raises ValueError; this happens
            # e.g. when every row of that class was filtered out upstream.
            print(f"skipping label={label_value} for {case_file}: no rows to sample")
            continue
        samples.append(label_rows.sample(n_rows, replace=True))
train_sample = pd.concat(samples).reset_index(drop=True)

In [None]:
train_sample.shape

In [None]:
train_sample["label"].value_counts()

## 2 Simple feature selection

In [None]:
# Mutual information between each candidate feature and the label,
# computed on the resampled training set (missing values treated as 0).
mu_score = mutual_info_classif(
    train_sample[features].fillna(0),
    train_sample["label"],
    random_state=1234,
)

In [None]:
# One row per feature with its mutual-information score, best first.
mu_score_df = pd.DataFrame({"feature": features, "score": mu_score})
mu_score_df = mu_score_df.sort_values("score", ascending=False)

In [None]:
mu_score_df.to_csv("mu_score_resampled_no_phishing.csv", index=False)

In [None]:
train_sample.shape

In [None]:
# Hard-coded feature subset used to fit and score the model in the cells below.
# NOTE(review): presumably picked from the mutual-information ranking — confirm.
selected = [
    "snd_rcv_amt_usd_sum_tx_mean_log_to_median_ratio",
    "snd_rcv_mean_amt_usd_tx_median_log_to_median_ratio",
    "amount_usd_tx_sum_log_to_median_ratio",
    "snd_rcv_life_time_sec_tx_min_log_to_median_ratio",
    "snd_rcv_mean_time_diff_sec_tx_sum_log_to_median_ratio",
    "snd_rcv_tx_cnt_tx_sum_log_to_median_ratio",
    "snd_rcv_time_diff_sec_tx_mean_log_to_median_ratio",
    "snd_rcv_tkn_type_cnt_tx_mean_log_to_median_ratio",
    "amount_usd_bal"
    
    
]



In [None]:
'''    
    'snd_rcv_amt_usd_sum_tx_min_bal', 'snd_rcv_amt_usd_sum_tx_max_bal', 
    'snd_rcv_amt_usd_sum_tx_median_bal', 'snd_rcv_amt_usd_sum_tx_mean_bal', 
    'snd_rcv_amt_usd_sum_tx_std_bal', 'snd_rcv_amt_usd_sum_tx_sum_bal', 
    'snd_rcv_mean_amt_usd_tx_min_bal', 'snd_rcv_mean_amt_usd_tx_max_bal',
     'snd_rcv_mean_amt_usd_tx_median_bal', 'snd_rcv_mean_amt_usd_tx_mean_bal', 
     'snd_rcv_mean_amt_usd_tx_std_bal', 'snd_rcv_mean_amt_usd_tx_sum_bal', 
     'amount_usd_tx_min_bal', 'amount_usd_tx_max_bal', 
     'amount_usd_tx_median_bal', 'amount_usd_tx_mean_bal', 
     'amount_usd_tx_std_bal', 'amount_usd_tx_sum_bal'
'''

## 3 Model training and evaluation

In [None]:
# Fit the baseline classifier on the resampled training set, using only the
# selected features (missing values treated as 0).
y = train_sample["label"]
X = train_sample[selected].fillna(0)

model = GradientBoostingClassifier(n_estimators=50, max_depth=2, random_state=1234)
model.fit(X, y)

# Alternatives tried earlier (kept for reference, not executed):
# class_weight = {0: 1, 1: 3}
# model = LogisticRegression(class_weight=class_weight).fit(X, y)
# model = RandomForestClassifier(
#     max_depth=3, n_estimators=100, class_weight=class_weight
# ).fit(X, y)
# model = SVC(class_weight='balanced').fit(X, y)
# model = GaussianNB().fit(X, y)
# model = DecisionTreeClassifier(max_depth=4).fit(X, y)
# dtrain = xgb.DMatrix(X, y)
# param = {'max_depth': 3, 'eta': .2, 'objective': 'binary:logistic'}
# model = xgb.train(param, dtrain)

In [None]:
# True labels and hard predictions on the resampled training set.
train_true_label = train_sample["label"]
train_pred_label = model.predict(X)
# XGBoost variant (not executed):
# train_pred_prob = model.predict(dtrain)
# train_pred_label = (train_pred_prob > 0.5).astype(int)

In [None]:
# Evaluate on the resampled training set and print the headline metrics.
train_metrics = evaluate_model(
    train_true_label,
    train_pred_label,
    confusion_title="train",
)
print(
    ", ".join(
        f"{metric}: {train_metrics[metric]}"
        for metric in ("recall", "precision", "specificity")
    )
)

In [None]:
#validation_model_data['file_name']
validation_data["no_alert"].unique()

In [None]:
#validation_data = validation_data.loc[:,~validation_data.T.duplicated(keep='first')]


In [None]:
# Validation frame restricted to identifiers, the selected features, the label
# and the `no_alert` flag (missing values become 0), plus a "clean" view that
# leaves out the rows flagged `no_alert`.
validation_model_data = validation_data[
    ["file_name", "transaction_id", "block_time", *selected, "label", "no_alert"]
].fillna(0)
clean_val_model_data = validation_model_data[~validation_model_data["no_alert"]]

In [None]:
validation_model_data["no_alert"].value_counts()

In [None]:
validation_model_data.shape

In [None]:
clean_val_model_data.shape

In [None]:
# Decision threshold applied to the predicted probability of class 1.
thr = 0.55

# Class probabilities on the full and on the clean validation set.
val_pred_prob = model.predict_proba(validation_model_data[selected].fillna(0))
clean_val_pred_prob = model.predict_proba(clean_val_model_data[selected].fillna(0))

# Hard labels: 1 where the class-1 probability exceeds the threshold.
val_pred_label = (val_pred_prob[:, 1] > thr).astype(int)
clean_val_pred_label = (clean_val_pred_prob[:, 1] > thr).astype(int)

# Variants not executed:
# val_pred_label = model.predict(validation_model_data.loc[:, selected].fillna(0))
# clean_val_pred_label = model.predict(clean_val_model_data.loc[:, selected].fillna(0))
# dvalidation = xgb.DMatrix(validation_model_data.loc[:, selected], validation_model_data['label'])
# val_pred_prob = model.predict(dvalidation)
# val_pred_label = (val_pred_prob > 0.5).astype(int)

In [None]:
# val_pred_label = model.predict(validation_model_data.loc[:, selected])
val_true_label = validation_model_data["label"]
clean_val_true_label = clean_val_model_data["label"]

In [None]:
# Evaluate on the full validation set and print the headline metrics.
val_metrics = evaluate_model(
    val_true_label,
    val_pred_label,
    confusion_title="validation",
)
print(
    ", ".join(
        f"{metric}: {val_metrics[metric]}"
        for metric in ("recall", "precision", "specificity")
    )
)

In [None]:
# Evaluate on the clean validation set (rows flagged `no_alert` removed)
# and print the headline metrics.
clean_val_metrics = evaluate_model(
    clean_val_true_label,
    clean_val_pred_label,
    confusion_title="clean validation",
)
print(
    ", ".join(
        f"{metric}: {clean_val_metrics[metric]}"
        for metric in ("recall", "precision", "specificity")
    )
)

In [None]:
# Precision-recall plot for the clean validation set. `clean_val_pred_prob`
# is the full `predict_proba` output (one column per class), not the
# thresholded labels. The return value is assigned to `_` and not used.
_ = skplt.metrics.plot_precision_recall(
    clean_val_true_label,
    clean_val_pred_prob,
    title="Precision-recall on clean validation set",
)

## 4 Report

In [None]:
# For a logistic-regression model, list the fitted coefficient of each
# selected feature, sorted ascending. `isinstance` is used so that
# subclasses of LogisticRegression are reported as well.
if isinstance(model, LogisticRegression):
    model_coefs = (
        pd.DataFrame(zip(selected, model.coef_[0]), columns=["feature", "coef"])
        .sort_values("coef")
        .reset_index(drop=True)
    )
    print(model_coefs)

In [None]:
# For tree-ensemble models, tabulate the importance of each selected feature.
# `isinstance` is used so that subclasses of these estimators are covered too.
# NOTE: `feature_importance` is only defined inside this branch, yet later
# cells (`top_features`, the feature report table) use it unconditionally.
if isinstance(model, (RandomForestClassifier, GradientBoostingClassifier)):
    feature_importance = pd.DataFrame(
        zip(selected, model.feature_importances_), columns=["feature", "importance"]
    )
    print(feature_importance.sort_values("importance", ascending=False))

In [None]:
# Score every row of the full training data (not the resampled set) and
# threshold the class-1 probability. This rebinds `train_pred_label`, which
# previously held the predictions for the resampled training set.
train_class_prob = model.predict_proba(train_data[selected].fillna(0))
train_pred_prob = train_class_prob[:, 1]
train_pred_label = (train_pred_prob > thr).astype(int)

In [None]:
# Per-row results for the training part: case file, true label, the
# `sngl_mlt` column, the predicted label and a constant part tag.
train_results = train_data[["file_name", "label", "sngl_mlt"]].assign(
    pred_label=train_pred_label,
    exp_part="train",
)

In [None]:
# Per-row results for the validation part, built the same way as the
# training results. The explicit `.copy()` makes the frame independent of
# `validation_data` before new columns are assigned, which avoids
# SettingWithCopyWarning and matches the training cell.
validation_results = validation_data.loc[:, ["file_name", "label", "sngl_mlt"]].copy()
validation_results["pred_label"] = val_pred_label
validation_results["exp_part"] = "validation"

In [None]:
model_result = pd.concat([train_results, validation_results])

In [None]:
# Attach case metadata to the per-row predictions. The join key is stated
# explicitly: without `on`, pandas joins on every column name the two frames
# share, so a new shared column would silently change the result.
# (`model_result` is the concatenated train + validation rows.)
model_results = cases.merge(model_result, on="file_name")

In [None]:
model_results

In [None]:
#file_report, exploit_report = report_by_sections(model_results)
file_report, exploit_report, sng_mlt_report = report_by_sections(model_results)

In [None]:
file_report#exploit_report

In [None]:
exploit_report

In [None]:
sng_mlt_report

In [None]:
file_report.groupby("exp_part")["catch_any"].mean()
#exploit_report.groupby("exp_part")["recall"].mean()

In [None]:
# Mean `catch_any` per experiment part, leaving out phishing cases.
file_report[file_report["exploit_type"] != "Phishing"].groupby("exp_part")[
    "catch_any"
].mean()

In [None]:
train_data["pred_label"] = train_pred_label
validation_data["pred_label"] = val_pred_label

In [None]:
# Names of the three most important features, least important of the three first.
ranked_features = feature_importance.sort_values("importance")["feature"]
top_features = ranked_features.tail(3).tolist()

In [None]:
top_features

In [None]:
#validation_data[validation_data['file_name'] == "V3/0xhabitat001.csv"]
#validation_data['file_name'].unique()

In [None]:
# Plot the top features for one validation case.
# NOTE(review): `cases["file_name"]` is built with a "Balance_V4/" prefix in
# this notebook, while this call passes a "V3/..." name — confirm that this
# file name actually occurs in validation_data["file_name"].
# Previously: plot_case(validation_data, "V3/0xhabitat001.csv", top_features)
fig, axs = plot_case(validation_data, 'V3/Wintermute001.csv', top_features)

In [None]:
# Name used for this model's report directory and artifact file.
model_prefix = "resampled_no_phishing"
# Feature report: model importance joined with the mutual-information score.
# The join key is explicit so that a future shared column cannot silently
# become part of the key.
features_rep_table = feature_importance.merge(mu_score_df, on="feature")

In [None]:
# Make sure the report directory exists; `to_csv` does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs("reports/" + model_prefix, exist_ok=True)
features_rep_table.to_csv("reports/" + model_prefix + "/feature_table.csv", index=False)

In [None]:
# Write the per-file report into this model's report directory.
file_report.to_csv(f"reports/{model_prefix}/file_table.csv", index=False)

In [None]:
# Write the per-exploit-type report into this model's report directory.
exploit_report.to_csv(f"reports/{model_prefix}/exploit_table.csv", index=False)

In [None]:
# Write the single/multiple report into this model's report directory.
sng_mlt_report.to_csv(
    f"reports/{model_prefix}/single_multiple_table.csv",
    index=False,
)

In [None]:
# Serialize the fitted model to artifacts/<model_prefix>.pkl.
# `open(..., 'wb')` does not create missing directories, so create it first.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/' + model_prefix + '.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
from utils import MODEL_NAME
from utils.s3 import MODELS_BUCKET

# Upload the pickled model to the models bucket under
# <MODEL_NAME>/artifacts/<model_prefix>.pkl.
bucket = boto3.resource("s3").Bucket(MODELS_BUCKET)
bucket.upload_file(
    f"artifacts/{model_prefix}.pkl",
    f"{MODEL_NAME}/artifacts/{model_prefix}.pkl",
)
