In [1]:
#load libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from tensorflow import keras
from keras.models import load_model
import seaborn as sns

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)

In [2]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

In [3]:
data = pd.read_csv(r'BitcoinHeistData.csv')

In [4]:
data.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [5]:
data.shape

(2916697, 10)

In [6]:
data.dtypes.value_counts()

int64      6
object     2
float64    2
dtype: int64

In [7]:
data['year'].value_counts()

2016    380631
2014    375319
2013    372494
2015    368701
2017    368486
2012    365714
2011    355349
2018    330003
Name: year, dtype: int64

In [8]:
data['year'].nunique()

8

In [9]:
(data['label'].value_counts()/data.shape[0])*100

white                          98.580140
paduaCryptoWall                 0.424796
montrealCryptoLocker            0.319368
princetonCerber                 0.316214
princetonLocky                  0.227140
montrealCryptXXX                0.082936
montrealNoobCrypt               0.016560
montrealDMALockerv3             0.012137
montrealDMALocker               0.008606
montrealSamSam                  0.002126
montrealCryptoTorLocker2015     0.001886
montrealGlobeImposter           0.001886
montrealGlobev3                 0.001166
montrealGlobe                   0.001097
montrealWannaCry                0.000960
montrealRazy                    0.000446
montrealAPT                     0.000377
paduaKeRanger                   0.000343
montrealFlyper                  0.000309
montrealXTPLocker               0.000274
montrealVenusLocker             0.000240
montrealXLockerv5.0             0.000240
montrealCryptConsole            0.000240
montrealEDA2                    0.000206
montrealJigSaw  

In [10]:
data.select_dtypes(include=['int64']).nunique()

year             8
day            365
length          73
count        11572
looped       10168
neighbors      814
dtype: int64

In [11]:
data['weight'].nunique()

784927

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


In [13]:
data.isnull().values.any()

False

In [14]:
x = data.iloc[:,:-1]
y = data['label']

x.shape, y.shape

((2916697, 9), (2916697,))

In [15]:
#Encoding the target variable to 0 and 1
unique_label = list(set(y))
d = {}

#If the label is white, it means there is no attack so the label can be assigned to zero.
for i in unique_label:
    if i != 'white':
        d[i] = 1
    else:
        d[i] = 0

#Replacing the target column with newly encoded label column
y.replace(d, inplace=True)

In [16]:
y.tail(20)

2916677    0
2916678    0
2916679    0
2916680    0
2916681    0
2916682    0
2916683    0
2916684    0
2916685    0
2916686    0
2916687    0
2916688    0
2916689    0
2916690    0
2916691    0
2916692    0
2916693    0
2916694    0
2916695    0
2916696    0
Name: label, dtype: int64

In [17]:
y.value_counts()

0    2875284
1      41413
Name: label, dtype: int64

In [18]:
x.drop(columns = ["address", "year", "day", "count"], inplace=True)
x.drop(columns = ['neighbors'],inplace=True)
x.drop(columns = ['looped'],inplace=True)

In [19]:
x = NormalizeData(x)

In [20]:
x

Unnamed: 0,length,weight,income
0,0.125000,4.287248e-06,1.401999e-06
1,0.305556,1.256030e-07,1.400998e-06
2,0.000000,5.144698e-04,3.402425e-06
3,0.500000,2.009648e-06,8.245876e-07
4,1.000000,3.747830e-05,3.402425e-06
...,...,...,...
2916692,0.000000,5.716331e-05,2.453367e-05
2916693,0.000000,5.144698e-04,2.821409e-07
2916694,0.013889,6.173637e-03,4.739912e-05
2916695,0.000000,2.572349e-04,2.962965e-06


**_____________________________________________________________________________________________________________________________**

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [23]:
print(f"x_train: {X_train.shape}, x_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

x_train: (2041687, 3), x_test: (875010, 3), y_train: (2041687,), y_test: (875010,)


In [24]:
model = keras.Sequential(
    [
        keras.layers.Dense(
            256, activation="relu", input_shape=(X_train.shape[-1],)
        ),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

In [26]:
pip install keras.metrics

Collecting keras.metrics
  Downloading keras_metrics-1.1.0-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: keras.metrics
Successfully installed keras.metrics
Note: you may need to restart the kernel to use updated packages.


In [30]:
counts = np.bincount(y)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(y)
    )
)

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

# ====================================================
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
#     keras.metrics.precision(name="Precision")
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_10.h5")]
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    X_train,
    y_train,
    batch_size = 2048,
    epochs = 10,
    verbose = 1,
    callbacks = callbacks,
    validation_data = (X_test,y_test),
    class_weight = class_weight,
)

Number of positive samples in training data: 41413 (1.42% of total)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25a0053fc10>

In [33]:
model_path = "fraud_model_at_epoch_10.h5"

model.save(
    model_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
    save_traces=True,
)

In [34]:
model = load_model(model_path)

In [36]:
y_pred = model.predict(X_test)

In [38]:
model_evaluation = model.evaluate(X_test, y_test)



In [39]:
evaluation_metrics = ["Loss", "False Negatives", "False Positives", "True Negatives", "True Positives", "Precision", "Recall"]

for metrics, eval_value in zip(evaluation_metrics, model_evaluation):
    print(metrics, " : ", eval_value, "\n")

Loss  :  0.7096705436706543 

False Negatives  :  1647.0 

False Positives  :  716129.0 

True Negatives  :  146702.0 

True Positives  :  10532.0 

