<a href="https://colab.research.google.com/github/shazzad-hasan/ethereum-fraud-detection/blob/main/models/neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc, f1_score, recall_score, accuracy_score, precision_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.base import TransformerMixin, BaseEstimator
from typing import Union, Optional
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/content/drive/MyDrive/ethereum-fraud-detection/data/combined_kaggle_etherscan_ts_data.csv').drop(columns=['Unnamed: 0'])

feature_list = list(set(df.columns) - set(['Address', 'FLAG']))
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['FLAG'])

In [4]:
print("Number of training samples: ", df_train.shape[0])
print("Number of Test samples: ", df_test.shape[0])

Number of training samples:  11136
Number of Test samples:  2784


In [5]:
%run "/content/drive/MyDrive/ethereum-fraud-detection/models/helper_data_preprocessing.py"

In [6]:
dp = DataPrep(
    features=feature_list,
    fill_strategy='median',
    upper_clip_quantile=0.95,
    lower_clip_quantile=0.05
)

In [7]:
df_train_prep = dp.fit_transform(df_train)
df_test_prep = dp.transform(df_test)

In [8]:
scaler = StandardScaler()
df_train_prep = scaler.fit_transform(df_train_prep[feature_list])
df_test_prep = scaler.transform(df_test_prep[feature_list])

In [9]:
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras import layers, models, regularizers
from sklearn.metrics import f1_score, make_scorer # Import make_scorer


class KerasBaseModel(BaseEstimator, ClassifierMixin):
    def __init__(self, optimizer='adam', dropout_rate=0.0, l2_reg=0.0, epochs=50, batch_size=32):
        self.optimizer = optimizer
        self.dropout_rate = dropout_rate
        self.l2_reg = l2_reg
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None

    def build_model(self): # Removed the extra indent here
        model = models.Sequential()
        model.add(layers.Dense(64, activation='relu', input_shape=(df_train_prep.shape[1],),
                               kernel_regularizer=regularizers.l2(self.l2_reg)))
        model.add(layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(self.l2_reg)))
        model.add(layers.Dropout(self.dropout_rate))
        model.add(layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(self.l2_reg)))
        model.add(layers.Dropout(self.dropout_rate))
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer=self.optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, X, y): # Corrected the typo here: 'deffit' -> 'fit'
        self.model = self.build_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X): # Corrected the typo here: 'defpredict' -> 'predict'
        return (self.model.predict(X) > 0.5).astype("int32")

    def score(self, X, y):
        y_pred = self.predict(X) # Calculate predictions explicitly
        return f1_score(y, y_pred)  # Use f1_score directly for scoring

In [10]:
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'dropout_rate': [0.3, 0.5],
    'l2_reg': [0.001, 0.01],
    'epochs': [50, 100],
    'batch_size': [32, 64]
}

model = KerasBaseModel()
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring="f1", verbose=2,)
grid_result = grid.fit(df_train_prep, df_train['FLAG'])

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=adam; total time=  52.8s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=adam; total time=  34.4s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=adam; total time=  36.6s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=adam; total time=  34.4s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=adam; total time=  38.0s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=rmsprop; total time=  32.9s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=rmsprop; total time=  34.2s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=rmsprop; total time=  34.3s
[CV] END batch_size=32, dropout_rate=0.3, epochs=50, l2_reg=0.001, optimizer=rmsprop; total time=  32.8s
[CV] END

In [13]:
print(f"Best parameters: {grid_result.best_params_}")
print(f"Best cross-validation accuracy: {grid_result.best_score_}")

model = grid_result.best_estimator_.model

Best parameters: {'batch_size': 32, 'dropout_rate': 0.3, 'epochs': 50, 'l2_reg': 0.001, 'optimizer': 'adam'}
Best cross-validation accuracy: nan


In [14]:
best_model = grid_result.best_estimator_.model

test_loss, test_acc = best_model.evaluate(df_test_prep, df_test['FLAG'])
print(f'Test accuracy of the best model: {test_acc}')

[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9078 - loss: 0.2737
Test accuracy of the best model: 0.9116379022598267
