# Load file

In [None]:
import numpy as np
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn import metrics


In [2]:
df_path = "C:/Users/Enduser/OneDrive - Asia Pacific University/uni/Y3S2/fyp/Model_trial/btc_trial_dataset2.csv"
dataset_df = pd.read_csv(df_path)

# Feature Selection: Dropping Irrelevant or Redundant Columns


In [3]:
drop_cols = ['tx_hash', 'is_malicious', 'all_malicious', 'mean_in_btc', 'mean_out_btc','in_malicious']
dataset_df.drop(columns=drop_cols, inplace=True)

In [4]:
dataset_df.head()

Unnamed: 0,indegree,outdegree,in_btc,out_btc,total_btc,out_malicious,out_and_tx_malicious
0,4,2,0.478187,0.476987,0.955174,0,0
1,3,2,2.019,2.0185,4.0375,0,0
2,1,1,0.1801,0.1801,0.3602,0,0
3,1,2,5.8798,5.8793,11.7591,0,0
4,4,2,0.495906,0.495406,0.991312,0,0


# Data Splitting

In [5]:
from sklearn.model_selection import train_test_split

# Load dataset
X = dataset_df.drop('out_and_tx_malicious', axis=1)
y = dataset_df['out_and_tx_malicious']

# Split dataset into Train (80%) and Test (20%) ensuring stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Further split Train into Train (90%) and Validation (10%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.10, stratify=y_train, random_state=42
)

print("Data Split Complete:")
print(f"Train Size: {X_train.shape[0]}")
print(f"Validation Size: {X_val.shape[0]}")
print(f"Test Size: {X_test.shape[0]}")


Data Split Complete:
Train Size: 72000
Validation Size: 8000
Test Size: 20000


# Feature Scaling: Logarithmic Transformation 

In [6]:

# Define features for log transformation
log_features = ['indegree', 'outdegree', 'in_btc', 'out_btc', 'total_btc']

# Apply log transformation
for df in [X_train, X_val, X_test]:  
    df[log_features] = np.log1p(df[log_features])  # log1p avoids log(0)

print("Log Transformation Complete")
# View before and after log transformation for a sample feature
# feature_to_view = 'in_btc'

# print("\nBefore Log Transformation:")
# print(X_train[feature_to_view].head())

# print("\nAfter Log Transformation:")
# print(np.log1p(X_train[feature_to_view].head()))

Log Transformation Complete


# Feature engineer

In [7]:
# Feature Engineering Function
def add_features(df):
    df['out_malicious_to_total_btc'] = df['out_malicious'] / (df['total_btc'] + 1e-6)
    df['log_total_btc'] = np.log1p(df['total_btc'])
    df['out_malicious_in_btc_interaction'] = df['out_malicious'] * df['in_btc']
    df['net_btc_flow'] = df['in_btc'] - df['out_btc']
    return df

# Apply feature engineering to Train, Validation, and Test sets
X_train_fe = add_features(X_train)
X_val_fe = add_features(X_val)
X_test_fe = add_features(X_test)

# Select the final set of features
selected_features = [
    'in_btc', 'out_btc', 'total_btc', 'out_malicious', 'indegree','outdegree',
    'out_malicious_to_total_btc', 'log_total_btc',
    'out_malicious_in_btc_interaction', 'net_btc_flow'
]

X_train_final = X_train_fe[selected_features]
X_val_final = X_val_fe[selected_features]
X_test_final = X_test_fe[selected_features]

print("Feature Engineering Complete")


Feature Engineering Complete


#  Data Balancing - Smote

In [8]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Apply SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy=0.01, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_final, y_train)

print("Class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())


Class distribution after SMOTE:
out_and_tx_malicious
0    71923
1      719
Name: count, dtype: int64


# Feature Scaling: Robust Scaling

In [9]:
from sklearn.preprocessing import RobustScaler

# Initialize RobustScaler
scaler = RobustScaler()

# Fit and transform Train set
X_train_scaled = scaler.fit_transform(X_train_smote)

# Transform Validation and Test sets using the same scaler
X_val_scaled = scaler.transform(X_val_final)
X_test_scaled = scaler.transform(X_test_final)

# Convert back to DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_final.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val_final.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_final.columns)

print("Scaling Complete")


Scaling Complete


-------------------------------------------

# LightGBM Model 1 

In [11]:
# Compute scale_pos_weight
scale_pos_weight = np.sum(y_train_smote == 0) / np.sum(y_train_smote == 1)
print(f"Computed scale_pos_weight: {scale_pos_weight:.2f}")

Computed scale_pos_weight: 100.03


In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Initialize LGBMClassifier
lgbm = LGBMClassifier(
    n_estimators=1000,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# Fit the model with early stopping using callbacks
lgbm.fit(
    X_train_scaled_df,
    y_train_smote,
    eval_set=[(X_val_scaled_df, y_val)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(1)]
)

[LightGBM] [Info] Number of positive: 719, number of negative: 71923
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1915
[LightGBM] [Info] Number of data points in the train set: 72642, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009898 -> initscore=-4.605490
[LightGBM] [Info] Start training from score -4.605490
[1]	valid_0's auc: 0.944549
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.978191
[3]	valid_0's auc: 0.89874
[4]	valid_0's auc: 0.899088
[5]	valid_0's auc: 0.893269
[6]	valid_0's auc: 0.893922
[7]	valid_0's auc: 0.894256
[8]	valid_0's auc: 0.894214
[9]	valid_0's auc: 0.894117
[10]	valid_0's auc: 0.894326
[11]	valid_0's auc: 0.893978
[12]	valid_0's auc: 0.894423
Early stopping, best iteration is:
[2]	valid_

 - low precision

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

# Compute predicted probabilities
y_pred_proba = lgbm.predict_proba(X_test_scaled)[:, 1]

# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Find threshold maximizing F1-score (or adjust for higher recall)
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold (max F1): {optimal_threshold:.4f}")

# Apply optimal threshold
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
print("Classification Report (Optimal Threshold):")
# Print the classification report
print(classification_report(y_test, y_pred_optimal))

# Print the confusion matrix
cm_optimal = confusion_matrix(y_test, y_pred_optimal)
print("Confusion Matrix (Optimal Threshold):")
print(cm_optimal)

Optimal threshold (max F1): 0.9960
Classification Report (Optimal Threshold):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19978
           1       0.58      0.68      0.62        22

    accuracy                           1.00     20000
   macro avg       0.79      0.84      0.81     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix (Optimal Threshold):
[[19967    11]
 [    7    15]]


# Light GBM Model 2 - Grid Search

In [21]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100],
    'max_depth': [3, 5, 7],
    'min_data_in_leaf': [20, 50, 100]
}

In [37]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Initialize the LightGBM model
lgbm = LGBMClassifier(n_estimators=1000, objective='binary', random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='roc_auc',  # Metric to optimize 
    cv=3,              # 3-fold cross-validation
    n_jobs=-1          # Use all CPU cores
)

# Fit to training data
grid_search.fit(X_train_scaled, y_train_smote)

[LightGBM] [Info] Number of positive: 719, number of negative: 71923
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1915
[LightGBM] [Info] Number of data points in the train set: 72642, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009898 -> initscore=-4.605490
[LightGBM] [Info] Start training from score -4.605490


In [38]:
# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Best model
best_lgbm = grid_search.best_estimator_

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 7, 'min_data_in_leaf': 20, 'num_leaves': 31}


In [39]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the LightGBM model with the best hyperparameters
lgbm_model = LGBMClassifier(
    learning_rate=0.05,
    max_depth=7,
    min_data_in_leaf=20,
    num_leaves=31,
    n_estimators=1000,  # Number of boosting rounds
    random_state=42
)

# Train the model on the training data
lgbm_model.fit(X_train_scaled, y_train_smote)

# Predict on the test set
y_pred = lgbm_model.predict(X_test_scaled)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[LightGBM] [Info] Number of positive: 719, number of negative: 71923
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1915
[LightGBM] [Info] Number of data points in the train set: 72642, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009898 -> initscore=-4.605490
[LightGBM] [Info] Start training from score -4.605490
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19978
           1       0.72      0.59      0.65        22

    accuracy                           1.00     20000
   macro avg       0.86      0.80      0.82     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix:
[[19973     5]
 [    9    13]]


# save best model

In [None]:
import joblib

joblib.dump(lgbm_model, 'lgbm_model.pkl')  
joblib.dump(scaler, 'scaler_lgbm.pkl')      

['scaler_lgbm.pkl']