# IndabaX Benin Republic 2024

#### The objective of this challenge is to classify network activity from various websites as either cryptojacking or not, based on features related to both network-based and host-based data.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (
    train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_recall_curve, auc,
    classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Read the train and test CSV files, discarding the 'Pages Input/sec' column
# from each as soon as it is loaded.
train_data = pd.read_csv('Train.csv').drop(columns=['Pages Input/sec'])
test_data = pd.read_csv('Test.csv').drop(columns=['Pages Input/sec'])

In [None]:
# Preview the first rows of the training set as loaded.
train_data.head()

Unnamed: 0,ID,I/O Data Operations,I/O Data Bytes,Number of subprocesses,Time on processor,Disk Reading/sec,Disc Writing/sec,Bytes Sent/sent,Received Bytes (HTTP),Network packets sent,Network packets received,Pages Read/sec,Page Errors/sec,Confirmed byte radius,Label
0,ID_4W8AP96UO6,114.7989,3790.450939,28.0,0.427078,6.162365,21.220403,58.492773,63.959387,0.621206,0.521813,24.624612,1001.53343,27.190843,1
1,ID_UD3TM0ZYND,14.15424,5182.451722,28.0,0.138876,5.399498,0.0,22.886762,29.508369,0.422183,0.444403,0.04444,1089.121009,30.150941,0
2,ID_XAG1HC0HWM,0.044444,120.821253,28.0,0.0,1.577765,0.0,11.733239,12.888786,0.177776,0.155554,0.044444,322.464081,28.163287,0
3,ID_CEXD05IR09,31.369916,127228.2511,31.0,0.52375,15.981107,5.356671,602.125436,6976.986795,3.311801,5.756755,5.623394,12015.16969,34.204404,0
4,ID_X6E97FT8IF,5.046181,1091.620117,25.0,0.069468,1.356022,0.0,16.138887,13.271234,0.155609,0.133379,0.0,550.989619,31.466889,1


In [None]:
# (rows, columns) of the training set and of the test set.
train_data.shape, test_data.shape

((8908, 15), (3818, 14))

As we can see, the Train set contains 8908 records while the Test set contains 3818 records. The Train set has 15 columns and the Test set has 14: both share the ID column and the 13 independent variables, and the extra last column in the Train set (Label) is the target variable.

Note that while loading the datasets, I already dropped the column named "Pages Input/sec". When I looked at the data, I noticed that all of its values except one are equal to 0.0, so I concluded that this column would not give me any insight towards my goal.

The same goes for the ID column of the Train set, because it only contains the identifier of each case, which I no longer need there (the Test IDs are kept for the submission file).

In [None]:
# Remove the identifier column from the training set (the test set keeps its IDs).
train_data = train_data.drop(columns='ID')

Before starting this notebook, I did some research online about the subject (cryptojacking) and how professionals detect it. One article suggested checking the I/O efficiency, the network traffic and the process complexity.

So I decided to create these new variables based on the ones already present in the dataset.

Note that the variables I created are very insightful and really help during model training.

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` extended with three derived features.

    The input frame is left untouched, so calling this function (or
    re-running the cell that calls it) never alters the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'I/O Data Bytes', 'I/O Data Operations',
        'Bytes Sent/sent', 'Received Bytes (HTTP)', 'Number of subprocesses'
        and 'Time on processor'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus 'IO_efficiency',
        'network_traffic_ratio' and 'process_complexity'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy: the original implementation wrote the new columns
    # straight into the caller's DataFrame.
    out = df.copy()

    # IO efficiency ratio: bytes moved per I/O operation. The +1 keeps the
    # denominator non-zero when the operation count is 0.
    out['IO_efficiency'] = out['I/O Data Bytes'] / (out['I/O Data Operations'] + 1)

    # Network traffic ratio: bytes sent relative to bytes received (+1 for the
    # same reason as above).
    out['network_traffic_ratio'] = out['Bytes Sent/sent'] / (out['Received Bytes (HTTP)'] + 1)

    # Process complexity score: subprocess count weighted by processor time.
    out['process_complexity'] = out['Number of subprocesses'] * out['Time on processor']

    return out

# Add the derived features to both datasets so they end up with the same
# feature columns.
train_data = engineer_features(train_data)
test_data = engineer_features(test_data)

In [None]:
# Keep the target and the test identifiers aside: both columns are dropped
# before scaling and re-attached afterwards.
label_data = train_data['Label']
test_Id = test_data['ID']

In [None]:
# Remember the training column names (still including 'Label'); they are used
# to rebuild the DataFrames after scaling.
columns = train_data.columns
columns

Index(['I/O Data Operations', 'I/O Data Bytes', 'Number of subprocesses',
       'Time on processor', 'Disk Reading/sec', 'Disc Writing/sec',
       'Bytes Sent/sent', 'Received Bytes (HTTP)', 'Network packets sent',
       'Network packets received', 'Pages Read/sec', 'Page Errors/sec',
       'Confirmed byte radius', 'Label', 'IO_efficiency',
       'network_traffic_ratio', 'process_complexity'],
      dtype='object')

## Standardization

In [None]:
# Standardise the features: the scaler's statistics come from the training
# features and are then reused, unchanged, on the test features.
# NOTE(review): the scaler is fitted on every training row, including the rows
# that are later held out for validation.
scaler = StandardScaler()
train_features = train_data.drop(columns='Label')
test_features = test_data.drop(columns='ID')
train_data = scaler.fit_transform(train_features)
test_data = scaler.transform(test_features)

In [None]:
# Check what kind of object holds the saved column names.
type(columns)

pandas.core.indexes.base.Index

In [None]:
# Wrap the scaled values back into DataFrames with the original feature names,
# then re-attach the target (train) and the identifiers (test) set aside earlier.
feature_names = columns.drop('Label')
train_data = pd.DataFrame(train_data, columns=feature_names).assign(Label=label_data)
test_data = pd.DataFrame(test_data, columns=feature_names).assign(ID=test_Id)

In [None]:
# Preview the training set after scaling.
train_data.head()

Unnamed: 0,I/O Data Operations,I/O Data Bytes,Number of subprocesses,Time on processor,Disk Reading/sec,Disc Writing/sec,Bytes Sent/sent,Received Bytes (HTTP),Network packets sent,Network packets received,Pages Read/sec,Page Errors/sec,Confirmed byte radius,IO_efficiency,network_traffic_ratio,process_complexity,Label
0,1.032806,-0.224169,-0.318375,-0.043505,0.041954,1.287696,-0.258324,-0.18241,-0.200721,-0.197638,1.891673,-0.355833,-0.313696,-0.181312,0.081048,-0.089678,1
1,-0.293554,-0.221326,-0.318375,-0.224362,0.002398,-0.066775,-0.276681,-0.182791,-0.210291,-0.198902,-0.075517,-0.338301,0.431936,-0.160491,-0.11006,-0.236812,0
2,-0.479502,-0.231663,-0.318375,-0.311511,-0.195767,-0.066775,-0.282431,-0.182975,-0.222043,-0.203619,-0.075517,-0.491763,-0.068743,-0.175727,0.010272,-0.307712,0
3,-0.066675,0.027921,0.208765,0.01716,0.551077,0.275135,0.021952,-0.105958,-0.071343,-0.11216,0.370975,1.84878,1.452981,0.081105,-0.954364,-0.011676,0
4,-0.413585,-0.229681,-0.845514,-0.267918,-0.207265,-0.066775,-0.28016,-0.182971,-0.223109,-0.203981,-0.079074,-0.446019,0.763416,-0.17136,0.374081,-0.276046,1


In [None]:
# Preview the test set after scaling.
test_data.head()

Unnamed: 0,I/O Data Operations,I/O Data Bytes,Number of subprocesses,Time on processor,Disk Reading/sec,Disc Writing/sec,Bytes Sent/sent,Received Bytes (HTTP),Network packets sent,Network packets received,Pages Read/sec,Page Errors/sec,Confirmed byte radius,IO_efficiency,network_traffic_ratio,process_complexity,ID
0,-0.049022,0.015455,0.10463,0.000197,-0.070218,-0.066775,-0.115476,-0.174963,-0.141922,-0.174961,-0.054181,-0.410923,-2.122842,0.058398,-0.486107,-0.032322,ID_L7RNFK5JC9
1,-0.45286,-0.227765,-0.142662,-0.311511,-0.087516,-0.066775,-0.255564,-0.182663,-0.195341,-0.196728,-0.075518,-0.479151,1.153477,-0.138945,0.863632,-0.307712,ID_KUTY5K1G5F
2,1.029192,-0.223969,-0.318375,-0.006482,-0.237252,-0.05401,-0.282626,-0.182992,-0.222044,-0.203982,-0.079074,-0.455259,0.416872,-0.181249,0.106818,-0.059558,ID_H6PJWKU831
3,-0.123779,-0.220512,-0.845514,-0.113858,-0.184814,-0.066775,-0.265642,-0.182969,-0.216052,-0.204101,-0.079074,-0.463042,-1.568028,-0.170113,2.839845,-0.164142,ID_DNQXNVTXHE
4,-0.464572,-0.231549,-0.318375,-0.311511,-0.188892,-0.066775,-0.243027,-0.182482,-0.187868,-0.193464,-0.079074,-0.401846,0.654013,-0.178054,0.852201,-0.307712,ID_8CRTF1BJOD


## Train set splitting

In [None]:
# Separate the target from the features, then hold out 15% of the rows for
# validation; stratifying on y keeps the class proportions in both parts.
y = train_data['Label']
X = train_data.drop(columns='Label')

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

Splitting data...


I decided to write my own F1 score function (I really don't know what came over me, haha) to find out how my model performs.

In [None]:
def F1_score(y_true, y_pred):
    """Compute the F1 score of the positive class (label 1) for binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 marks the positive class, anything else is
        counted as negative (the notebook uses 0/1 labels).
    y_pred : array-like
        Predicted labels, same length and encoding as ``y_true``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``. Returns 0.0 when
        there is no true positive, which covers every case where precision,
        recall or their sum would otherwise require a division by zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1

    # Count the cells directly instead of indexing a confusion matrix: the
    # matrix collapses to 1x1 when only one class is present, which made the
    # fixed [1, 1] / [0, 1] / [1, 0] lookups fail.
    TP = int(np.sum(actual_pos & predicted_pos))
    FP = int(np.sum(~actual_pos & predicted_pos))
    FN = int(np.sum(actual_pos & ~predicted_pos))

    # Without any true positive the score is 0 by convention; this also avoids
    # 0/0 in precision, recall and the final harmonic mean.
    if TP == 0:
        return 0.0

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return 2 * (precision * recall) / (precision + recall)

For model training, I started with 3 models: RandomForest, XGBoost and LightGBM. But as I tried all the possible combinations, I noticed that XGBoost performs well on the dataset on its own. So I decided to focus on it and use RandomizedSearchCV to try to find good parameters.

In [None]:
# Search space sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': [500, 750, 1000, 1250, 1300, 1350],
    'max_depth': [4, 5, 6, 7, 8, 9, 10, 12],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    # colsample_bytree is a fraction of the columns and must lie in (0, 1];
    # the previous candidates 1.1 and 1.2 are outside that range, which made
    # every fit that sampled them fail and be scored as NaN.
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 2, 3, 4, 5],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'scale_pos_weight': [1, 2, 3],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

# Base estimator whose hyper-parameters are tuned below.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    tree_method='hist')

# 7 stratified, shuffled folds; fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# Try 200 random parameter combinations, ranked by F1 on the positive class.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=200,
    scoring='f1',
    n_jobs=-1,
    cv=cv,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)
# From here on xgb_model is the best candidate found by the search
# (later cells use it for the test predictions).
xgb_model = random_search.best_estimator_

# Evaluate the selected model on the held-out validation split.
xgb_pred = xgb_model.predict(X_val)
# Print results
print("\nXGBoost Results:")
print(classification_report(y_val, xgb_pred))

Fitting 7 folds for each of 200 candidates, totalling 1400 fits


434 fits failed out of a total of 1400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
224 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1531, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/training.py", line 


XGBoost Results:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       1.00      0.96      0.98       361

    accuracy                           0.99      1337
   macro avg       0.99      0.98      0.99      1337
weighted avg       0.99      0.99      0.99      1337



The model training took me 28 minutes, haha.

In [None]:
# Show the hyper-parameter combination selected by the randomized search.
print(random_search.best_params_)

{'subsample': 0.9, 'scale_pos_weight': 3, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 1300, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}


In [None]:
# Score the validation predictions with the hand-written F1 function.
print(F1_score(y_val, xgb_pred))

0.9788434414668546


In [None]:
# Predict on the test set. Every column except the identifier is a model
# feature, so dropping 'ID' yields the feature columns in their original order.
test_inputs = test_data.drop(columns='ID')
predictions = xgb_model.predict(test_inputs)

## Submissions saving

In [None]:
# Two-column submission table: the test identifiers next to the predicted class.
submission = test_data[['ID']].assign(Target=predictions)

In [None]:
# Stamp the file name with the current date and time so that each run writes
# a new file instead of overwriting an earlier submission.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = f'submission_xgb_{timestamp}.csv'

# index=False: write only the columns of the submission frame, not its row index.
submission.to_csv(output_path, index=False)
print(f"Submission saved to: {output_path}")

Submission saved to: submission_xgb_20241123_130751.csv


The submission_xgb_20241123_130751.csv file is the last submission I made. It is also the model that performs well on the private score.