# Predict ML Methods
- Environment setup
- Load selected features
- Load CIC-IDS2017 improved version, CES-CIC-IDS2018 data
   - Generate data only contains selected features
- Apply algorithms
- Retrained model with misclassified data

### Environment setup

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!wget https://downloads.rclone.org/v1.63.0/rclone-v1.63.0-linux-amd64.deb
!apt install ./rclone-v1.63.0-linux-amd64.deb
!rclone config
!sudo apt-get -y install fuse3

--2023-08-19 05:50:32--  https://downloads.rclone.org/v1.63.0/rclone-v1.63.0-linux-amd64.deb
Resolving downloads.rclone.org (downloads.rclone.org)... 95.217.6.16, 2a01:4f9:c012:7154::1
Connecting to downloads.rclone.org (downloads.rclone.org)|95.217.6.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18373062 (18M) [application/vnd.debian.binary-package]
Saving to: ‘rclone-v1.63.0-linux-amd64.deb’


2023-08-19 05:50:32 (43.4 MB/s) - ‘rclone-v1.63.0-linux-amd64.deb’ saved [18373062/18373062]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'rclone' instead of './rclone-v1.63.0-linux-amd64.deb'
The following NEW packages will be installed:
  rclone
0 upgraded, 1 newly installed, 0 to remove and 16 not upgraded.
Need to get 0 B/18.4 MB of archives.
After this operation, 56.7 MB of additional disk space will be used.
Get:1 /content/rclone-v1.63.0-linux-amd64.deb rclone amd64 1.63.0 [18.4 MB]
Selecti

In [3]:
!sudo mkdir /content/onedrive
!nohup rclone --vfs-cache-mode writes mount onedrive: /content/onedrive &

nohup: appending output to 'nohup.out'


### Load selected features

In [4]:
# Import Necessary Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score # performance metrics
import xgboost as xgb

import time
import joblib

In [14]:
feature_scores = pd.read_csv('/content/onedrive/features_scores.csv', names=['feature', 'importance ratio'], header=0)

In [15]:
feature_scores

Unnamed: 0,feature,importance ratio
0,Packet Length Std,0.077944
1,Bwd Packet Length Std,0.075532
2,Packet Length Variance,0.075460
3,Bwd Packet Length Mean,0.069707
4,Average Packet Size,0.051550
...,...,...
71,Bwd Avg Packets/Bulk,0.000000
72,Bwd Avg Bytes/Bulk,0.000000
73,Fwd Avg Bulk Rate,0.000000
74,Fwd Avg Packets/Bulk,0.000000


In [16]:
feature_extract_item = list(feature_scores[feature_scores['importance ratio'] > 0.03]['feature'])
feature_extract_item

['Packet Length Std',
 'Bwd Packet Length Std',
 'Packet Length Variance',
 'Bwd Packet Length Mean',
 'Average Packet Size',
 'Bwd Packet Length Max',
 'Avg Bwd Segment Size',
 'Packet Length Mean',
 'Max Packet Length',
 'Subflow Bwd Bytes']

In [8]:
feature_scores_resampling = pd.read_csv('/content/onedrive/feature_scores_resampling.csv', names=['feature', 'importance ratio'], header = 0)

In [9]:
feature_scores_resampling

Unnamed: 0,feature,importance ratio
0,Average Packet Size,0.068069
1,Packet Length Variance,0.066387
2,Bwd Packet Length Std,0.065651
3,Packet Length Std,0.064082
4,Bwd Packet Length Max,0.046398
...,...,...
71,Bwd PSH Flags,0.000000
72,Fwd Avg Bytes/Bulk,0.000000
73,Bwd Avg Bulk Rate,0.000000
74,Bwd Avg Packets/Bulk,0.000000


In [10]:
feature_resampling_extract_item = list(feature_scores_resampling[feature_scores_resampling['importance ratio'] > 0.03]['feature'])
feature_resampling_extract_item

['Average Packet Size',
 'Packet Length Variance',
 'Bwd Packet Length Std',
 'Packet Length Std',
 'Bwd Packet Length Max',
 'Packet Length Mean',
 'Bwd Packet Length Mean',
 'Max Packet Length',
 'Avg Bwd Segment Size',
 'Avg Fwd Segment Size',
 'Init_Win_bytes_backward']

### Load CIC-IDS2017 improved version, CES-CIC-IDS2018 data

In [7]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.7.0


In [22]:
# data2017_corrected = pd.read_parquet('/content/onedrive/data_2017_corrected.parquet', engine="fastparquet")
# data2018_original = pd.read_parquet('/content/onedrive/data_2018_original.parquet', engine="fastparquet")
data2018_corrected = pd.read_parquet('/content/onedrive/data_2018_corrected_sampling.parquet', engine="fastparquet")

In [None]:
data2018_original = pd.read_parquet('/content/onedrive/data_2018_original.parquet', engine="fastparquet")

In [None]:
# data2017_corrected_resampling = pd.read_parquet('/content/onedrive/data_2017_corrected_resampling.parquet', engine="fastparquet")
# data2018_original_resampling = pd.read_parquet('/content/onedrive/data_2018_original_resampling.parquet', engine="fastparquet")
data2018_corrected_resampling = pd.read_parquet('/content/onedrive/data_2018_corrected_sampling_resampling.parquet', engine="fastparquet")

In [None]:
df2017_corrected_y = data2017_corrected['label_encoded']
df2017_corrected_X = data2017_corrected[feature_extract_item]

In [None]:
df2017_corrected_X_r = data2017_corrected_resampling[feature_resampling_extract_item]
df2017_corrected_y_r = data2017_corrected_resampling['label_encoded']

In [None]:
# data_test_2017_corrected = pd.concat([df2017_corrected_X, df2017_corrected_y], axis=1)
data_test_2017_corrected_r = pd.concat([df2017_corrected_X_r, df2017_corrected_y_r], axis=1)

In [None]:
scaler = StandardScaler()
# X_test = scaler.fit_transform(df2017_corrected_X)
X_test_r = scaler.fit_transform(df2017_corrected_X_r)

In [None]:
del data2017_corrected_resampling

In [None]:
df2018_original_y = data2018_original['label_encoded']
df2018_original_X = data2018_original[feature_extract_item]

In [None]:
del data2018_original

In [None]:
df2018_original_X_r = data2018_original_resampling[feature_resampling_extract_item]
df2018_original_y_r = data2018_original_resampling['label_encoded']

In [None]:
del data2018_original_resampling

In [None]:
# data_test_2018_original = pd.concat([df2018_original_X, df2018_original_y], axis=1)
# data_test_2018_original_r = pd.concat([df2018_original_X_r, df2018_original_y_r], axis=1)
scaler = StandardScaler()
# X_test = scaler.fit_transform(df2018_original_X)
X_test_r = scaler.fit_transform(df2018_original_X_r)


In [None]:
df2018_corrected_y = data2018_corrected['label_encoded']
df2018_corrected_X = data2018_corrected[feature_extract_item]

In [None]:
del data2018_corrected

In [None]:
df2018_corrected_X_r = data2018_corrected_resampling[feature_resampling_extract_item]
df2018_corrected_y_r = data2018_corrected_resampling['label_encoded']

In [None]:
del data2018_corrected_resampling

In [None]:
data_test_2018_corrected = pd.concat([df2018_corrected_X, df2018_corrected_y], axis=1)
# data_test_2018_corrected_r = pd.concat([df2018_corrected_X_r, df2018_corrected_y_r], axis=1)

In [None]:
del data_test_2018_corrected

In [None]:
scaler = StandardScaler()
# X_test = scaler.fit_transform(df2018_corrected_X)
X_test_r = scaler.fit_transform(df2018_corrected_X_r)

### Apply algorithms

In [9]:
# Define constant value
METHOD_RANDOM_FOREST = 'rf'
METHOD_XGBOOST = 'xgb'
METHOD_KNN = 'knn'


In [10]:
methods = [METHOD_RANDOM_FOREST, METHOD_XGBOOST, METHOD_KNN]

In [11]:
def load_pretrained_model(method, is_resample):
    suffix_txt = ""
    if is_resample:
        suffix_txt = "_resample"
    if method == METHOD_XGBOOST:
        # ml_file_name = f"/content/onedrive/models/model_{method}{suffix_txt}.json"
        ml_file_name = f"/content/onedrive/models/model_{method}{suffix_txt}_finetune_2.json"
        ml = XGBClassifier()
        ml.load_model(ml_file_name)
    else:
        # ml_file_name = f"/content/onedrive/models/model_{method}{suffix_txt}.joblib"
        ml_file_name = f"/content/onedrive/models/model_{method}{suffix_txt}_finetune_2.joblib"
        ml = joblib.load(ml_file_name)
    ml.warm_start = True
    print(f'load_pretrained_model: {ml_file_name}')

    return ml

In [12]:
def predict_methods(df, x, y, method, is_resample):
    print(f'predict {method}')



    ml = load_pretrained_model(method, is_resample)

    ypreds = ml.predict(x)

    df[f'ypreds_{method}'] = ypreds

    print("Accuracy",accuracy_score(y, ypreds))
    print(classification_report(y, ypreds))
    print("Confusion Matrix:")
    print(confusion_matrix(y, ypreds))

    return ml

### 2017 improved

In [None]:
for method in methods:
    predict_methods(data_test_2017_corrected, X_test, df2017_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf.joblib
Accuracy 0.8660343404742256
              precision    recall  f1-score   support

           0       0.85      1.00      0.92   1594540
           1       0.99      0.45      0.62    505431

    accuracy                           0.87   2099971
   macro avg       0.92      0.72      0.77   2099971
weighted avg       0.88      0.87      0.85   2099971

Confusion Matrix:
[[1592318    2222]
 [ 279102  226329]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb.json
Accuracy 0.7507117955438433
              precision    recall  f1-score   support

           0       0.76      0.99      0.86   1594540
           1       0.00      0.00      0.00    505431

    accuracy                           0.75   2099971
   macro avg       0.38      0.49      0.43   2099971
weighted avg       0.58      0.75      0.65   2099971

Confusion Matrix:
[[1576424   18116]
 [ 505382      49]]
predict knn
load_pretraine

In [None]:
for method in methods:
    predict_methods(data_test_2017_corrected_r, X_test_r, df2017_corrected_y_r, method, True)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_resample.joblib
Accuracy 0.7495573085149111
              precision    recall  f1-score   support

           0       0.67      0.99      0.80    505431
           1       0.98      0.51      0.67    505431

    accuracy                           0.75   1010862
   macro avg       0.82      0.75      0.73   1010862
weighted avg       0.82      0.75      0.73   1010862

Confusion Matrix:
[[498952   6479]
 [246684 258747]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_resample.json
Accuracy 0.5006875320271215
              precision    recall  f1-score   support

           0       0.50      1.00      0.67    505431
           1       0.74      0.00      0.00    505431

    accuracy                           0.50   1010862
   macro avg       0.62      0.50      0.34   1010862
weighted avg       0.62      0.50      0.34   1010862

Confusion Matrix:
[[505048    383]
 [504353   1078]]
predict knn
load

In [None]:
data_test_2017_corrected.head()

Unnamed: 0,Packet Length Std,Bwd Packet Length Std,Packet Length Variance,Bwd Packet Length Mean,Average Packet Size,Bwd Packet Length Max,Avg Bwd Segment Size,Packet Length Mean,Max Packet Length,Subflow Bwd Bytes,label_encoded,ypreds_rf,ypreds_xgb,ypreds_knn
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0,0
1,174.984174,0.0,30619.460993,316.0,239.666667,316,316.0,239.666667,403,105,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0
3,55.78532,0.0,3112.001938,0.0,95.75,0,0.0,95.75,227,0,0,0,0,0
4,0.0,0.0,0.0,48.0,48.0,48,48.0,48.0,48,24,0,0,0,0


In [None]:
data_test_2017_corrected_r.head()

Unnamed: 0,Average Packet Size,Packet Length Variance,Bwd Packet Length Std,Packet Length Std,Bwd Packet Length Max,Packet Length Mean,Bwd Packet Length Mean,Max Packet Length,Avg Bwd Segment Size,Avg Fwd Segment Size,Init_Win_bytes_backward,label_encoded,ypreds_rf,ypreds_xgb,ypreds_knn
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0,0
1,239.666667,30619.460993,0.0,174.984174,316,239.666667,316.0,403,316.0,201.5,2079,0,0,0,0
2,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0
3,95.75,3112.001938,0.0,55.78532,0,95.75,0.0,227,0.0,95.75,0,0,0,0,0
4,48.0,0.0,0.0,0.0,48,48.0,48.0,48,48.0,48.0,0,0,0,0,0


In [None]:
# data_test_2017_corrected.to_parquet('/content/onedrive/test_result/data_test_tr_2017_corrected.parquet', index = False, compression=None, engine='fastparquet')
data_test_2017_corrected_r.to_parquet('/content/onedrive/test_result/data_test_tr_2017_corrected_resample.parquet', index = False, compression=None, engine='fastparquet')

### 2018 original dataset

In [None]:
for method in methods:
    predict_methods(data_test_2018_original, X_test, df2018_original_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf.joblib
Accuracy 0.8263845678046969
              precision    recall  f1-score   support

           0       0.87      0.94      0.90   9493199
           1       0.01      0.01      0.01   1325845

    accuracy                           0.83  10819044
   macro avg       0.44      0.47      0.46  10819044
weighted avg       0.77      0.83      0.79  10819044

Confusion Matrix:
[[8933033  560166]
 [1318187    7658]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb.json
Accuracy 0.8247105751672699
              precision    recall  f1-score   support

           0       0.87      0.94      0.90   9493199
           1       0.02      0.01      0.01   1325845

    accuracy                           0.82  10819044
   macro avg       0.44      0.47      0.46  10819044
weighted avg       0.77      0.82      0.79  10819044

Confusion Matrix:
[[8913183  580016]
 [1316448    9397]]
predict knn
load_pretraine

In [None]:
for method in methods:
    predict_methods(data_test_2018_original_r, X_test_r, df2018_original_y_r, method, True)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_resample.joblib
Accuracy 0.4434560600975227
              precision    recall  f1-score   support

           0       0.47      0.87      0.61   1325845
           1       0.09      0.01      0.02   1325845

    accuracy                           0.44   2651690
   macro avg       0.28      0.44      0.32   2651690
weighted avg       0.28      0.44      0.32   2651690

Confusion Matrix:
[[1158462  167383]
 [1308399   17446]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_resample.json
Accuracy 0.49875211657471274
              precision    recall  f1-score   support

           0       0.50      1.00      0.67   1325845
           1       0.10      0.00      0.00   1325845

    accuracy                           0.50   2651690
   macro avg       0.30      0.50      0.33   2651690
weighted avg       0.30      0.50      0.33   2651690

Confusion Matrix:
[[1322133    3712]
 [1325442     403]]
predict

In [None]:
# data_test_2018_original.to_parquet('/content/onedrive/test_result/data_test_tr_2018_original.parquet', index = False, compression=None, engine='fastparquet')
data_test_2018_original_r.to_parquet('/content/onedrive/test_result/data_test_tr_2018_original_resample.parquet', index = False, compression=None, engine='fastparquet')

### 2018 improved

In [None]:
for method in methods:
    predict_methods(data_test_2018_corrected, X_test, df2018_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf.joblib
Accuracy 0.6070015342129488
              precision    recall  f1-score   support

           0       0.61      1.00      0.76   5935341
           1       0.02      0.00      0.00   3841659

    accuracy                           0.61   9777000
   macro avg       0.32      0.50      0.38   9777000
weighted avg       0.38      0.61      0.46   9777000

Confusion Matrix:
[[5934636     705]
 [3841641      18]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb.json
Accuracy 0.5999435409634857
              precision    recall  f1-score   support

           0       0.60      0.99      0.75   5935341
           1       0.00      0.00      0.00   3841659

    accuracy                           0.60   9777000
   macro avg       0.30      0.49      0.38   9777000
weighted avg       0.37      0.60      0.46   9777000

Confusion Matrix:
[[5865552   69789]
 [3841563      96]]
predict knn
load_pretraine

In [None]:
for method in methods:
    predict_methods(data_test_2018_corrected_r, X_test_r, df2018_corrected_y_r, method, True)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_resample.joblib
Accuracy 0.4544643697321715
              precision    recall  f1-score   support

           0       0.48      0.91      0.62   3841654
           1       0.00      0.00      0.00   3841659

    accuracy                           0.45   7683313
   macro avg       0.24      0.45      0.31   7683313
weighted avg       0.24      0.45      0.31   7683313

Confusion Matrix:
[[3490471  351183]
 [3840338    1321]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_resample.json
Accuracy 0.4999137741752809
              precision    recall  f1-score   support

           0       0.50      1.00      0.67   3841654
           1       0.12      0.00      0.00   3841659

    accuracy                           0.50   7683313
   macro avg       0.31      0.50      0.33   7683313
weighted avg       0.31      0.50      0.33   7683313

Confusion Matrix:
[[3840888     766]
 [3841553     106]]
predict 

In [None]:
data_test_2018_corrected.to_parquet('/content/onedrive/test_result/data_test_tr_2018_corrected.parquet', index = False, compression=None, engine='fastparquet')
# data_test_2018_corrected_r.to_parquet('/content/onedrive/test_result/data_test_tr_2018_corrected_resample.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
del data_test_2018_corrected

In [None]:
data_test_2018_corrected_r.to_parquet('/content/onedrive/test_result/data_test_tr_2018_corrected_resample.parquet', index = False, compression=None, engine='fastparquet')

### Retrain the model, feed new data

In [None]:
best_param_rf = {'criterion': 'entropy', 'max_depth': 5, 'max_features': 1, 'max_leaf_nodes': 6, 'n_estimators': 3}
best_param_xgb = {'colsample_bytree': 0.8653986157607052, 'gamma': 4.049853844532946, 'max_depth': 5, 'min_child_weight': 7.0, 'n_estimators': 2, 'reg_alpha': 52.0, 'reg_lambda': 0.9937045075256274}
best_param_knn = {'n_neighbors': 5}

In [None]:
def train_rf(X_train, y_train, is_resample):
    print('~~~~START TRAIN RANDOM FOREST~~~~')
    clf = RandomForestClassifier(random_state=42)
    clf.set_params(**best_param_rf)
    start_time = time.time()
    # Train the model on the training data
    clf.fit(X_train, y_train)
    end_time = time.time()
    time_taken = end_time - start_time
    print("Time taken to fit the model:", time_taken, "seconds")
    return clf

In [None]:
def train_xgb(X_train, y_train, is_resample):
    print('~~~~START TRAIN XGBOOST~~~~')
    clf = xgb.XGBClassifier(random_state=42)
    clf.set_params(**best_param_xgb)
    start_time = time.time()
    # Train the model on the training data
    clf.fit(X_train, y_train)
    end_time = time.time()
    time_taken = end_time - start_time
    print("Time taken to fit the model:", time_taken, "seconds")
    return clf

In [None]:
def train_knn(X_train, y_train, is_resample):
    print('~~~~START TRAIN KNN~~~~')
    clf = KNeighborsClassifier()
    clf.set_params(**best_param_knn)
    start_time = time.time()
    # Train the model on the training data
    clf.fit(X_train, y_train)
    end_time = time.time()
    time_taken = end_time - start_time
    print("Time taken to fit the model:", time_taken, "seconds")
    return clf

In [None]:
def predict_methods(X, y, ml, method, is_resample):
    print(f'predict {method}')
    start_time = time.time()
    ypreds = ml.predict(X)

    # df[f'ypreds_{method}'] = ypreds

    end_time = time.time()
    time_taken = end_time - start_time
    print("Time taken to predict the model:", time_taken, "seconds")
    print("Accuracy",accuracy_score(y, ypreds))
    print(classification_report(y, ypreds))
    print("Confusion Matrix:")
    print(confusion_matrix(y, ypreds))

In [None]:
def gen_misclassified_data(df, y):
  return df[df['label_encoded'] != df[y]]

In [None]:
def gen_train_x_y(df, is_resampling = False):
  if is_resampling:
    X_retrained = df.loc[:, 'Average Packet Size': 'Init_Win_bytes_backward']
  else:
    X_retrained = df.loc[:, 'Packet Length Std': 'Subflow Bwd Bytes']
  y_retrained = df['label_encoded']

  scaler = StandardScaler()
  X_retrained = scaler.fit_transform(X_retrained)
  if is_resampling:
    global X_upstream_r, y_upstream_r
    X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
    y_train = y_upstream_r.append(y_retrained)
    del X_retrained
  else:
    global X_upstream
    X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
    y_train = y_upstream.append(y_retrained)
    del X_retrained

  return X_train, y_train

In [None]:
def training_methods(mis_df, X_test, y_test, methods, is_resample):
    ml, md = None, None
    for method in methods:
      print(f'start train {method} method')
      if method == METHOD_RANDOM_FOREST:
        misclassfied_data = gen_misclassified_data(mis_df, 'ypreds_rf')
        X_train, y_train = gen_train_x_y(misclassfied_data, is_resample)
        ml = train_rf(X_train, y_train, is_resample)
      elif method == METHOD_XGBOOST:
        misclassfied_data = gen_misclassified_data(mis_df, 'ypreds_xgb')
        X_train, y_train = gen_train_x_y(misclassfied_data, is_resample)
        ml = train_xgb(X_train, y_train, is_resample)
      elif method == METHOD_KNN:
        misclassfied_data = gen_misclassified_data(mis_df, 'ypreds_knn')
        X_train, y_train = gen_train_x_y(misclassfied_data, is_resample)
        ml = train_knn(X_train, y_train, is_resample)

      predict_methods(X_test, y_test, ml, method, is_resample)

In [None]:
# upstream_model_data = pd.read_parquet('/content/onedrive/models/upstream_model_data.parquet')
upstream_model_data_resampling = pd.read_parquet('/content/onedrive/models/upstream_model_data_resampling.parquet')

In [None]:
# X_upstream, y_upstream = (upstream_model_data.loc[:, 'Packet Length Std': 'Subflow Bwd Bytes'], upstream_model_data['label_encoded'])
X_upstream_r, y_upstream_r = (upstream_model_data_resampling.loc[:, 'Average Packet Size': 'Init_Win_bytes_backward'], upstream_model_data_resampling['label_encoded'])

In [None]:
del upstream_model_data_resampling

In [None]:
# data_test_2017_corrected = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2017_corrected.parquet')
# data_test_2017_corrected_r = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2017_corrected_resample.parquet')
# data_test_2018_original = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2018_original.parquet')
# data_test_2018_original_r = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2018_original_resample.parquet')
# data_test_2018_corrected = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2018_corrected.parquet')
data_test_2018_corrected_r = pd.read_parquet('/content/onedrive/test_result/data_test_tr_2018_corrected_resample.parquet')

### Fine tune with feeding misclassified data

### 2017 improved

In [None]:
# for method in methods:
training_methods(data_test_2017_corrected, X_test, df2017_corrected_y, methods, False)

start train rf method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 5.0192365646362305 seconds
predict rf




Time taken to predict the model: 0.4791691303253174 seconds
Accuracy 0.8617004711017437
              precision    recall  f1-score   support

           0       0.85      1.00      0.92   1594540
           1       0.97      0.44      0.60    505431

    accuracy                           0.86   2099971
   macro avg       0.91      0.72      0.76   2099971
weighted avg       0.88      0.86      0.84   2099971

Confusion Matrix:
[[1587165    7375]
 [ 283050  222381]]
start train xgb method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 6.237184047698975 seconds
predict xgb
Time taken to predict the model: 0.1679680347442627 seconds
Accuracy 0.5566467346453832
              precision    recall  f1-score   support

           0       0.72      0.68      0.70   1594540
           1       0.15      0.18      0.16    505431

    accuracy                           0.56   2099971
   macro avg       0.44      0.43      0.43   2099971
weighted avg       0.58      0.56      0.57   2099971

Confusion Matrix:
[[1077093  517447]
 [ 413582   91849]]
start train knn method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 12.057262659072876 seconds
predict knn




Time taken to predict the model: 570.4969012737274 seconds
Accuracy 0.519880988832703
              precision    recall  f1-score   support

           0       0.98      0.38      0.54   1594540
           1       0.33      0.97      0.49    505431

    accuracy                           0.52   2099971
   macro avg       0.65      0.67      0.52   2099971
weighted avg       0.82      0.52      0.53   2099971

Confusion Matrix:
[[600833 993707]
 [ 14529 490902]]


### resampling

In [None]:
training_methods(data_test_2017_corrected_r, X_test_r, df2017_corrected_y_r, methods, True)

start train rf method
~~~~START TRAIN RANDOM FOREST~~~~


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


Time taken to fit the model: 1.0667822360992432 seconds
predict rf
Time taken to predict the model: 0.1733860969543457 seconds




Accuracy 0.7435416505912775
              precision    recall  f1-score   support

           0       0.67      0.97      0.79    505431
           1       0.95      0.52      0.67    505431

    accuracy                           0.74   1010862
   macro avg       0.81      0.74      0.73   1010862
weighted avg       0.81      0.74      0.73   1010862

Confusion Matrix:
[[490955  14476]
 [244768 260663]]
start train xgb method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 1.1065099239349365 seconds
predict xgb
Time taken to predict the model: 0.02514195442199707 seconds
Accuracy 0.6428731122546896
              precision    recall  f1-score   support

           0       0.96      0.30      0.46    505431
           1       0.58      0.99      0.73    505431

    accuracy                           0.64   1010862
   macro avg       0.77      0.64      0.59   1010862
weighted avg       0.77      0.64      0.59   1010862

Confusion Matrix:
[[151058 354373]
 [  6633 498798]]
start train knn method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 3.0159974098205566 seconds
predict knn




Time taken to predict the model: 195.6475613117218 seconds
Accuracy 0.7563079826919995
              precision    recall  f1-score   support

           0       0.67      0.99      0.80    505431
           1       0.98      0.52      0.68    505431

    accuracy                           0.76   1010862
   macro avg       0.83      0.76      0.74   1010862
weighted avg       0.83      0.76      0.74   1010862

Confusion Matrix:
[[501289   4142]
 [242197 263234]]


In [None]:
del data_test_2017_corrected_r, X_test_r, df2017_corrected_y_r

### 2018 original


In [None]:
training_methods(data_test_2018_original, X_test, df2018_original_y, methods, False)

start train rf method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 5.455028772354126 seconds
predict rf




Time taken to predict the model: 2.4359025955200195 seconds
Accuracy 0.6298652635112677
              precision    recall  f1-score   support

           0       0.90      0.65      0.75   9493199
           1       0.16      0.50      0.25   1325845

    accuracy                           0.63  10819044
   macro avg       0.53      0.57      0.50  10819044
weighted avg       0.81      0.63      0.69  10819044

Confusion Matrix:
[[6156470 3336729]
 [ 667775  658070]]
start train xgb method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 3.3151843547821045 seconds
predict xgb
Time taken to predict the model: 0.24299359321594238 seconds
Accuracy 0.8608772642019018
              precision    recall  f1-score   support

           0       0.88      0.98      0.93   9493199
           1       0.01      0.00      0.00   1325845

    accuracy                           0.86  10819044
   macro avg       0.44      0.49      0.46  10819044
weighted avg       0.77      0.86      0.81  10819044

Confusion Matrix:
[[9311653  181546]
 [1323629    2216]]
start train knn method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 13.182660102844238 seconds
predict knn




Time taken to predict the model: 4580.3117780685425 seconds
Accuracy 0.4266596013474019
              precision    recall  f1-score   support

           0       0.86      0.41      0.56   9493199
           1       0.11      0.52      0.18   1325845

    accuracy                           0.43  10819044
   macro avg       0.49      0.47      0.37  10819044
weighted avg       0.77      0.43      0.51  10819044

Confusion Matrix:
[[3921803 5571396]
 [ 631599  694246]]


In [None]:
del data_test_2018_original, X_test, df2018_original_y

### resampling

In [None]:
training_methods(data_test_2018_original_r, X_test_r, df2018_original_y_r, methods, True)

start train rf method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 2.7362477779388428 seconds
predict rf




Time taken to predict the model: 0.5883314609527588 seconds
Accuracy 0.4873329838706636
              precision    recall  f1-score   support

           0       0.48      0.38      0.42   1325845
           1       0.49      0.60      0.54   1325845

    accuracy                           0.49   2651690
   macro avg       0.49      0.49      0.48   2651690
weighted avg       0.49      0.49      0.48   2651690

Confusion Matrix:
[[500378 825467]
 [533967 791878]]
start train xgb method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 2.6302073001861572 seconds
predict xgb
Time taken to predict the model: 0.10053706169128418 seconds
Accuracy 0.34444524058242104
              precision    recall  f1-score   support

           0       0.40      0.64      0.49   1325845
           1       0.12      0.05      0.07   1325845

    accuracy                           0.34   2651690
   macro avg       0.26      0.34      0.28   2651690
weighted avg       0.26      0.34      0.28   2651690

Confusion Matrix:
[[ 849452  476393]
 [1261935   63910]]
start train knn method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 5.904505014419556 seconds
predict knn




Time taken to predict the model: 572.9289026260376 seconds
Accuracy 0.422799422255241
              precision    recall  f1-score   support

           0       0.45      0.66      0.53   1325845
           1       0.36      0.19      0.25   1325845

    accuracy                           0.42   2651690
   macro avg       0.40      0.42      0.39   2651690
weighted avg       0.40      0.42      0.39   2651690

Confusion Matrix:
[[ 870127  455718]
 [1074839  251006]]


In [None]:
del data_test_2018_original_r, X_test_r, df2018_original_y_r

### 2018 Improved

In [None]:
training_methods(data_test_2018_corrected, X_test, df2018_corrected_y, methods, False)

start train rf method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 8.053287029266357 seconds
predict rf




Time taken to predict the model: 2.2523415088653564 seconds
Accuracy 0.4171290784494221
              precision    recall  f1-score   support

           0       0.53      0.40      0.45   5935341
           1       0.32      0.45      0.38   3841659

    accuracy                           0.42   9777000
   macro avg       0.43      0.42      0.41   9777000
weighted avg       0.45      0.42      0.42   9777000

Confusion Matrix:
[[2362260 3573081]
 [2125648 1716011]]
start train xgb method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 3.3041605949401855 seconds
predict xgb
Time taken to predict the model: 0.2630033493041992 seconds
Accuracy 0.5519054924823565
              precision    recall  f1-score   support

           0       0.58      0.91      0.71   5935341
           1       0.02      0.00      0.00   3841659

    accuracy                           0.55   9777000
   macro avg       0.30      0.45      0.36   9777000
weighted avg       0.36      0.55      0.43   9777000

Confusion Matrix:
[[5386985  548356]
 [3832664    8995]]
start train knn method


  X_train = X_upstream.append(pd.DataFrame(X_retrained, columns = feature_extract_item[0:10]))
  y_train = y_upstream.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 23.97339391708374 seconds
predict knn




Time taken to predict the model: 2737.6262786388397 seconds
Accuracy 0.4478408509767822
              precision    recall  f1-score   support

           0       0.53      0.70      0.60   5935341
           1       0.12      0.06      0.08   3841659

    accuracy                           0.45   9777000
   macro avg       0.33      0.38      0.34   9777000
weighted avg       0.37      0.45      0.40   9777000

Confusion Matrix:
[[4131961 1803380]
 [3595080  246579]]


In [None]:
del data_test_2018_corrected, X_test, df2018_corrected_y

### resampling


In [None]:
training_methods(data_test_2018_corrected_r, X_test_r, df2018_corrected_y_r, methods, True)

start train rf method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 5.096403121948242 seconds
predict rf




Time taken to predict the model: 1.6913998126983643 seconds
Accuracy 0.766220509303734
              precision    recall  f1-score   support

           0       0.83      0.67      0.74   3841654
           1       0.72      0.86      0.79   3841659

    accuracy                           0.77   7683313
   macro avg       0.78      0.77      0.76   7683313
weighted avg       0.78      0.77      0.76   7683313

Confusion Matrix:
[[2590501 1251153]
 [ 545048 3296611]]
start train xgb method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 1.9737293720245361 seconds
predict xgb
Time taken to predict the model: 0.19787240028381348 seconds
Accuracy 0.7267167171245008
              precision    recall  f1-score   support

           0       0.84      0.56      0.67   3841654
           1       0.67      0.90      0.77   3841659

    accuracy                           0.73   7683313
   macro avg       0.76      0.73      0.72   7683313
weighted avg       0.76      0.73      0.72   7683313

Confusion Matrix:
[[2138845 1702809]
 [ 396912 3444747]]
start train knn method


  X_train = X_upstream_r.append(pd.DataFrame(X_retrained, columns = feature_resampling_extract_item[0:11]))
  y_train = y_upstream_r.append(y_retrained)


~~~~START TRAIN KNN~~~~
Time taken to fit the model: 7.270895957946777 seconds
predict knn




Time taken to predict the model: 8016.9063601493835 seconds
Accuracy 0.667742417886659
              precision    recall  f1-score   support

           0       0.82      0.43      0.57   3841654
           1       0.61      0.90      0.73   3841659

    accuracy                           0.67   7683313
   macro avg       0.72      0.67      0.65   7683313
weighted avg       0.72      0.67      0.65   7683313

Confusion Matrix:
[[1664321 2177333]
 [ 375506 3466153]]


### Experiment

In [18]:
common_feature_item = ['Bwd Packet Length Mean',
                       'Bwd Packet Length Max',
                       'Avg Bwd Segment Size',
                       'Max Packet Length',
                       'Total Length of Fwd Packets',
                       'Total Length of Bwd Packets',
                       'Total Backward Packets']

In [None]:
df2017_corrected_y = data2017_corrected['label_encoded']
df2017_corrected_X = data2017_corrected[common_feature_item]

In [None]:
scaler = StandardScaler()
X_test = scaler.fit_transform(df2017_corrected_X)

In [None]:
data_test_2017_corrected = pd.DataFrame()

In [None]:
for method in methods:
    predict_methods(data_test_2017_corrected, X_test, df2017_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune.joblib
Accuracy 0.8506312706223086
              precision    recall  f1-score   support

           0       0.84      1.00      0.91   1594540
           1       1.00      0.38      0.55    505431

    accuracy                           0.85   2099971
   macro avg       0.92      0.69      0.73   2099971
weighted avg       0.87      0.85      0.82   2099971

Confusion Matrix:
[[1593622     918]
 [ 312752  192679]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune.json
Accuracy 0.7811707875965906
              precision    recall  f1-score   support

           0       0.78      1.00      0.87   1594540
           1       0.94      0.10      0.18    505431

    accuracy                           0.78   2099971
   macro avg       0.86      0.55      0.53   2099971
weighted avg       0.82      0.78      0.71   2099971

Confusion Matrix:
[[1591146    3394]
 [ 456141   49290]]
predict 

In [None]:
del data2017_corrected, df2017_corrected_y, df2017_corrected_X, X_test

In [None]:
df2018_original_y = data2018_original['label_encoded']
df2018_original_X = data2018_original[common_feature_item]
scaler = StandardScaler()
X_test = scaler.fit_transform(df2018_original_X)
data_test_2018_original = pd.DataFrame()


In [None]:
for method in methods:
    predict_methods(data_test_2018_original, X_test, df2018_original_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune.joblib
Accuracy 0.8307480771868568
              precision    recall  f1-score   support

           0       0.87      0.95      0.91   9493199
           1       0.01      0.01      0.01   1325845

    accuracy                           0.83  10819044
   macro avg       0.44      0.48      0.46  10819044
weighted avg       0.77      0.83      0.80  10819044

Confusion Matrix:
[[8981104  512095]
 [1319049    6796]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune.json
Accuracy 0.8698202909610129
              precision    recall  f1-score   support

           0       0.88      0.99      0.93   9493199
           1       0.02      0.00      0.00   1325845

    accuracy                           0.87  10819044
   macro avg       0.45      0.50      0.47  10819044
weighted avg       0.77      0.87      0.82  10819044

Confusion Matrix:
[[9409209   83990]
 [1324430    1415]]
predict 

In [None]:
del data2018_original, df2018_original_y, df2018_original_X, X_test


In [19]:
df2018_corrected_y = data2018_corrected['label_encoded']
df2018_corrected_X = data2018_corrected[common_feature_item]
scaler = StandardScaler()
X_test = scaler.fit_transform(df2018_corrected_X)
data_test_2018_corrected = pd.DataFrame()


In [20]:
for method in methods:
    predict_methods(data_test_2018_corrected, X_test, df2018_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune.joblib
Accuracy 0.605633323105247
              precision    recall  f1-score   support

           0       0.61      1.00      0.75   5935341
           1       0.00      0.00      0.00   3841659

    accuracy                           0.61   9777000
   macro avg       0.30      0.50      0.38   9777000
weighted avg       0.37      0.61      0.46   9777000

Confusion Matrix:
[[5921231   14110]
 [3841613      46]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune.json
Accuracy 0.6027372404623095
              precision    recall  f1-score   support

           0       0.61      0.99      0.75   5935341
           1       0.00      0.00      0.00   3841659

    accuracy                           0.60   9777000
   macro avg       0.30      0.50      0.38   9777000
weighted avg       0.37      0.60      0.46   9777000

Confusion Matrix:
[[5892945   42396]
 [3841642      17]]
predict k

In [17]:
feature_extract_item.append('Total Length of Fwd Packets')
feature_extract_item.append('Total Length of Bwd Packets')
feature_extract_item.append('Total Backward Packets')



In [18]:
feature_extract_item

['Packet Length Std',
 'Bwd Packet Length Std',
 'Packet Length Variance',
 'Bwd Packet Length Mean',
 'Average Packet Size',
 'Bwd Packet Length Max',
 'Avg Bwd Segment Size',
 'Packet Length Mean',
 'Max Packet Length',
 'Subflow Bwd Bytes',
 'Total Length of Fwd Packets',
 'Total Length of Bwd Packets',
 'Total Backward Packets']

In [28]:
df2017_corrected_y = data2017_corrected['label_encoded']
df2017_corrected_X = data2017_corrected[feature_extract_item]

In [29]:
scaler = StandardScaler()
X_test = scaler.fit_transform(df2017_corrected_X)

In [30]:
data_test_2017_corrected = pd.DataFrame()


In [31]:
for method in methods:
    predict_methods(data_test_2017_corrected, X_test, df2017_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune_2.joblib
Accuracy 0.8469345529057306
              precision    recall  f1-score   support

           0       0.83      1.00      0.91   1594540
           1       0.99      0.37      0.54    505431

    accuracy                           0.85   2099971
   macro avg       0.91      0.68      0.72   2099971
weighted avg       0.87      0.85      0.82   2099971

Confusion Matrix:
[[1593350    1190]
 [ 320243  185188]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune_2.json
Accuracy 0.7627895813799334
              precision    recall  f1-score   support

           0       0.76      1.00      0.86   1594540
           1       0.85      0.02      0.03    505431

    accuracy                           0.76   2099971
   macro avg       0.80      0.51      0.45   2099971
weighted avg       0.78      0.76      0.66   2099971

Confusion Matrix:
[[1592908    1632]
 [ 496503    8928]]
pred

In [32]:
del data2017_corrected, df2017_corrected_y, df2017_corrected_X, X_test

In [19]:
df2018_original_y = data2018_original['label_encoded']
df2018_original_X = data2018_original[feature_extract_item]
scaler = StandardScaler()
X_test = scaler.fit_transform(df2018_original_X)
data_test_2018_original = pd.DataFrame()

In [20]:
for method in methods:
    predict_methods(data_test_2018_original, X_test, df2018_original_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune_2.joblib
Accuracy 0.8346190291859429
              precision    recall  f1-score   support

           0       0.87      0.95      0.91   9493199
           1       0.01      0.00      0.01   1325845

    accuracy                           0.83  10819044
   macro avg       0.44      0.48      0.46  10819044
weighted avg       0.77      0.83      0.80  10819044

Confusion Matrix:
[[9023982  469217]
 [1320047    5798]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune_2.json
Accuracy 0.8391467859821995
              precision    recall  f1-score   support

           0       0.87      0.96      0.91   9493199
           1       0.02      0.01      0.01   1325845

    accuracy                           0.84  10819044
   macro avg       0.44      0.48      0.46  10819044
weighted avg       0.77      0.84      0.80  10819044

Confusion Matrix:
[[9071908  421291]
 [1318987    6858]]
pred

In [21]:
del data_test_2018_original, X_test, df2018_original_y

In [23]:
df2018_corrected_y = data2018_corrected['label_encoded']
df2018_corrected_X = data2018_corrected[feature_extract_item]
scaler = StandardScaler()
X_test = scaler.fit_transform(df2018_corrected_X)
data_test_2018_corrected = pd.DataFrame()

In [24]:
for method in methods:
    predict_methods(data_test_2018_corrected, X_test, df2018_corrected_y, method, False)

predict rf
load_pretrained_model: /content/onedrive/models/model_rf_finetune_2.joblib
Accuracy 0.5960793699498824
              precision    recall  f1-score   support

           0       0.60      0.98      0.75   5935341
           1       0.00      0.00      0.00   3841659

    accuracy                           0.60   9777000
   macro avg       0.30      0.49      0.37   9777000
weighted avg       0.37      0.60      0.45   9777000

Confusion Matrix:
[[5827343  107998]
 [3841134     525]]
predict xgb
load_pretrained_model: /content/onedrive/models/model_xgb_finetune_2.json
Accuracy 0.6067326378234632
              precision    recall  f1-score   support

           0       0.61      1.00      0.76   5935341
           1       0.15      0.00      0.00   3841659

    accuracy                           0.61   9777000
   macro avg       0.38      0.50      0.38   9777000
weighted avg       0.43      0.61      0.46   9777000

Confusion Matrix:
[[5931318    4023]
 [3840952     707]]
pred

In [25]:
del data_test_2018_corrected, X_test, df2018_corrected_y