In [1]:
import pickle
import yaml

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier


In [2]:
import pandas as pd
import pickle
from category_encoders import CatBoostEncoder

class DataProcessor:
    @staticmethod
    def load_category_encoder(category_encoder_path: str):
        return pickle.load(open(category_encoder_path, "rb"))
    
    @staticmethod
    def save_category_encoder(category_encoder, category_encoder_path):
        pickle.dump(category_encoder, open(category_encoder_path, "wb"))

    @staticmethod
    def apply_category_features(df, category_columns=None, category_encoder=None):
        df[category_columns] = category_encoder.transform(df[category_columns])
        return df
    
    @staticmethod
    def process_data(df, cfg):
        category_columns = cfg.feature_config["category_columns"]
        Enc = CatBoostEncoder(cols=category_columns)
        try:
            Enc.fit_transform(df[category_columns], df[cfg.feature_config["target_column"]])
        except:
            Enc.fit_transform(df[category_columns], df[f"{cfg.feature_config['target_column']}_encoded"])
        DataProcessor.save_category_encoder(Enc, cfg.category_index_path)
        encoded_df = df.copy()
        encoded_df[category_columns] = Enc.transform(df[category_columns])
        return encoded_df

    @staticmethod
    def apply_process_data(df, cfg, Enc):
        try:
            category_columns = cfg.feature_config["category_columns"]
            encoded_df = DataProcessor.apply_category_features(df, category_columns, Enc)
            return encoded_df
        except:
            return df

In [3]:
class cfg2:
#     data_path = "/kaggle/input/mlops-comp/phase-2/phase-2/prob-2/raw_train.parquet"
    data_path = "./prob2/data/cleaned_combined_data.csv"
    model_path = "./prob_2_model.pkl"
    onnx_path = "./prob_2_onnx_model.onnx"
    category_index_path = "./prob_2_category_index_path.pkl"
    feature_config = {
        "numeric_columns": [
            "feature1",
            "feature5",
            "feature6",
            "feature7",
            "feature8",
            "feature9",
            "feature10",
            "feature11",
            "feature12",
            "feature13",
            "feature14",
            "feature15",
            "feature16",
            "feature17",
            "feature18",
            "feature19",
            "feature20",
            "feature21",
            "feature22",
            "feature23",
            "feature24",
            "feature25",
            "feature26",
            "feature27",
            "feature28",
            "feature29",
            "feature30",
            "feature31",
            "feature32",
            "feature33",
            "feature34",
            "feature35",
            "feature36",
            "feature37",
            "feature38",
            "feature39",
            "feature40",
            "feature41"
        ],
        "category_columns": [
            "feature2",
            "feature3",
            "feature4"
        ],
        "target_column": "label",
        "ml_type": "classification"
    }

In [4]:
try:
    df = pd.read_parquet(cfg2.data_path)
except:
    df = pd.read_csv(cfg2.data_path)
df.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature33,feature34,feature35,feature36,feature37,feature38,feature39,feature40,feature41,label
0,0.041847,tcp,-,FIN,38.0,40.0,2438.0,19266.0,31.0,29.0,...,1.0,1.0,1.0,0.0,0.0,0.0,2.0,11.0,0.0,Normal
1,1.089133,tcp,http,FIN,14.0,18.0,1684.0,10168.0,31.0,29.0,...,1.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,Normal
2,2e-06,udp,dns,INT,2.0,0.0,114.0,0.0,254.0,0.0,...,17.0,17.0,25.0,0.0,0.0,0.0,17.0,25.0,0.0,Other
3,1.467246,tcp,ftp,FIN,12.0,12.0,2618.0,682.0,254.0,252.0,...,1.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,Denial of Service
4,0.000927,udp,dns,CON,2.0,2.0,130.0,162.0,31.0,29.0,...,1.0,1.0,2.0,0.0,0.0,0.0,1.0,4.0,0.0,Normal


In [5]:
df['label'].value_counts()

label
Normal                   52189
Exploits                 16068
Denial of Service        13759
Other                     6414
Information Gathering     5551
Malware                   1162
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

In [7]:
processed_df = DataProcessor.process_data(df, cfg2)
processed_df.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature34,feature35,feature36,feature37,feature38,feature39,feature40,feature41,label,label_encoded
0,0.041847,2.619633,2.819865,2.561961,38.0,40.0,2438.0,19266.0,31.0,29.0,...,1.0,1.0,0.0,0.0,0.0,2.0,11.0,0.0,Normal,4
1,1.089133,2.619633,2.123102,2.561961,14.0,18.0,1684.0,10168.0,31.0,29.0,...,1.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,Normal,4
2,2e-06,3.535369,4.36604,3.312983,2.0,0.0,114.0,0.0,254.0,0.0,...,17.0,25.0,0.0,0.0,0.0,17.0,25.0,0.0,Other,5
3,1.467246,2.619633,2.169791,2.561961,12.0,12.0,2618.0,682.0,254.0,252.0,...,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,Denial of Service,0
4,0.000927,3.535369,4.36604,3.898621,2.0,2.0,130.0,162.0,31.0,29.0,...,1.0,2.0,0.0,0.0,0.0,1.0,4.0,0.0,Normal,4


In [8]:
# pseudo_df = DataProcessor.apply_process_data(pd.read_csv(cfg2.pseudo_path), cfg2, None)

In [9]:
# test_df = pd.concat([processed_df, pseudo_df], ignore_index=True)
# sum(test_df.duplicated())

In [10]:
USING_PSEUDO = False
target_column = "label"
# feature_columns = cfg2.feature_config["category_columns"] + cfg2.feature_config["numeric_columns"]
feature_columns = cfg2.feature_config["numeric_columns"]

if not USING_PSEUDO:
    X = processed_df[feature_columns]
    y = processed_df[target_column]
else:
    X = pd.concat([processed_df[feature_columns], pseudo_df[feature_columns]], ignore_index=True)
    y = pd.concat([processed_df[target_column], pseudo_df[target_column]], ignore_index=True)
    
print(X.shape, y.shape)

(95143, 38) (95143,)


In [14]:
import optuna
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param_grid = {
#         "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_int("n_estimators", 5, 200, step=5),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 300, step=20),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 20, step=5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 20, step=5),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 5),
        "subsample": trial.suggest_float(
            "subsample", 0.2, 0.95, step=0.1
        ),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.2, 0.95, step=0.1
        ),
        # "is_unbalance": trial.suggest_categorical("is_unbalance", [True]),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 3, 5, step=1)
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=34)
    cv_scores = np.empty(5)
    oof_targets = []
    final_valid_predictions = []
    for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[val_idx]

        model = LGBMClassifier(objective="multiclass", **param_grid)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                  eval_metric="multi_logloss", verbose=False)
#         preds = model.predict_proba(X_test)
#         cv_scores[idx] = log_loss(y_test, preds)
        preds = model.predict(X_test)
        final_valid_predictions.extend(preds)
        oof_targets.extend(y_test)

    oof_preds = np.array(final_valid_predictions)
    oof_targets = np.array(oof_targets)
    return f1_score(oof_targets, oof_preds, average='macro')
    return accuracy_score(oof_targets, oof_preds)


In [15]:
def logging_callback(study, frozen_trial):
    previous_best_value = study.user_attrs.get("previous_best_value", None)
    if previous_best_value != study.best_value:
        study.set_user_attr("previous_best_value", study.best_value)
        print(
            "Trial {} finished with best value: {} and parameters: {}. ".format(
            frozen_trial.number,
            frozen_trial.value,
            frozen_trial.params,
            )
        )
    else:
        if (frozen_trial.number % 50 == 0):
            print(
                "Trial {} finished with best value: {}.".format(
                frozen_trial.number,
                frozen_trial.value
                )
            )

In [16]:
import warnings
warnings.filterwarnings("ignore")
# from warnings import simplefilter
# simplefilter("ignore", category=RuntimeWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# study = optuna.create_study(direction="minimize", study_name="LGBMClassifier")
study = optuna.create_study(direction="maximize", study_name="LGBMClassifier")

func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=500, callbacks=[logging_callback])

Trial 0 finished with best value: 0.7591906453231445 and parameters: {'n_estimators': 40, 'learning_rate': 0.1164982244128429, 'num_leaves': 300, 'max_depth': 4, 'min_child_samples': 120, 'reg_alpha': 0, 'reg_lambda': 0, 'min_split_gain': 0.5402002369786868, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004, 'scale_pos_weight': 4}. 
Trial 1 finished with best value: 0.7723927477082387 and parameters: {'n_estimators': 75, 'learning_rate': 0.07032658061871848, 'num_leaves': 140, 'max_depth': 10, 'min_child_samples': 260, 'reg_alpha': 10, 'reg_lambda': 5, 'min_split_gain': 4.514606213228035, 'subsample': 0.2, 'colsample_bytree': 0.5, 'scale_pos_weight': 5}. 
Trial 2 finished with best value: 0.7818613299548035 and parameters: {'n_estimators': 70, 'learning_rate': 0.06952732316490212, 'num_leaves': 260, 'max_depth': 5, 'min_child_samples': 40, 'reg_alpha': 0, 'reg_lambda': 5, 'min_split_gain': 0.37769438915288944, 'subsample': 0.7, 'colsample_bytree': 0.7, 'scale_pos_weight': 4}. 


[W 2023-08-19 03:03:35,649] Trial 82 failed with parameters: {'n_estimators': 95, 'learning_rate': 0.19336235383411646, 'num_leaves': 220, 'max_depth': 10, 'min_child_samples': 20, 'reg_alpha': 0, 'reg_lambda': 10, 'min_split_gain': 0.01424972272253866, 'subsample': 0.2, 'colsample_bytree': 0.8, 'scale_pos_weight': 4} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/luongquangdung00/miniconda3/envs/mlops/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1579/3730267134.py", line 10, in <lambda>
    func = lambda trial: objective(trial, X, y)
  File "/tmp/ipykernel_1579/1649940059.py", line 35, in objective
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
  File "/home/luongquangdung00/miniconda3/envs/mlops/lib/python3.10/site-packages/lightgbm/sklearn.py", line 967, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=

KeyboardInterrupt: 

In [17]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.81667
	Best params:
		n_estimators: 80
		learning_rate: 0.15419822170551373
		num_leaves: 220
		max_depth: 10
		min_child_samples: 20
		reg_alpha: 0
		reg_lambda: 10
		min_split_gain: 0.14295189465946107
		subsample: 0.30000000000000004
		colsample_bytree: 0.8
		scale_pos_weight: 4


In [23]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed