## Data cleaning & EDA

In [27]:
import re
import pandas as pd

In [None]:
df_benign = pd.read_csv('../Data/CSV_benign.csv')
df_malware = pd.read_csv('../Data/CSV_malware.csv')

# 'Country' column name is duplicated in malware csv, therefore I decided to rename both. While reading it, pandas reads duplicated column name with '.1' suffix
df_benign.rename(columns={'Country.1':'Country_1'}, inplace=True)
df_malware.rename(columns={'Country.1':'Country_1'}, inplace=True)

# Reindex columns
df_benign = df_benign.reindex(sorted(df_benign.columns), axis=1)
df_malware = df_malware.reindex(sorted(df_malware.columns), axis=1)
# df_malware.head(3)

In [None]:
# By digging into missmatch in dtypes, I was able to identify 'mixing' of columns data in 24 records of df_malware.
# To fix it, following steps are taken: 1. Identify incorrect rows by checking len of IP column,
#   2. Get records into new df
#   3. Rename columns
#   4. Drop incorrect rows from df_malware
#   5. Concatenate fixed data to df_malware

incorrect_rows_idx = df_malware.index[df_malware['IP'].str.len()==2]
df_incorrect_rows = df_malware.iloc[incorrect_rows_idx]

# Applies to df_malware only - rename of columns for 24 records
col_val_replace_to = {
    'Country': 'TTL',
    'TTL': 'Domain',
    'IP': 'Country',
    'Domain': 'IP',
}

df_incorrect_rows.rename(columns=col_val_replace_to, inplace=True) # Apply rename
df_malware.drop(incorrect_rows_idx, axis=0, inplace=True) # Drop from malware df incorrect rows
df_malware = pd.concat([df_malware, df_incorrect_rows], ignore_index=False) # Concatenate fixed data

In [30]:
# To mitigate missing values across similar columns like Domain, Domain_Name and Country, Country_1, following code is applied to df's
# The code also applies mapping to unify a bit entries
countries_map = {
    '-':'',
    "china":"CN",
    "Malaysia":'ID',
    "United States":"US",
    "TURKEY":'TR',
    'RUSSIA':'RU',
    'Russian Federation':'RU',
    'Belarus':'BY',
    'Korea':'KR',
}

def use_regex(input_text):
    return re.sub(r"b'(.+?).'", r"\1", input_text)

def impute_similar_cols(df):
    df["Country_1"].replace(countries_map, inplace=True)
    df["Country"].replace(countries_map, inplace=True)
    df["Country_1"].fillna(df["Country"], inplace=True)
    df["Country"].fillna(df["Country_1"], inplace=True)
    df["Domain_Name"].fillna(df["Domain"].apply(use_regex), inplace=True)
    df['Domain_Age'] = df['Domain_Age'].str.split(' ').str[0]
    return df

df_malware = impute_similar_cols(df_malware)
df_benign = impute_similar_cols(df_benign)

In [31]:
print(df_malware.shape, df_benign.shape)

(4999, 38) (494135, 38)


In [32]:
df_benign['is_threat'] = 0
df_malware['is_threat'] = 1

# Combine two dataframes
df = pd.concat([df_benign, df_malware])
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

# Manually select informative fields based on # of nulls it contain
percent_na = (df.isna().sum() / len(df)) * 100
columns_to_keep = percent_na[percent_na <= 30].index.tolist()
df = df[columns_to_keep]
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)

print('df shape: ', df.shape)
print(df.columns)

df shape:  (244072, 35)
Index(['1gram', '2gram', '3gram', 'ASN', 'Alexa_Rank', 'Country', 'Country_1',
       'Creation_Date_Time', 'Domain', 'Domain_Age', 'Domain_Name', 'IP',
       'Name_Server_Count', 'Page_Rank', 'Registrar', 'TTL',
       'char_distribution', 'dec_32', 'dec_8', 'entropy', 'hex_32', 'hex_8',
       'len', 'longest_word', 'numeric_percentage', 'obfuscate_at_sign',
       'oc_32', 'oc_8', 'puny_coded', 'shortened', 'sld', 'subdomain', 'tld',
       'typos', 'is_threat'],
      dtype='object')


In [33]:
# df.to_csv('./tmp/test.csv')

In [34]:
# Manual features selection based on previous experiments, numeric ones only
numerical_features = ['Alexa_Rank','ASN','Domain_Age','TTL','entropy','len','numeric_percentage','subdomain']
categorical_features = ['is_threat']
selected_cols = numerical_features + categorical_features

df = df[selected_cols]

def cols_to_num_drop(df, cols):
    for col in cols:
        df.loc[:,col] = pd.to_numeric(df[col], errors='coerce') # Should be number
        df[col] = df[col].astype('float64')
        df.dropna(subset=[col], inplace=True)

    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def cols_to_num_fill(df, cols):
    for col in cols:
        df.loc[:,col] = pd.to_numeric(df[col], errors='coerce') # Should be number
        df.loc[:,col].fillna(0, inplace = True)

    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def cols_to_cat(df, cols):
    for col in cols:
        df[col] = df[col].astype('category')
    return df

df = cols_to_num_drop(df, numerical_features)
df = cols_to_cat(df, categorical_features)
df.shape

(215120, 9)

In [35]:
print(f"{df['is_threat'].value_counts()[1]/df['is_threat'].value_counts()[0]*100:.3f}% of whole dataset domains are a threat")

1.285% of whole dataset domains are a threat


In [36]:
def train_valid_singleframe_proportional(df, ratio = 0.1):
    train = df.iloc[:int(len(df) * (1 - ratio))]
    valid = df.iloc[int(len(df) * (1 - ratio)):]
    return (train, valid)

def train_valid_singleframe_n_last(df, n_samples = 200):
    train = df.iloc[:int(len(df) - n_samples)]
    valid = df.iloc[int(len(df) - n_samples):]
    return (train, valid)

def get_validset(df, type):
    unique_groups = df['is_threat'].unique()
    ret_train = pd.DataFrame()
    ret_valid = pd.DataFrame()

    for group in unique_groups:
        selected_df = df[df['is_threat']==group]

        if type=='n_last':
            train, valid = train_valid_singleframe_n_last(selected_df)
        elif type=='proportional':
            train, valid = train_valid_singleframe_proportional(selected_df)
        else:
            raise('Only "n_last" or "proportional" is available')

        ret_train = pd.concat([ret_train, train], ignore_index=False)
        ret_valid = pd.concat([ret_valid, valid], ignore_index=False)

    ret_train.reset_index(drop=True, inplace=True)
    ret_valid.reset_index(drop=True, inplace=True)
    return (ret_train, ret_valid)


df, df_valid_n_last = get_validset(df, 'n_last')

In [37]:
X = df.drop(columns='is_threat')
y = df['is_threat']

In [38]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import optuna

label_encoder = LabelEncoder()
scaler = MinMaxScaler()
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

def objective(trial, cv=sss, cv_fold_func=np.average):
    hparams = {
        # "eval_metric":"error",
        "scale_pos_weight":99,
        'verbosity': 0,

        'n_estimators': trial.suggest_int('n_estimators', 300, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
        'gamma': trial.suggest_int('gamma', 13, 20),
    }

    accuracy_scores = []
    balanced_acc_scores = []
    f1_scores = []
    roc_aucs = []

    clf = XGBClassifier(**hparams)

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]


        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        balanced_acc = balanced_accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        y_prob = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)

        accuracy_scores.append(accuracy)
        balanced_acc_scores.append(balanced_acc)
        f1_scores.append(f1)
        roc_aucs.append(roc_auc)

    print(f'Trial done: balanced accuracy scores on folds: {balanced_acc_scores}')
    print(f'    Trial mean balanced accuracy score: {cv_fold_func(balanced_acc_scores)}')
    print(f'    Trial mean f1 score: {cv_fold_func(f1_scores)}')
    print(f'    Trial mean roc auc score: {cv_fold_func(roc_aucs)}')
    return cv_fold_func(balanced_acc_scores)

In [39]:
n_trials = 20
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)
trial = study.best_trial

print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
    
best_params = trial.params

[I 2023-10-11 14:18:31,784] A new study created in memory with name: no-name-0e5f042b-78c7-4d98-a67f-ab9e4d6e648a
  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 14:32:39,402] Trial 0 finished with value: 0.6954728432475406 and parameters: {'n_estimators': 591, 'max_depth': 11, 'learning_rate': 0.15849676445221828, 'subsample': 0.5521797808158214, 'colsample_bytree': 0.8202753813217626, 'gamma': 17}. Best is trial 0 with value: 0.6954728432475406.


Trial done: balanced accuracy scores on folds: [0.6897699354762037, 0.6948740566801288, 0.6960978368443377, 0.7115428748230155, 0.695838635185447, 0.7024861844491299, 0.6933337021578282, 0.6958739808662048, 0.6996380397387902, 0.7030383966789403, 0.6888745115636724, 0.6888862934572584, 0.7014847234943252, 0.7028970139559091, 0.6814877765415327, 0.7006497458184523, 0.6666420783670091, 0.7040618846521882, 0.7025553390419169, 0.693357265945, 0.6956854705688298, 0.7032269069763153, 0.6944160995990059, 0.6890630218610474, 0.6910393064460276]
    Trial mean balanced accuracy score: 0.6954728432475406
    Trial mean f1 score: 0.9796393640872956
    Trial mean roc auc score: 0.9102796406829811


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 14:35:22,431] Trial 1 finished with value: 0.8310086325422048 and parameters: {'n_estimators': 323, 'max_depth': 3, 'learning_rate': 0.2378073493693701, 'subsample': 0.5532107497589307, 'colsample_bytree': 0.8047648311289913, 'gamma': 15}. Best is trial 1 with value: 0.8310086325422048.


Trial done: balanced accuracy scores on folds: [0.8314256910848972, 0.8493474879978362, 0.8315081643399989, 0.820852209975883, 0.839000936404412, 0.8454051639527331, 0.8293198056704717, 0.8331309921174008, 0.8303637838934343, 0.8441183762706517, 0.8336242948792817, 0.8256366832842592, 0.8180189206965865, 0.8221082622833924, 0.8189332980901038, 0.8372116253480781, 0.8238622276589684, 0.8296435516159635, 0.8291620307476687, 0.8220006884723904, 0.8351882131887541, 0.8409710739144778, 0.8299544911553838, 0.8339085970940727, 0.8205192434180195]
    Trial mean balanced accuracy score: 0.8310086325422048
    Trial mean f1 score: 0.895952103722541
    Trial mean roc auc score: 0.9130349291698636


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 14:56:45,699] Trial 2 finished with value: 0.6801956409042755 and parameters: {'n_estimators': 745, 'max_depth': 13, 'learning_rate': 0.2397320287054642, 'subsample': 0.57324292356164, 'colsample_bytree': 0.8979852289119605, 'gamma': 17}. Best is trial 1 with value: 0.8310086325422048.


Trial done: balanced accuracy scores on folds: [0.672185203171071, 0.685406536799464, 0.6947004018137969, 0.6952526140436073, 0.685218026502089, 0.690830305898938, 0.6836187625118588, 0.6845479953364192, 0.6837012357669603, 0.6805718623792868, 0.6597030553011349, 0.6617264674604589, 0.6777723820120196, 0.6899953282230651, 0.670808258390245, 0.6787840880916816, 0.6678084858320168, 0.6845597772300052, 0.6984291150056245, 0.6870780289199385, 0.6762422726145764, 0.6820548442023863, 0.6711970608785809, 0.6667496521780111, 0.6759492620436565]
    Trial mean balanced accuracy score: 0.6801956409042755
    Trial mean f1 score: 0.9809958425162444
    Trial mean roc auc score: 0.9026419606412108


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:06:22,317] Trial 3 finished with value: 0.7210100053889357 and parameters: {'n_estimators': 703, 'max_depth': 6, 'learning_rate': 0.22059227653491467, 'subsample': 0.5591919207600962, 'colsample_bytree': 0.8213623114798833, 'gamma': 14}. Best is trial 1 with value: 0.8310086325422048.


Trial done: balanced accuracy scores on folds: [0.719265137684233, 0.7255459114780232, 0.7197230947653559, 0.7301920756008253, 0.7264751443025836, 0.7179824480520944, 0.7196539401725689, 0.7202399613144086, 0.7160650729350438, 0.7295338263287415, 0.7240409026364805, 0.7155820152980205, 0.7252867098191326, 0.7253927468614061, 0.706759426027125, 0.7331093749039519, 0.6900081346291367, 0.7304159315789581, 0.7348264578300416, 0.7370045713747113, 0.7189352446638266, 0.7322154877601493, 0.7082659716373964, 0.705655001567504, 0.7130755455016731]
    Trial mean balanced accuracy score: 0.7210100053889357
    Trial mean f1 score: 0.9745407333717229
    Trial mean roc auc score: 0.9074167206398472


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:20:26,253] Trial 4 finished with value: 0.7048087747445377 and parameters: {'n_estimators': 725, 'max_depth': 9, 'learning_rate': 0.17231530117241584, 'subsample': 0.5217643150379804, 'colsample_bytree': 0.8943379980124364, 'gamma': 19}. Best is trial 1 with value: 0.8310086325422048.


Trial done: balanced accuracy scores on folds: [0.6957664070552028, 0.7027187487833915, 0.7136472234687123, 0.7150579771615677, 0.7199986886240183, 0.7078833162239748, 0.6978722924696283, 0.6966838579861773, 0.7057656489159634, 0.7187528814413661, 0.6929551447943496, 0.6920730395441329, 0.7073413491190217, 0.7079186619047326, 0.6984362865930247, 0.7092940699168301, 0.6701218350248034, 0.7144822011445854, 0.7211282136395396, 0.7178589942975635, 0.7057876759344066, 0.7072470939703343, 0.7058716859582368, 0.6946604458268533, 0.7008956288150283]
    Trial mean balanced accuracy score: 0.7048087747445377
    Trial mean f1 score: 0.9783584101214042
    Trial mean roc auc score: 0.906100977440794


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:28:12,410] Trial 5 finished with value: 0.7953178345084492 and parameters: {'n_estimators': 343, 'max_depth': 11, 'learning_rate': 0.024154657170257256, 'subsample': 0.5025052094170059, 'colsample_bytree': 0.8069364288167394, 'gamma': 19}. Best is trial 1 with value: 0.8310086325422048.


Trial done: balanced accuracy scores on folds: [0.8037162141395018, 0.8166798828777326, 0.7865187475539764, 0.7908970016617594, 0.7992114327397308, 0.7973632122154672, 0.7885703338066581, 0.8015058284515313, 0.7911162473337063, 0.806692422910558, 0.7870238322094432, 0.7951881722082548, 0.7919527617783078, 0.7849061649014317, 0.7843068250972774, 0.7983882369574438, 0.7778257591125263, 0.8052478603056735, 0.7924578464337745, 0.7986945661906781, 0.810927757526581, 0.8138568387232935, 0.7882240485864801, 0.7832864106614867, 0.7883874583279546]
    Trial mean balanced accuracy score: 0.7953178345084492
    Trial mean f1 score: 0.9441624144220542
    Trial mean roc auc score: 0.9148884864728027


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:33:24,829] Trial 6 finished with value: 0.8353884234187163 and parameters: {'n_estimators': 521, 'max_depth': 4, 'learning_rate': 0.06227012530557216, 'subsample': 0.5823180676644505, 'colsample_bytree': 0.8679467785467245, 'gamma': 14}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8342600048766794, 0.8553055403586204, 0.8242904738780051, 0.8271959912875457, 0.8476140128719749, 0.8468466530201604, 0.8299012165061256, 0.8383805940943003, 0.8298274516071528, 0.8482015707825432, 0.8309702952849887, 0.8365441554636226, 0.8267923333681667, 0.8340494675608612, 0.8374867069504977, 0.8403538051418233, 0.8192037693863375, 0.8381403459163956, 0.8301983251269883, 0.8424023178570477, 0.8407441443988878, 0.8390593336160987, 0.8284402617014694, 0.8341570413718632, 0.8243447730397491]
    Trial mean balanced accuracy score: 0.8353884234187163
    Trial mean f1 score: 0.8891583398689467
    Trial mean roc auc score: 0.9177306517557257


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:49:11,297] Trial 7 finished with value: 0.6843635830889871 and parameters: {'n_estimators': 610, 'max_depth': 13, 'learning_rate': 0.20855310831581952, 'subsample': 0.5148207814900634, 'colsample_bytree': 0.8378402557445633, 'gamma': 18}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.674466280220557, 0.6825701739827104, 0.6921350225495198, 0.7037924378684404, 0.6887346656093698, 0.6916063741068812, 0.6839942463378801, 0.6788178970037108, 0.6773953614172696, 0.7006984101615247, 0.6653020160356694, 0.6690307292274971, 0.6832299600235228, 0.6848999153752686, 0.6692074576312862, 0.6897817173697895, 0.6577385526097406, 0.6965337669070173, 0.7088494314980217, 0.695957990890035, 0.6858880576677588, 0.6832402051483801, 0.6862768601560947, 0.6718184277011784, 0.6871236197255536]
    Trial mean balanced accuracy score: 0.6843635830889871
    Trial mean f1 score: 0.9802789005675637
    Trial mean roc auc score: 0.9047578434347472


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 15:55:02,323] Trial 8 finished with value: 0.7782855398258739 and parameters: {'n_estimators': 316, 'max_depth': 9, 'learning_rate': 0.06431748351272314, 'subsample': 0.5334257988677202, 'colsample_bytree': 0.8533647398626197, 'gamma': 17}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.7903852576751353, 0.7858686943817784, 0.7748710651036704, 0.7704943477646162, 0.7959739732848125, 0.7757162879044007, 0.7654624546909352, 0.7790106078022773, 0.7880555162825769, 0.7853062370271107, 0.7771741691715996, 0.7867139171825087, 0.7774098070433184, 0.7741539063636569, 0.7620959066628146, 0.7938296686521719, 0.7620369971948848, 0.7819514708925758, 0.7860659130352822, 0.7834446978405326, 0.7850793075115207, 0.783385788372603, 0.7661427309814626, 0.7640972918036952, 0.7624124810209062]
    Trial mean balanced accuracy score: 0.7782855398258739
    Trial mean f1 score: 0.9583870367777966
    Trial mean roc auc score: 0.9178011875776185


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:10:00,716] Trial 9 finished with value: 0.6966876077018749 and parameters: {'n_estimators': 570, 'max_depth': 13, 'learning_rate': 0.055946727683816165, 'subsample': 0.5509172809036579, 'colsample_bytree': 0.8940502147003433, 'gamma': 15}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.685840930093415, 0.6951096945518476, 0.7054152656458423, 0.7018146165147314, 0.7147900671465484, 0.6984864877048257, 0.6915577097638089, 0.6937916592389511, 0.693369047838586, 0.7040501027586024, 0.6884636820568932, 0.6910039607652698, 0.7067199822964243, 0.704474250927696, 0.6793701092335213, 0.7030383966789403, 0.6649014316537476, 0.718518780338376, 0.7105900782112832, 0.7090497236889826, 0.6937563135581933, 0.6971920161791012, 0.6889703034810886, 0.689699244114688, 0.6872163381055125]
    Trial mean balanced accuracy score: 0.6966876077018749
    Trial mean f1 score: 0.9798910631472943
    Trial mean roc auc score: 0.9173379300414441


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:13:44,297] Trial 10 finished with value: 0.8242960267356778 and parameters: {'n_estimators': 472, 'max_depth': 3, 'learning_rate': 0.2976380338298994, 'subsample': 0.5995056862467345, 'colsample_bytree': 0.8688268999208323, 'gamma': 13}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8251136696602921, 0.8438735177865613, 0.817620897595879, 0.8159140597946467, 0.8325475322567757, 0.833731356434041, 0.8149407729331997, 0.8231240664129974, 0.820901898831441, 0.8319615111149359, 0.818644385569127, 0.8219136049111031, 0.8243596284707921, 0.8195485178377868, 0.8209592715306422, 0.8313119701989808, 0.8089161272608429, 0.8236071240500209, 0.8288849001202778, 0.8301235357155298, 0.8346078268655861, 0.8409577552521632, 0.8176798070638087, 0.8125874677534695, 0.8135694629710453]
    Trial mean balanced accuracy score: 0.8242960267356778
    Trial mean f1 score: 0.917744411163739
    Trial mean roc auc score: 0.9126406921084785


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:17:20,744] Trial 11 finished with value: 0.8336951091822955 and parameters: {'n_estimators': 448, 'max_depth': 3, 'learning_rate': 0.10372808247672743, 'subsample': 0.5804670987582996, 'colsample_bytree': 0.8004107983710528, 'gamma': 15}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8322698893731417, 0.8472538967332395, 0.8285206859315994, 0.8261187164087969, 0.8398523062800567, 0.8411232140186092, 0.8333051592399756, 0.8283173202031813, 0.8312699651870658, 0.8411011870001659, 0.8256592225589453, 0.8315040662900559, 0.8266576099762928, 0.8353756989736434, 0.8343050834260517, 0.8408035661230604, 0.8271391308445877, 0.8327145277919502, 0.8329942197005555, 0.8393369764997326, 0.8431865821648769, 0.8385844720789613, 0.824756627059014, 0.8361148847321003, 0.8241127209617304]
    Trial mean balanced accuracy score: 0.8336951091822955
    Trial mean f1 score: 0.8757256177736812
    Trial mean roc auc score: 0.9144200318642013


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:23:14,585] Trial 12 finished with value: 0.8035355516077674 and parameters: {'n_estimators': 440, 'max_depth': 6, 'learning_rate': 0.10330623118398852, 'subsample': 0.5843288086542633, 'colsample_bytree': 0.8637894339946933, 'gamma': 15}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8049312859475819, 0.818036337408844, 0.8028638197513713, 0.8002881953622369, 0.8134977470970439, 0.8220595979403201, 0.7874556642221799, 0.797498960119827, 0.797945135307364, 0.8140750598827549, 0.8028402559641994, 0.8026046180924807, 0.8078854677001949, 0.7961722164508018, 0.7954402022797452, 0.8066365869800856, 0.7848651844020023, 0.8046633759325625, 0.8053436522230897, 0.8048150037804511, 0.8183442034108069, 0.8219535608980466, 0.7974769331013838, 0.7908795849495018, 0.7898161409893103]
    Trial mean balanced accuracy score: 0.8035355516077674
    Trial mean f1 score: 0.9477132406880316
    Trial mean roc auc score: 0.9172927760506981


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:28:36,661] Trial 13 finished with value: 0.8179907056227295 and parameters: {'n_estimators': 475, 'max_depth': 5, 'learning_rate': 0.11206961723751238, 'subsample': 0.5734388602655204, 'colsample_bytree': 0.8389152619719205, 'gamma': 13}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8158192923897163, 0.8378176244833895, 0.8168781260437221, 0.8077876267578072, 0.8257611615512759, 0.8333702157828198, 0.8109656644885531, 0.8062324168044636, 0.8097152469997152, 0.8319963445394509, 0.8222312037816806, 0.8153495534150075, 0.8200797275616398, 0.8192903406913821, 0.8060218794886453, 0.8310758200710192, 0.7983170333396853, 0.8193933041961983, 0.8116177666857226, 0.8189471290086612, 0.8310184473718181, 0.8324041005087729, 0.8122688443704064, 0.8133999061546564, 0.8020088640820267]
    Trial mean balanced accuracy score: 0.8179907056227295
    Trial mean f1 score: 0.9349749697739652
    Trial mean roc auc score: 0.9170988479450236


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:33:24,643] Trial 14 finished with value: 0.8228730198734933 and parameters: {'n_estimators': 415, 'max_depth': 5, 'learning_rate': 0.11324909517411252, 'subsample': 0.5998901424042394, 'colsample_bytree': 0.8776316839524754, 'gamma': 14}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8202754094464149, 0.8402692828617502, 0.8177807215436534, 0.8159222558945325, 0.8414003446460001, 0.8369058083710866, 0.8135013328907439, 0.8263676729428302, 0.8197201236791473, 0.8309682462600172, 0.8203076815897155, 0.8209690043992566, 0.8214387433739655, 0.8115926661298221, 0.815920719125804, 0.8323789999528723, 0.8026199857797667, 0.8368100164536705, 0.8222839661746957, 0.815612853123841, 0.8452704405608591, 0.832776510797337, 0.8152640066224487, 0.8127708554884159, 0.8026978487286824]
    Trial mean balanced accuracy score: 0.8228730198734933
    Trial mean f1 score: 0.9316476456769196
    Trial mean roc auc score: 0.9175004186530566


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:40:46,727] Trial 15 finished with value: 0.786896300895219 and parameters: {'n_estimators': 501, 'max_depth': 7, 'learning_rate': 0.07908565151719227, 'subsample': 0.5839262786845215, 'colsample_bytree': 0.8513414337697526, 'gamma': 16}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.7827833750309916, 0.8070981298549085, 0.7814095037876226, 0.7879008148972311, 0.7852206902345519, 0.7949002841997636, 0.7787411610185293, 0.785597710829302, 0.7941846622284786, 0.7901967473777602, 0.7877507238180712, 0.7863840241621025, 0.7868655450303973, 0.7876329048822118, 0.7795628200320877, 0.7948531566254198, 0.7664357415523824, 0.7954893788790605, 0.7921935222124552, 0.7900907103354868, 0.7975010091447985, 0.8011472490815246, 0.7717534736095828, 0.7762818187965257, 0.7704323647592293]
    Trial mean balanced accuracy score: 0.786896300895219
    Trial mean f1 score: 0.9574193696259381
    Trial mean roc auc score: 0.9185605217711696


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:45:54,790] Trial 16 finished with value: 0.8253888537139603 and parameters: {'n_estimators': 661, 'max_depth': 3, 'learning_rate': 0.03160309836799649, 'subsample': 0.5665227023071496, 'colsample_bytree': 0.8372976178189788, 'gamma': 14}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8249389902814745, 0.8362296301305023, 0.8241982677542892, 0.8191648379118797, 0.8313580732608388, 0.8327672901849654, 0.8277768898669569, 0.8207154375590375, 0.8142476902366009, 0.8251730913844647, 0.8231880984433557, 0.8299386112118547, 0.8125864432409837, 0.8348321950999618, 0.830714679419798, 0.823460106508318, 0.8288428951083626, 0.8250552724486053, 0.8272508027055325, 0.8290667510864955, 0.8278060884728002, 0.8253954105938689, 0.8157644809717296, 0.8285365658751283, 0.8157127430912001]
    Trial mean balanced accuracy score: 0.8253888537139603
    Trial mean f1 score: 0.8454043672331033
    Trial mean roc auc score: 0.9085350188612747


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:49:39,572] Trial 17 finished with value: 0.8211994377475479 and parameters: {'n_estimators': 385, 'max_depth': 4, 'learning_rate': 0.02019981838671424, 'subsample': 0.5852135573125655, 'colsample_bytree': 0.8759225202273649, 'gamma': 16}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.8182069187377187, 0.8258938359181783, 0.8175266424471915, 0.8154238305702232, 0.8235128689013334, 0.8306357919583967, 0.8167726012576915, 0.8196074273057166, 0.8073020078395695, 0.8243565549333349, 0.819398426758627, 0.8243847290266927, 0.8148198804598832, 0.8264419500980458, 0.8291031212797391, 0.8172833207318297, 0.82302981126431, 0.8197723738159197, 0.8224084444417123, 0.8232167847929563, 0.821806031100101, 0.8277553751047564, 0.8195029270321718, 0.8273696461538776, 0.8144546417587191]
    Trial mean balanced accuracy score: 0.8211994377475479
    Trial mean f1 score: 0.8337749897021086
    Trial mean roc auc score: 0.905051061702289


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 16:57:14,657] Trial 18 finished with value: 0.7552077403967323 and parameters: {'n_estimators': 519, 'max_depth': 7, 'learning_rate': 0.12496199103071258, 'subsample': 0.5404128337228615, 'colsample_bytree': 0.8205017231170689, 'gamma': 20}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.7473096302124633, 0.7601539432461063, 0.7595797039978527, 0.7639313207810063, 0.7661417064689767, 0.7548377479576344, 0.7590039279808702, 0.7498970364951838, 0.763190598253821, 0.758120285961925, 0.7559554910795698, 0.7507084503838848, 0.7506024133416114, 0.7583087962593, 0.7384045676864663, 0.7607563565877178, 0.7280518690181277, 0.7720234326495737, 0.7648236711560804, 0.7665438276196272, 0.7551189766349682, 0.7620359726823991, 0.7490871593752113, 0.738852279642732, 0.7467543444451958]
    Trial mean balanced accuracy score: 0.7552077403967323
    Trial mean f1 score: 0.9672187019037619
    Trial mean roc auc score: 0.9155303901138643


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.50, 0.60),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.80, 0.90),
[I 2023-10-11 17:12:20,473] Trial 19 finished with value: 0.6814969151929055 and parameters: {'n_estimators': 538, 'max_depth': 15, 'learning_rate': 0.08475073137163094, 'subsample': 0.5646961158114194, 'colsample_bytree': 0.8609705246393885, 'gamma': 13}. Best is trial 6 with value: 0.8353884234187163.


Trial done: balanced accuracy scores on folds: [0.6752218581787857, 0.6801845966596795, 0.6933721213760432, 0.6931247016107385, 0.6931718291850824, 0.6826203750945112, 0.674585635925145, 0.6813965949303025, 0.6782672215426289, 0.697466073269035, 0.665680573399148, 0.6825496837329956, 0.6930068826748792, 0.6834317889832123, 0.6806912180838748, 0.6750907205806117, 0.6637867620692693, 0.6915607833012661, 0.6950302948342031, 0.6875139589826181, 0.6683033253626263, 0.6763734102127503, 0.6816911422699509, 0.6675743847290267, 0.6757269428342523]
    Trial mean balanced accuracy score: 0.6814969151929055
    Trial mean f1 score: 0.9819383767942105
    Trial mean roc auc score: 0.9164650146682246
Number of finished trials: 20
Best trial:
  Value: 0.8353884234187163
  Params: 
    n_estimators: 521
    max_depth: 4
    learning_rate: 0.06227012530557216
    subsample: 0.5823180676644505
    colsample_bytree: 0.8679467785467245
    gamma: 14


## References

Samaneh Mahdavifar, Nasim Maleki, Arash Habibi Lashkari, Matt Broda, Amir H. Razavi, “Classifying Malicious Domains using DNS Traffic Analysis”, The 19th IEEE International Conference on Dependable, Autonomic, and Secure Computing (DASC), Oct. 25-28, 2021, Calgary, Canada