## Data cleaning & EDA

In [13]:
import re
import pandas as pd

In [None]:
df_benign = pd.read_csv('../Data/CSV_benign.csv')
df_malware = pd.read_csv('../Data/CSV_malware.csv')

# 'Country' column name is duplicated in malware csv, therefore I decided to rename both. While reading it, pandas reads duplicated column name with '.1' suffix
df_benign.rename(columns={'Country.1':'Country_1'}, inplace=True)
df_malware.rename(columns={'Country.1':'Country_1'}, inplace=True)

# Reindex columns
df_benign = df_benign.reindex(sorted(df_benign.columns), axis=1)
df_malware = df_malware.reindex(sorted(df_malware.columns), axis=1)
# df_malware.head(3)

In [None]:
# By digging into missmatch in dtypes, I was able to identify 'mixing' of columns data in 24 records of df_malware.
# To fix it, following steps are taken: 1. Identify incorrect rows by checking len of IP column,
#   2. Get records into new df
#   3. Rename columns
#   4. Drop incorrect rows from df_malware
#   5. Concatenate fixed data to df_malware

incorrect_rows_idx = df_malware.index[df_malware['IP'].str.len()==2]
df_incorrect_rows = df_malware.iloc[incorrect_rows_idx]

# Applies to df_malware only - rename of columns for 24 records
col_val_replace_to = {
    'Country': 'TTL',
    'TTL': 'Domain',
    'IP': 'Country',
    'Domain': 'IP',
}

df_incorrect_rows.rename(columns=col_val_replace_to, inplace=True) # Apply rename
df_malware.drop(incorrect_rows_idx, axis=0, inplace=True) # Drop from malware df incorrect rows
df_malware = pd.concat([df_malware, df_incorrect_rows], ignore_index=False) # Concatenate fixed data

In [16]:
# To mitigate missing values across similar columns like Domain, Domain_Name and Country, Country_1, following code is applied to df's
# The code also applies mapping to unify a bit entries
countries_map = {
    '-':'',
    "china":"CN",
    "Malaysia":'ID',
    "United States":"US",
    "TURKEY":'TR',
    'RUSSIA':'RU',
    'Russian Federation':'RU',
    'Belarus':'BY',
    'Korea':'KR',
}

def use_regex(input_text):
    return re.sub(r"b'(.+?).'", r"\1", input_text)

def impute_similar_cols(df):
    df["Country_1"].replace(countries_map, inplace=True)
    df["Country"].replace(countries_map, inplace=True)
    df["Country_1"].fillna(df["Country"], inplace=True)
    df["Country"].fillna(df["Country_1"], inplace=True)
    df["Domain_Name"].fillna(df["Domain"].apply(use_regex), inplace=True)
    df['Domain_Age'] = df['Domain_Age'].str.split(' ').str[0]
    return df

df_malware = impute_similar_cols(df_malware)
df_benign = impute_similar_cols(df_benign)

In [17]:
print(df_malware.shape, df_benign.shape)

(4999, 38) (494135, 38)


In [18]:
df_benign['is_threat'] = 0
df_malware['is_threat'] = 1

# Combine two dataframes
df = pd.concat([df_benign, df_malware])
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

# Manually select informative fields based on # of nulls it contain
percent_na = (df.isna().sum() / len(df)) * 100
columns_to_keep = percent_na[percent_na <= 30].index.tolist()
df = df[columns_to_keep]
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)

print('df shape: ', df.shape)
print(df.columns)

df shape:  (244072, 35)
Index(['1gram', '2gram', '3gram', 'ASN', 'Alexa_Rank', 'Country', 'Country_1',
       'Creation_Date_Time', 'Domain', 'Domain_Age', 'Domain_Name', 'IP',
       'Name_Server_Count', 'Page_Rank', 'Registrar', 'TTL',
       'char_distribution', 'dec_32', 'dec_8', 'entropy', 'hex_32', 'hex_8',
       'len', 'longest_word', 'numeric_percentage', 'obfuscate_at_sign',
       'oc_32', 'oc_8', 'puny_coded', 'shortened', 'sld', 'subdomain', 'tld',
       'typos', 'is_threat'],
      dtype='object')


In [19]:
# df.to_csv('./tmp/test.csv')

In [20]:
# Manual features selection based on previous experiments, numeric ones only
numerical_features = ['Alexa_Rank','ASN','Domain_Age','TTL','entropy','len','numeric_percentage','subdomain']
categorical_features = ['is_threat']
selected_cols = numerical_features + categorical_features

df = df[selected_cols]

def cols_to_num_drop(df, cols):
    for col in cols:
        df.loc[:,col] = pd.to_numeric(df[col], errors='coerce') # Should be number
        df[col] = df[col].astype('float64')
        df.dropna(subset=[col], inplace=True)

    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def cols_to_num_fill(df, cols):
    for col in cols:
        df.loc[:,col] = pd.to_numeric(df[col], errors='coerce') # Should be number
        df.loc[:,col].fillna(0, inplace = True)

    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def cols_to_cat(df, cols):
    for col in cols:
        df[col] = df[col].astype('category')
    return df

df = cols_to_num_drop(df, numerical_features)
df = cols_to_cat(df, categorical_features)
df.shape

(215120, 9)

In [21]:
print(f"{df['is_threat'].value_counts()[1]/df['is_threat'].value_counts()[0]*100:.3f}% of whole dataset domains are a threat")

1.285% of whole dataset domains are a threat


In [22]:
def train_valid_singleframe_proportional(df, ratio = 0.1):
    train = df.iloc[:int(len(df) * (1 - ratio))]
    valid = df.iloc[int(len(df) * (1 - ratio)):]
    return (train, valid)

def train_valid_singleframe_n_last(df, n_samples = 200):
    train = df.iloc[:int(len(df) - n_samples)]
    valid = df.iloc[int(len(df) - n_samples):]
    return (train, valid)

def get_validset(df, type):
    unique_groups = df['is_threat'].unique()
    ret_train = pd.DataFrame()
    ret_valid = pd.DataFrame()

    for group in unique_groups:
        selected_df = df[df['is_threat']==group]

        if type=='n_last':
            train, valid = train_valid_singleframe_n_last(selected_df)
        elif type=='proportional':
            train, valid = train_valid_singleframe_proportional(selected_df)
        else:
            raise('Only "n_last" or "proportional" is available')

        ret_train = pd.concat([ret_train, train], ignore_index=False)
        ret_valid = pd.concat([ret_valid, valid], ignore_index=False)

    ret_train.reset_index(drop=True, inplace=True)
    ret_valid.reset_index(drop=True, inplace=True)
    return (ret_train, ret_valid)


df, df_valid_n_last = get_validset(df, 'n_last')

In [23]:
X = df.drop(columns='is_threat')
y = df['is_threat']

In [24]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

label_encoder = LabelEncoder()
scaler = MinMaxScaler()
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

clf = XGBClassifier(
    scale_pos_weight=99,
    n_estimators=521,
    max_depth=4,
    learning_rate=0.06227012530557216,
    subsample=0.5823180676644505,
    colsample_bytree=0.8679467785467245,
    gamma=14
)

# Initialize variables to store evaluation metrics
accuracy_scores = []
balanced_acc_scores = []
f1_scores = []
roc_aucs = []

# scaler.fit(X)  # Fit on the training data
# X = pd.DataFrame(scaler.transform(X), columns=X.columns)  # Transform the training data

# Train and evaluate the classifier for each split
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]


    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    y_prob = clf.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    accuracy_scores.append(accuracy)
    balanced_acc_scores.append(balanced_acc)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

# Calculate and print average metrics
avg_accuracy = np.mean(accuracy_scores)
avg_balanced_acc = np.mean(balanced_acc_scores)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_aucs)

print(f"Average Accuracy: {avg_accuracy:.2f}")
print(f"Average Balanced Accuracy: {avg_balanced_acc:.2f}")
print(f"Average F1 Score: {avg_f1:.2f}")
print(f"Average ROC AUC: {avg_roc_auc:.2f}")

Average Accuracy: 0.82
Average Balanced Accuracy: 0.84
Average F1 Score: 0.89
Average ROC AUC: 0.92


In [25]:
X_valid = df_valid_n_last.drop(columns='is_threat')
y_valid = df_valid_n_last['is_threat']

In [26]:
# X_valid = pd.DataFrame(scaler.transform(X_valid), columns=X_valid.columns)
y_val_pred = clf.predict(X_valid)

val_accuracy = accuracy_score(y_valid, y_val_pred)
val_balanced_acc = balanced_accuracy_score(y_valid, y_val_pred)
val_f1 = f1_score(y_valid, y_val_pred, average='weighted')
y_val_prob = clf.predict_proba(X_valid)[:, 1]
val_roc_auc = roc_auc_score(y_valid, y_val_prob)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation Balanced Accuracy: {val_balanced_acc:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")
print(f"Validation ROC AUC: {val_roc_auc:.2f}")

Validation Accuracy: 0.80
Validation Balanced Accuracy: 0.80
Validation F1 Score: 0.80
Validation ROC AUC: 0.90


## References

Samaneh Mahdavifar, Nasim Maleki, Arash Habibi Lashkari, Matt Broda, Amir H. Razavi, “Classifying Malicious Domains using DNS Traffic Analysis”, The 19th IEEE International Conference on Dependable, Autonomic, and Secure Computing (DASC), Oct. 25-28, 2021, Calgary, Canada