# Classification of Malware by PE Headers

![HP](https://store.hp.com/app/assets/images/uploads/prod/how-to-remove-malware-on-windows-pc-hero1581530134837100.png)




In [None]:
import numpy as np
import pandas as pd


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as LGBM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import optuna
from optuna.samplers import TPESampler

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Style and figure size are configured in a single call: a separate
# sns.set(rc=...) is an alias of set_theme() with default arguments and would
# re-apply the default style, discarding "whitegrid".
sns.set_theme(style="whitegrid", rc={'figure.figsize': (10, 10)})

import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence every warning for the rest of the notebook. A preceding
# simplefilter('always') would be shadowed by this filter (the most recently
# added filter is consulted first), so only the effective call is kept.
warnings.filterwarnings("ignore")


In [None]:
# Lift pandas' display limits so wide/long frames are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Load the dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/classification-of-malwares/ClaMP_Integrated-5184.csv')

# EDA

In [None]:
# Names of all columns whose dtype is not `object`.
col = [name for name, dtype in df.dtypes.items() if dtype != 'object']
# Names of columns containing the substring 'FH_char'.
file_header = list(filter(lambda name: 'FH_char' in name, df.columns))
# Names of columns containing the substring 'OH_DLL'.
OH_DLL = list(filter(lambda name: 'OH_DLL' in name, df.columns))

In [None]:
# Pairwise correlations in long form: one row per (level_0, level_1) pair.
# Restrict to numeric/bool columns first: with pandas >= 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr = (
    df.select_dtypes(include=["number", "bool"])
    .corr()
    .stack()
    .reset_index(name="correlation")
)

# Scatter "heatmap": colour and marker size both encode the correlation value.
g = sns.relplot(
    data=corr,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), edgecolor=".7",
    height=10, sizes=(50, 250), size_norm=(-.2, .8))
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(.02)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)

# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; look up whichever attribute this version provides.
_handles = getattr(g.legend, "legend_handles", None)
if _handles is None:
    _handles = g.legend.legendHandles
for artist in _handles:
    artist.set_edgecolor(".7")

    

In [None]:
# Notebook shell escape: installs the dabl package imported by the next cell.
# NOTE(review): `%pip install dabl` is the IPython magic meant to install into
# the running kernel's environment -- consider switching; confirm on the target platform.
!pip install dabl


In [None]:
import dabl

# Automatic exploratory plots of the frame, with 'class' passed as the target.
dabl.plot(df, 'class')

## Dataset Split

In [None]:
# Feature matrix: the non-object columns listed in `col`, minus the label.
X = df.loc[:, col].drop('class', axis=1)
# Target vector: the 'class' column.
y = df.loc[:, 'class']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the reported metrics are reproducible
# between runs; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# CatBoost Hyperparameter Tuning

In [None]:
# Disabled Optuna search for CatBoost hyperparameters. It is kept as a bare
# string literal, so the code inside is never executed.
# NOTE(review): the objective scores every trial on X_test / y_test, the same
# hold-out the final classification report uses; re-enabling it as-is would
# tune on the test set. Carve out a separate validation split first.
'''import optuna
def objective(trial):

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**param)

    gbm.fit(X_train, y_train, verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15, timeout=600)
trial = study.best_trial
cat_params = trial.params'''

# Hard-coded CatBoost hyperparameters, later unpacked into CatBoostClassifier.
# Presumably the best trial of an earlier Optuna run -- not verifiable from here.
cat_params = dict(
    objective='CrossEntropy',
    colsample_bylevel=0.07720247769141655,
    depth=10,
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    bagging_temperature=8.769937329955644,
)

# LGBM Hyperparameter Tuning

In [None]:
# Disabled Optuna search for LightGBM hyperparameters. It is kept as a bare
# string literal, so the code inside is never executed.
# NOTE(review): it calls lgbm.Dataset / lgbm.train, but this file imports the
# lightgbm module under the name LGBM and later binds `lgbm` to an
# LGBMClassifier instance; the module name must be fixed before re-enabling.
# NOTE(review): every trial re-splits X, y with no random_state, so trials are
# scored on different random hold-outs.
'''import optuna
def objective1(trial):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
    dtrain = lgbm.Dataset(X_train, label=y_train)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lgbm.train(param, dtrain)
    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy
study1 = optuna.create_study(direction="maximize")
study1.optimize(objective1, n_trials=100)
trial1 = study1.best_trial
lgbm_params = trial1.params'''


# Hard-coded LightGBM hyperparameters, later unpacked into LGBMClassifier.
# Presumably the best trial of an earlier Optuna run -- not verifiable from here.
lgbm_params = dict(
    lambda_l1=2.9518538351373698e-08,
    lambda_l2=0.07086960074688683,
    num_leaves=131,
    feature_fraction=0.5523182028151434,
    bagging_fraction=0.6074722133685865,
    bagging_freq=4,
    min_child_samples=26,
)

In [None]:
# Build the two (still unfitted) gradient-boosting models from the
# hard-coded parameter dictionaries.
lgbm = LGBMClassifier(**lgbm_params)
cb = CatBoostClassifier(**cat_params)

## LGBM Tree

In [None]:
# Fit the LightGBM classifier on the training split, then draw one of its
# trees on a very large (100x100 inch) canvas.
lgbm.fit(X_train, y_train)
LGBM.plot_tree(lgbm, figsize=(100, 100))

# Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

# Stack CatBoost and LightGBM as base learners; the same `cb` object is also
# supplied as the final (meta) estimator.
est = [
    ('catboost', cb),
    ('lgbm', lgbm),
]
sclf = StackingClassifier(estimators=est, final_estimator=cb)
sclf.fit(X_train, y_train)
print('Stacking Completed')


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the stacked model on the hold-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=sclf.predict(X_test),
    )
)