## Import libraries

In [181]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

In [182]:
# Load the training CSV and drop its "label" column, then load the test CSV.
df = pd.read_csv('../../dataset_train.csv')
df = df.drop(columns=["label"])
df_test = pd.read_csv('../../dataset_test.csv')
# Only the last expression of a cell is rendered, so this first head() call
# produces no visible output; the preview shown is the one for df_test.
df.head()
df_test.head()

Unnamed: 0,id,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,...,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,0,tcp,FIN,0.45498,534.0,268.0,254.0,252.0,2.0,1.0,...,0.0,0.0,0.0,5.0,5.0,2.0,2.0,2.0,1.0,2.0
1,1,tcp,FIN,0.648037,8854.0,268.0,254.0,252.0,4.0,1.0,...,0.0,,0.0,6.0,6.0,1.0,1.0,1.0,1.0,5.0
2,2,tcp,FIN,1.120856,3440.0,642.0,254.0,252.0,5.0,3.0,...,0.0,0.0,0.0,4.0,4.0,1.0,2.0,1.0,1.0,4.0
3,3,udp,INT,1e-06,244.0,0.0,254.0,,0.0,0.0,...,0.0,0.0,0.0,10.0,4.0,2.0,4.0,2.0,1.0,4.0
4,4,tcp,FIN,0.264763,1540.0,1644.0,31.0,29.0,4.0,4.0,...,,0.0,0.0,13.0,11.0,10.0,7.0,6.0,1.0,7.0


In [183]:
# Columns handled as categorical. The list also contains 'attack_cat', which
# later cells separate out as the prediction target.
categorical_features = ['proto', 'state', 'service','is_sm_ips_ports','is_ftp_login','attack_cat']
# Every other column of df is routed to the numeric preprocessing branch.
noncategorical_features = [col for col in df.columns.tolist() if col not in categorical_features]

In [184]:
# Snapshot of the training frame taken before the target column is encoded.
original_train = df.copy()
# Replace the attack_cat strings with integer class ids. The fitted encoder is
# kept so predictions can be mapped back to category names later.
le_attack_cat = LabelEncoder()
df['attack_cat'] = le_attack_cat.fit_transform(df['attack_cat'])

# 80/20 train/validation split with a fixed seed for reproducibility.
train_set, val_set = train_test_split(df, test_size=0.2, random_state=42)

In [185]:
from sklearn.impute import SimpleImputer

class FeatureImputer(BaseEstimator, TransformerMixin):
    """Thin scikit-learn compatible wrapper around ``SimpleImputer``.

    The wrapper follows the estimator contract: ``fit`` accepts an optional
    ``y`` and returns ``self``, so it can be used at any position of a
    ``Pipeline`` and chained as ``fit(X).transform(X)``.
    """

    def __init__(self, strategy='mean', fill_value=None):
        """
        Initialize the imputer for handling missing values.

        :param strategy: The strategy to use for imputation ('mean', 'median', 'most_frequent', 'constant').
                         Default is 'mean'.
        :param fill_value: The value to use for the 'constant' strategy. Default is None.
        """
        self.strategy = strategy
        self.fill_value = fill_value
        # Kept for backward compatibility with code that reads ``.imputer``
        # before fitting; ``fit`` rebuilds it from the current parameters.
        self.imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)

    def fit(self, X, y=None):
        """
        Fit the imputer to the data.

        :param X: Features data with missing values
        :param y: Ignored; present so the method matches the estimator API.
        :return: self
        """
        # Rebuild the wrapped imputer so parameters changed through
        # ``set_params`` after construction are honoured.
        self.imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """
        Transform the data by imputing the missing values.

        :param X: Features data with missing values
        :return: Data with missing values imputed
        """
        return self.imputer.transform(X)

    def fit_transform(self, X, y=None):
        """
        Fit the imputer and transform the data.

        :param X: Features data with missing values
        :param y: Ignored; present so the method matches the estimator API.
        :return: Data with missing values imputed
        """
        return self.fit(X, y).transform(X)

    def get_imputation_statistics(self):
        """
        Get the imputation statistics (e.g., mean or median values used for imputation).

        :return: The statistics used for imputation (depending on the strategy)
        """
        return self.imputer.statistics_

In [186]:
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Clip each column to percentile bounds learned during ``fit``.

    ``lower_percentile`` and ``upper_percentile`` are fractions in [0, 1];
    they are multiplied by 100 before being handed to ``np.percentile``.
    """

    def __init__(self, lower_percentile=0.01, upper_percentile=0.99):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Compute the per-column lower and upper clipping bounds from X."""
        data = X if isinstance(X, np.ndarray) else np.array(X)
        lower_q = self.lower_percentile * 100
        self.lower_bounds = np.percentile(data, lower_q, axis=0)
        upper_q = self.upper_percentile * 100
        self.upper_bounds = np.percentile(data, upper_q, axis=0)
        return self

    def transform(self, X):
        """Return X with every column limited to the fitted bounds."""
        data = X if isinstance(X, np.ndarray) else np.array(X)
        return np.clip(data, self.lower_bounds, self.upper_bounds)

In [187]:
class FeatureScaling(BaseEstimator, TransformerMixin):
    """Select a scaler by name: "standard" or "minmax".

    Any other ``method`` value leaves the scaler unset, in which case
    ``transform`` returns its input unchanged.
    """

    def __init__(self, method="standard"):
        self.method = method
        self.scaler = None

    def fit(self, X, y=None):
        """Fit the scaler matching ``self.method`` on X, if there is one."""
        known_scalers = (("standard", StandardScaler), ("minmax", MinMaxScaler))
        for name, scaler_cls in known_scalers:
            if self.method == name:
                self.scaler = scaler_cls().fit(X)
                break
        return self

    def transform(self, X):
        """Scale X with the fitted scaler, or pass it through when none is set."""
        if self.scaler:
            return self.scaler.transform(X)
        return X


In [188]:
class FeatureDiscretizer(BaseEstimator, TransformerMixin):
    """Bin each listed feature into integer bin indices.

    Bin edges are learned per feature in ``fit`` (equal-width for
    ``strategy='uniform'``, equal-frequency for ``strategy='quantile'``) and
    stored in ``self.discretizers`` keyed by feature name.

    :param features: Column names, in the order the columns appear in X.
    :param bins: Number of bins requested per feature.
    :param strategy: 'uniform' or 'quantile'.
    """

    def __init__(self, features, bins=10, strategy='uniform'):
        self.features = features
        self.bins = bins
        self.strategy = strategy
        self.discretizers = {}

    def fit(self, X, y=None):
        """Learn the bin edges of every feature from X.

        :raises ValueError: if ``strategy`` is neither 'uniform' nor 'quantile'.
        :return: self
        """
        if self.strategy not in ('uniform', 'quantile'):
            raise ValueError(f"Invalid strategy: {self.strategy}")

        X_df = pd.DataFrame(X, columns=self.features)

        # Build into a fresh dict so a refit never keeps edges from an
        # earlier fit on a different feature list.
        discretizers = {}
        for feature in self.features:
            column = X_df[feature]
            if self.strategy == 'uniform':
                edges = np.linspace(column.min(), column.max(), self.bins + 1)
            else:
                edges = np.quantile(column, np.linspace(0, 1, self.bins + 1))
            discretizers[feature] = edges

        self.discretizers = discretizers
        return self

    def transform(self, X):
        """Replace every feature value with the index of the bin it falls in.

        Values outside the range seen during ``fit`` are assigned to the first
        or last bin instead of becoming NaN, and a feature that was constant
        during ``fit`` maps to bin 0. Missing values stay missing.

        :return: 2-D array of bin indices, one column per feature.
        """
        X_df = pd.DataFrame(X, columns=self.features)

        for feature in self.features:
            # Collapse repeated edges up front (the original relied on
            # duplicates='drop' for this).
            edges = np.unique(self.discretizers[feature])
            column = X_df[feature]

            if len(edges) < 2:
                # A single edge cannot form an interval; pd.cut would return
                # NaN for every row. Use one bin for all non-missing values.
                X_df[feature] = column.where(column.isna(), 0)
                continue

            # pd.cut yields NaN for out-of-range values, so pull them onto the
            # outermost edges first.
            clipped = column.clip(lower=edges[0], upper=edges[-1])
            X_df[feature] = pd.cut(clipped, bins=edges, labels=False, include_lowest=True)

        return X_df.values

In [189]:
from imblearn.pipeline import Pipeline as ImbPipeline


# Categorical input columns: the categorical list minus the target.
categorical_without_target = [x for x in categorical_features if x != 'attack_cat']
# NOTE(review): onehot_features and label_features are not referenced by any
# code visible in this notebook section; confirm whether they are still needed.
onehot_features = ['service', 'proto']
label_features = ['state']

# Numeric branch: median imputation -> clip to the 1st/99th percentiles ->
# 10 equal-width bins -> standardisation of the bin indices.
numeric_transformer_id3 = Pipeline(steps=[
    ('imputer', FeatureImputer(strategy='median')),
    ('outlier_clipper', OutlierClipper(lower_percentile=0.01, upper_percentile=0.99)),
    ('discretizer', FeatureDiscretizer(features=noncategorical_features, bins=10, strategy='uniform')),
    ('scaler', FeatureScaling(method='standard'))
])

# Categorical branch: most-frequent imputation -> ordinal codes, with
# categories unseen during fit encoded as -1.
categorical_transformer_id3 = Pipeline(steps=[
    ('imputer', FeatureImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each column group to its branch; the output has the numeric columns
# first, followed by the categorical ones.
preprocessor_id3 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_id3, noncategorical_features),
        ('cat', categorical_transformer_id3, categorical_without_target)
    ]
)

# The imblearn pipeline currently wraps only the preprocessor; no resampling
# step is attached here.
pipe_id3 = ImbPipeline([
    ('preprocessor', preprocessor_id3),
])

In [190]:
# Separate features and the encoded target for both splits.
x_train_set_id3 = train_set.drop('attack_cat', axis=1)
y_train_set_id3 = train_set['attack_cat']
x_val_set_id3 = val_set.drop('attack_cat', axis=1)
y_val_set_id3 = val_set['attack_cat']
# Fit the preprocessing on the training split only ...
x_train_set_processed_id3 = pipe_id3.fit_transform(x_train_set_id3, y_train_set_id3)

# ... and reuse the fitted preprocessing on the validation split.
x_val_set_processed_id3 = pipe_id3.transform(x_val_set_id3)



In [174]:
from sklearn.tree import DecisionTreeClassifier

# Initialise a DecisionTreeClassifier with the 'entropy' criterion
id3 = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Fit the model on the training data
id3.fit(x_train_set_processed_id3, y_train_set_id3)

# Predict on the validation data
y_pred = id3.predict(x_val_set_processed_id3)

# Compute accuracy
accuracy = accuracy_score(y_val_set_id3, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_val_set_id3, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Classification report
class_report = classification_report(y_val_set_id3, y_pred, zero_division=0)
print(f"Classification Report:\n{class_report}")


Accuracy: 0.7354
Confusion Matrix:
[[  49   18   99  165   12    3   27   10    0    1]
 [  19   21  100  176   23    2    8   14    4    0]
 [ 112   98  775 1210   87   29   50   80   16    2]
 [ 150  143 1107 4330  298   77  196  303   18   14]
 [  19   40  144  431 1744   20  824  361   53    1]
 [   3    1   43   82   26 7803    7   10    0    0]
 [  31    5   57  196  923   17 9840  158   27    0]
 [   5   20  127  372  239    7   98 1190   10    2]
 [   0    3   13   23   68    1   34   81   36    0]
 [   0    0    1   13    2    0    1    8    1    2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.13      0.13       384
           1       0.06      0.06      0.06       367
           2       0.31      0.32      0.31      2459
           3       0.62      0.65      0.64      6636
           4       0.51      0.48      0.49      3637
           5       0.98      0.98      0.98      7975
           6       0.89      0.8

In [None]:
from collections import Counter


class ID3DecisionTree:
    """ID3-style decision tree with one branch per distinct feature value.

    After ``fit`` the tree is stored in ``self.tree`` as nested dicts of the
    form ``{feature_index: {feature_value: subtree_or_label}}``; a leaf is the
    bare class label. Features are compared by equality, so inputs are
    expected to hold a limited number of distinct values per column.
    """

    def __init__(self, max_depth=None, min_samples_split=2, min_gain=1e-4):
        """
        Initialize the ID3 Decision Tree.

        Parameters:
        - max_depth: Maximum depth of the tree (default: None, meaning no limit).
        - min_samples_split: Minimum samples required to split a node.
        - min_gain: Minimum information gain required for a split.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_gain = min_gain
        self.tree = None

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in y (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label collection y."""
        # np.unique works for any label type, not only non-negative integers,
        # and only reports labels that occur, so every probability is > 0.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, X_column, y):
        """Return the entropy reduction from splitting y on every value of X_column."""
        parent_impurity = self.entropy(y)
        values, counts = np.unique(X_column, return_counts=True)

        weighted_impurity = np.sum(
            [(count / len(X_column)) * self.entropy(y[X_column == value])
             for value, count in zip(values, counts)]
        )
        return parent_impurity - weighted_impurity

    def best_split(self, X, y):
        """Return the index of the feature with the highest gain, or None.

        None is returned when even the best gain is below ``min_gain``.
        """
        best_gain = -1
        best_feature = None

        for feature in range(X.shape[1]):
            gain = self.information_gain(X[:, feature], y)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature

        if best_gain < self.min_gain:
            return None
        return best_feature

    def build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples (X, y).

        Returns a class label for a leaf, or a one-key dict
        ``{feature_index: {value: subtree}}`` for an internal node.
        """
        # Pure node: nothing left to separate.
        if len(np.unique(y)) == 1:
            return y[0]
        # Stopping rules fall back to the majority label of the node.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_class(y)
        if len(y) < self.min_samples_split:
            return self._majority_class(y)

        feature = self.best_split(X, y)
        if feature is None:
            return self._majority_class(y)

        column = X[:, feature]
        tree = {feature: {}}
        for value in np.unique(column):
            mask = column == value
            tree[feature][value] = self.build_tree(X[mask], y[mask], depth + 1)

        return tree

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters:
        - X: 2-D array-like of shape (n_samples, n_features).
        - y: 1-D array-like of labels, one per row of X.

        Returns self, so calls can be chained.
        Raises ValueError for empty, non 2-D, or length-mismatched input.
        """
        # Coerce to arrays so lists and pandas objects are indexed
        # positionally rather than by label.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")

        self.tree = self.build_tree(X, y)
        return self

    def predict_sample(self, tree, sample):
        """Return the label predicted for one sample by walking ``tree``."""
        if not isinstance(tree, dict):
            return tree
        feature = next(iter(tree))
        value = sample[feature]
        subtree = tree[feature].get(value)
        if subtree is None:
            # Value not seen at this node during training: fall back to the
            # most common label among the leaves below this node (each leaf
            # counts once, regardless of how many samples reached it).
            return self._majority_class(self.get_all_leaves(tree))
        return self.predict_sample(subtree, sample)

    def get_all_leaves(self, tree):
        """Return the labels of all leaves under ``tree``, in traversal order."""
        if not isinstance(tree, dict):
            return [tree]
        # Both nesting levels ({feature: ...} and {value: ...}) are dicts, so
        # recursing over .values() walks through them uniformly.
        leaves = []
        for subtree in tree.values():
            leaves.extend(self.get_all_leaves(subtree))
        return leaves

    def predict(self, X):
        """Return an array with the predicted label for every row of X.

        Raises RuntimeError if the model has not been fitted.
        """
        if self.tree is None:
            raise RuntimeError("ID3DecisionTree is not fitted yet; call fit() first")
        X = np.asarray(X)
        return np.array([self.predict_sample(self.tree, sample) for sample in X])

    def evaluate(self, X, y_true):
        """Print accuracy and weighted precision/recall; return the predictions."""
        y_pred = self.predict(X)
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)

        print(f"Accuracy: {accuracy:.8f}")
        print(f"Precision: {precision:.8f}")
        print(f"Recall: {recall:.8f}")
        return y_pred


# Train the hand-written ID3 tree, limited to depth 5, on the preprocessed
# training split.
dtl = ID3DecisionTree(max_depth=5)
dtl.fit(x_train_set_processed_id3, y_train_set_id3.values)
# Predict on the validation data (this overwrites the y_pred of the sklearn
# baseline cell)
y_pred = dtl.predict(x_val_set_processed_id3)

# Compute accuracy
accuracy = accuracy_score(y_val_set_id3, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_val_set_id3, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Classification report
class_report = classification_report(y_val_set_id3, y_pred, zero_division=0)
print(f"Classification Report:\n{class_report}")

Accuracy: 0.7346
Confusion Matrix:
[[  27    7   72  257    6    1   10    4    0    0]
 [   6    5   65  226   28    1   23   13    0    0]
 [  21   35  431 1644  118   17  113   76    4    0]
 [  45   41  575 5101  340   23  360  146    5    0]
 [  10    8   57  384 1932   22  638  572   14    0]
 [   1    0    8  102   40 7764   17   43    0    0]
 [   3    0    0  269 1175   11 9381  412    3    0]
 [   4    6   62  527  296    2   75 1098    0    0]
 [   0    0    0    0   78    0   70   90   21    0]
 [   0    0    0   24    2    0    1    1    0    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.07      0.11       384
           1       0.05      0.01      0.02       367
           2       0.34      0.18      0.23      2459
           3       0.60      0.77      0.67      6636
           4       0.48      0.53      0.50      3637
           5       0.99      0.97      0.98      7975
           6       0.88      0.8

## EXPORT MODEL

In [202]:
import pickle
# Serialise the fitted ID3 tree (dtl) to disk. Only the model is written; the
# fitted preprocessing pipeline is not part of this file.
with open('../../model-id3.pkl', 'wb') as file:
    pickle.dump(dtl, file)

## IMPORT MODEL

In [203]:
# Read the pickled model back. pickle.load can execute arbitrary code, so it
# must only be used on files this notebook (or another trusted source) wrote.
with open ("../../model-id3.pkl", "rb") as file:
    loaded_model = pickle.load(file)

In [204]:
# Use the complete training frame (train + validation rows) for the final fit.
training_set = df.copy()
x_training_set = training_set.drop('attack_cat', axis=1)
y_training_set = training_set['attack_cat']

x_test_set = df_test.copy()

# Refit the preprocessing on all training rows, then apply it to the test set.
x_training_set_processed = pipe_id3.fit_transform(x_training_set, y_training_set)
x_test_set_processed = pipe_id3.transform(x_test_set)

# fit() rebuilds the tree from scratch, so the tree restored from the pickle
# is discarded here; only the hyper-parameters of the loaded object carry over.
loaded_model.fit(x_training_set_processed, y_training_set.values)






In [205]:
# Predict encoded class ids for every preprocessed test row.
y_test_predict = loaded_model.predict(x_test_set_processed)


In [206]:
print(y_test_predict)
# Map the integer class ids back to the original attack_cat names. The result
# is deliberately not called `reversed`, which would shadow the Python builtin
# for the rest of the kernel session.
attack_cat_labels = le_attack_cat.inverse_transform(y_test_predict)
print(attack_cat_labels)

# NOTE(review): ids are positional (0..n-1); if the submission has to carry
# the test file's own "id" column, use that column instead - confirm.
result_df = pd.DataFrame({
    "id": range(len(attack_cat_labels)),
    "attack_cat": attack_cat_labels
})

result_df.to_csv("predictions.csv", index=False)
# Last expression of the cell, so the preview is actually rendered.
result_df.head()


[6 6 4 ... 5 2 2]
['Normal' 'Normal' 'Fuzzers' ... 'Generic' 'DoS' 'DoS']
