### Pre-processing Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

df = pd.read_csv('https://raw.githubusercontent.com/pongsapaks/Data-science/main/log2.csv')

X = df.drop(['Action'], axis=1)
y = df['Action']

#data with no EDA
X_trainBasic, X_testBasic, y_trainBasic, y_testBasic = train_test_split(X, y,train_size=0.8, test_size=0.2, random_state=42)

## handle imbalanced data
sm = SMOTE(sampling_strategy='auto')
X_resampled, y_resampled = sm.fit_resample(X, y)

## features selection
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=4)
sel.fit(X_resampled, y_resampled)
selected_feat= X_resampled.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)
 
X_tree = X_resampled.loc[:,selected_feat]
X_tree

## Split data for learning 
X_trainTree, X_testTree, y_trainTree, y_testTree = train_test_split(X_tree, y_resampled,train_size=0.8, test_size=0.2, random_state=42)

4
Index(['Destination Port', 'Bytes', 'Bytes Sent', 'Elapsed Time (sec)'], dtype='object')


### Training data NO EDA, NO Gridsearch
* no imbalance handling
* no feature selection

In [3]:
## training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_trainBasic, y_trainBasic)

## find y predict
y_pred = clf.predict(X_testBasic)

#measure accuracy
accuracy = accuracy_score(y_testBasic, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_testBasic, y_pred))

Accuracy: 0.9984740978103304
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      7545
        deny       1.00      1.00      1.00      2994
        drop       1.00      1.00      1.00      2562
  reset-both       1.00      0.67      0.80         6

    accuracy                           1.00     13107
   macro avg       1.00      0.92      0.95     13107
weighted avg       1.00      1.00      1.00     13107



### Training EDA data without Gridsearch

In [4]:
## training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_trainTree, y_trainTree)

## find y predict
y_pred = clf.predict(X_testTree)

#measure accuracy
accuracy = accuracy_score(y_testTree, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_testTree, y_pred))

Accuracy: 0.9889080765143464
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      7526
        deny       0.98      0.97      0.98      7503
        drop       1.00      1.00      1.00      7528
  reset-both       0.98      0.98      0.98      7555

    accuracy                           0.99     30112
   macro avg       0.99      0.99      0.99     30112
weighted avg       0.99      0.99      0.99     30112



### Train data using Gridsearch no pruning

In [None]:
#Forest EDA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
grid_search.fit(X_trainTree, y_trainTree)

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)
print(classification_report(y_testTree, y_pred))

KeyboardInterrupt: ignored

### Pipeline

In [None]:
#Forest EDA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Define the pipeline with a preprocessing step and a Random Forest Classifier estimator
pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])
# Define the parameter grid for Grid Search
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 5, 10],
    'classifier__max_features': ['auto', 'sqrt']
}

# Create a Grid Search object with the pipeline
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)

# Train the model on the training data
grid_search.fit(X_trainTree, y_trainTree)

# Predict the classes of the test data using the best estimator found by Grid Search
y_pred = grid_search.best_estimator_.predict(X_testTree)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_testTree, y_pred)
# Print the best hyperparameters found by Grid Search, as well as the accuracy of the model
print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print(classification_report(y_testTree, y_pred))

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best hyperparameters: {'classifier__max_depth': None, 'classifier__max_features': 'auto', 'classifier__n_estimators': 50}
Accuracy: 0.9889744952178533
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      7526
        deny       0.98      0.97      0.98      7503
        drop       1.00      1.00      1.00      7528
  reset-both       0.98      0.98      0.98      7555

    accuracy                           0.99     30112
   macro avg       0.99      0.99      0.99     30112
weighted avg       0.99      0.99      0.99     30112



### Forest with EDA with Prunning to decrease overfitting
 * using ccp_alpha. The accuracy decreases from 0.98 to 0.91

In [None]:
#forest with EDA #with Prunning to decrease overfitting #ccp_alpha #it decrease accuracy from 0.98 to 0.91

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt'],
    'ccp_alpha': [0.0175]
}

# Create a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Create a Grid Search object
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5)

# Train the model on the training data
grid_search.fit(X_trainTree, y_trainTree)

# Predict the classes of the test data using the best estimator found by Grid Search
y_pred = grid_search.best_estimator_.predict(X_testTree)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_testTree, y_pred)

# Print the best hyperparameters and accuracy score
print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print(classification_report(y_testTree, y_pred))

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best hyperparameters: {'ccp_alpha': 0.0175, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 50}
Accuracy: 0.9117295430393199
              precision    recall  f1-score   support

       allow       0.99      1.00      0.99      7526
        deny       0.84      0.80      0.82      7503
        drop       1.00      1.00      1.00      7528
  reset-both       0.82      0.85      0.83      7555

    accuracy                           0.91     30112
   macro avg       0.91      0.91      0.91     30112
weighted avg       0.91      0.91      0.91     30112



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
grid_search.fit(X_trainTree, y_trainTree)

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)


Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.9886507115468666


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# calculate the confusion matrix for the best classifier
cm = confusion_matrix(y_testTree, y_pred)

# plot the confusion matrix as a heatmap
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=['0', '1'], yticklabels=['0', '1'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# get the best classifier from the dictionary
best_clf = grid_search.best_estimator_

# calculate the predicted probabilities for each class using the classifier object
y_prob = best_clf.predict_proba(X_testTree)

# calculate the multiclass AUC score
roc_auc = roc_auc_score(y_testTree, y_prob, multi_class='ovr')
roc_auc