### Feature selection- top 10 IV

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats

data = pd.read_csv('UNSW.csv')

X = data.drop(['id', 'label', 'attack_cat'], axis=1)  # Exclude 'id', 'label', and 'attack_cat' columns
y = data['label']

#one hot encoding the categorical variables
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

iv_threshold = 0.3 
iv_values = []

for col in X_train.columns:
    crosstab = pd.crosstab(X_train[col], y_train)
    chi2, _, _, _ = stats.chi2_contingency(crosstab)
    iv = chi2 / X_train.shape[0]
    if iv > iv_threshold:
        iv_values.append((col, iv))

sorted_iv_values = sorted(iv_values, key=lambda x: x[1], reverse=True)

# Select the top 10 variables with the highest IV values
top_10_variables = [feature for feature, _ in sorted_iv_values[:10]]

print("Top 10 variables with highest IV values:")
for feature, iv in sorted_iv_values[:10]:
    print(f"IV for {feature}: {iv:.4f}")

Top 10 variables with highest IV values:
IV for sload: 0.9558
IV for dur: 0.9166
IV for rate: 0.9141
IV for dload: 0.8384
IV for dinpkt: 0.8230
IV for sbytes: 0.7933
IV for sinpkt: 0.7580
IV for sjit: 0.6902
IV for sttl: 0.6788
IV for dbytes: 0.6721


### model of Logistic Regression 

In [29]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

data = pd.read_csv('UNSW.csv')

top_features = ['sload', 'dur', 'rate', 'dload', 'dinpkt', 'sbytes', 'sinpkt', 'sjit', 'sttl', 'dbytes']

X = data[top_features]
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


logistic_regression_model = LogisticRegression()


logistic_regression_model.fit(X_train, y_train)


y_pred = logistic_regression_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)


precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Overall Precision:", precision)
print("Overall Recall:", recall)
print("Overall F1-score:", f1)

Accuracy: 0.837948045282158
Overall Precision: 0.8357667675106653
Overall Recall: 0.837948045282158
Overall F1-score: 0.8364762716587316


#### After Iteration

In [31]:
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

data = pd.read_csv('UNSW.csv')

#  top 10 features with the highest IV values
top_features = ['sload', 'dur', 'rate', 'dload', 'dinpkt', 'sbytes', 'sinpkt', 'sjit', 'sttl', 'dbytes']
y = data['label']

# Split the data into train:test 80:20 
X_train, X_test, y_train, y_test = train_test_split(data[top_features], y, test_size=0.2, random_state=42)

best_accuracy = 0
best_logistic_regression_model = None

# Define hyperparameters to search
hyperparameters = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

for C in hyperparameters['C']:
    for penalty in hyperparameters['penalty']:
        for solver in hyperparameters['solver']:
            # Initialize the logistic regression model
            logistic_regression_model = LogisticRegression(C=C, penalty=penalty, solver=solver)
            
            # Fit the logistic regression model on the training data
            logistic_regression_model.fit(X_train, y_train)
            
            # Predict labels on the test set using the logistic regression model
            y_pred = logistic_regression_model.predict(X_test)
            
            # Evaluate the model's accuracy
            accuracy = accuracy_score(y_test, y_pred)
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_logistic_regression_model = logistic_regression_model


best_y_pred = best_logistic_regression_model.predict(X_test)
classification_rep = classification_report(y_test, best_y_pred, output_dict=True)


overall_precision = classification_rep['weighted avg']['precision']
overall_recall = classification_rep['weighted avg']['recall']
overall_f1_score = classification_rep['weighted avg']['f1-score']

# Print the best accuracy, overall precision, recall, and F1-score
print("Best Accuracy:", best_accuracy)
print("Overall Precision:", overall_precision)
print("Overall Recall:", overall_recall)
print("Overall F1-Score:", overall_f1_score)

Best Accuracy: 0.885112207362628
Overall Precision: 0.886804001985894
Overall Recall: 0.885112207362628
Overall F1-Score: 0.8810820446636886
