# Individual Assignment: Linear Classifier
## Network Intrusion Detection using Linear Models

**Author:** Muhammad Usama Fazal  
**TP Number:** TP086008  

**Classifier Category:** Linear  
**Algorithms Evaluated:** Linear Discriminant Analysis (LDA), Logistic Regression, Ridge Classifier  
**Dataset:** NSL-KDD (Boosted Train + Preprocessed Test)  
**Classification:** Multi-class (5 attack categories)

---
## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import warnings
warnings.filterwarnings('ignore')

import os
data_path = '../data'



In [2]:
# Import local library (provided helper functions)
import sys

# The guard must test the same entry that is inserted; otherwise every
# re-run of this cell pushes another duplicate '..' onto sys.path.
# NOTE(review): presumably mylib sits one directory above the notebook - confirm.
if ".." not in sys.path:
    sys.path.insert(0, '..')

from mylib import show_labels_dist, show_metrics, bias_var_metrics

In [3]:
# Additional imports for models and evaluation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, matthews_corrcoef, confusion_matrix,
                             classification_report, ConfusionMatrixDisplay,
                             roc_curve, auc, roc_auc_score)
import json

---
## 2. Load Dataset

In [4]:
# Read the boosted training split, then the preprocessed test split,
# reporting the shape of each frame right after it is loaded.
data_file = os.path.join(data_path, 'NSL_boosted-2.csv')
train_df = pd.read_csv(data_file)
print('Train Dataset: {} rows, {} columns'.format(*train_df.shape))

data_file = os.path.join(data_path, 'NSL_ppTest.csv')
test_df = pd.read_csv(data_file)
print('Test Dataset: {} rows, {} columns'.format(*test_df.shape))

Train Dataset: 63280 rows, 43 columns
Test Dataset: 22544 rows, 43 columns


In [5]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,atakcat
0,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.03,0.05,0.0,0.0,1.0,0.37,normal,benign
1,0,tcp,ftp_data,SF,190,0,0,0,0,0,...,0.22,0.03,0.22,0.0,0.0,0.0,0.0,0.0,normal,benign
2,0,tcp,nnsp,S0,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune,dos
3,0,tcp,http,SF,207,342,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,benign
4,0,icmp,ecr_i,SF,1480,0,0,1,0,0,...,0.07,0.08,0.07,0.0,0.54,0.0,0.01,0.0,pod,dos


In [6]:
# Column names, non-null counts and dtypes of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63280 entries, 0 to 63279
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     63280 non-null  int64  
 1   protocol_type                63280 non-null  object 
 2   service                      63280 non-null  object 
 3   flag                         63280 non-null  object 
 4   src_bytes                    63280 non-null  int64  
 5   dst_bytes                    63280 non-null  int64  
 6   land                         63280 non-null  int64  
 7   wrong_fragment               63280 non-null  int64  
 8   urgent                       63280 non-null  int64  
 9   hot                          63280 non-null  int64  
 10  num_failed_logins            63280 non-null  int64  
 11  logged_in                    63280 non-null  int64  
 12  num_compromised              63280 non-null  int64  
 13  root_shell      

---
## 3. Data Preparation

In [7]:
# Compare the names of the float64/int64 columns in the two splits
trnn = train_df.select_dtypes(include=['float64','int64']).columns
tstn = test_df.select_dtypes(include=['float64','int64']).columns
trndif = np.setdiff1d(trnn, tstn)   # in train only
tstdif = np.setdiff1d(tstn, trnn)   # in test only

for message, missing in (
    ("Numeric features in train_set not in test_set: ", trndif),
    ("Numeric features in test_set not in train_set: ", tstdif),
):
    print(message, missing if len(missing) != 0 else 'None')

Numeric features in train_set not in test_set:  None
Numeric features in test_set not in train_set:  None


In [8]:
# List the object-dtype (categorical) columns of each split side by side
trnn = train_df.select_dtypes(include=['object']).columns
tstn = test_df.select_dtypes(include=['object']).columns
for message, cols in (
    ("Categorical features in train:", trnn),
    ("Categorical features in test:", tstn),
):
    print(message, cols.tolist())

Categorical features in train: ['protocol_type', 'service', 'flag', 'label', 'atakcat']
Categorical features in test: ['protocol_type', 'service', 'flag', 'label', 'atakcat']


In [9]:
# Total number of null cells in each split
for message, frame in (
    ('Missing Values - Train Set:', train_df),
    ('Missing Values - Test Set:', test_df),
):
    print(message, frame.isnull().sum().sum())

Missing Values - Train Set: 0
Missing Values - Test Set: 0


In [10]:
# Combine datasets for consistent preprocessing.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# the train and test row labels overlap, so any label-based selection or
# assignment on the combined frame would be ambiguous.
combined_df = pd.concat([train_df, test_df], ignore_index=True)
print('Combined Dataset: {} rows, {} columns'.format(combined_df.shape[0], combined_df.shape[1]))

Combined Dataset: 85824 rows, 43 columns


In [11]:
# Frequency of every raw label and every attack category over train + test
for heading, column in (
    ("Label distribution in combined dataset:", 'label'),
    ("\nAttack category distribution:", 'atakcat'),
):
    print(heading)
    print(combined_df[column].value_counts())

Label distribution in combined dataset:
label
normal             43383
neptune            25264
satan               2552
smurf               1988
ipsweep             1941
portsweep           1623
guess_passwd        1257
mscan               1046
warezmaster          954
back                 837
nmap                 820
apache2              774
processtable         719
teardrop             458
warezclient          445
snmpguess            364
saint                350
mailbomb             322
snmpgetattack        196
httptunnel           146
pod                  142
buffer_overflow       35
named                 25
ps                    22
multihop              21
sendmail              21
xterm                 20
rootkit               18
land                  16
xlock                 14
xsnoop                 8
ftp_write              7
imap                   6
loadmodule             6
phf                    6
perl                   5
worm                   4
sqlattack              4
udps

In [12]:
# Target selection.
#   twoclass = True  -> binary target: 'normal' vs everything else ('attack')
#   twoclass = False -> multi-class target from 'atakcat'
#                       (benign, dos, probe, r2l, u2r)
twoclass = False

if not twoclass:
    labels_df = combined_df['atakcat'].copy()
else:
    labels_df = combined_df['label'].copy()
    labels_df[labels_df != 'normal'] = 'attack'

# Both target columns leave the feature frame, whichever one was chosen
combined_df.drop(columns=['label', 'atakcat'], inplace=True)

classification_type = 'Binary' if twoclass else 'Multi-class (5 categories)'
print(f"Classification type: {classification_type}")
print("\nClass distribution:")
print(labels_df.value_counts())

Classification type: Multi-class (5 categories)

Class distribution:
atakcat
benign    43383
dos       30524
probe      8332
r2l        3329
u2r         256
Name: count, dtype: int64


In [13]:
# Collect the remaining object-dtype columns; these get one-hot encoded next
categori = combined_df.select_dtypes(include=['object']).columns
category_cols = categori.tolist()
print("Categorical columns to encode:", category_cols)

Categorical columns to encode: ['protocol_type', 'service', 'flag']


In [14]:
# One-hot encode the categorical columns; numeric columns pass through unchanged
features_df = pd.get_dummies(combined_df, columns=category_cols)
n_rows, n_cols = features_df.shape
print('Features after encoding: {} columns'.format(n_cols))

Features after encoding: 122 columns


In [15]:
# Names of the float64/int64 columns of the pre-encoding frame; only these are scaled
numeri = combined_df.select_dtypes(include=['float64','int64']).columns
print("Numeric columns for scaling:", len(numeri.tolist()), "features")

Numeric columns for scaling: 38 features


In [16]:
# Undo the concatenation: the first len(train_df) rows are the training
# split, the remainder the test split. Each piece gets a fresh 0-based index.
n_train = len(train_df)

X_train = features_df.iloc[:n_train, :].copy().reset_index(drop=True)
X_test = features_df.iloc[n_train:, :].copy().reset_index(drop=True)

y_train = labels_df[:n_train].copy().reset_index(drop=True)
y_test = labels_df[n_train:].copy().reset_index(drop=True)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (63280, 122), y_train: (63280,)
X_test: (22544, 122), y_test: (22544,)


In [17]:
# Min-max scale each numeric column. The scaler for a column is fitted on the
# training values only and then reused for the matching test column, so test
# values outside the training range can fall outside [0, 1].
for col in numeri:
    train_values = np.array(X_train[col]).reshape(-1, 1)
    col_scaler = MinMaxScaler().fit(train_values)
    X_train[col] = col_scaler.transform(train_values)

    test_values = np.array(X_test[col]).reshape(-1, 1)
    X_test[col] = col_scaler.transform(test_values)

print("Scaling completed using MinMaxScaler (0-1 range)")

Scaling completed using MinMaxScaler (0-1 range)


In [18]:
# Keep untouched copies of the prepared data before any optimisation step
X_train_original, X_test_original, y_train_original = (
    X_train.copy(),
    X_test.copy(),
    y_train.copy(),
)

# Fixed class order used for every report and confusion matrix
class_labels = ['benign', 'dos', 'probe', 'r2l', 'u2r']
print(f"Class labels: {class_labels}")

Class labels: ['benign', 'dos', 'probe', 'r2l', 'u2r']


In [19]:
# Show label distribution
# show_labels_dist comes from the local mylib helper module; it is given the
# prepared feature frames and label series of both splits.
show_labels_dist(X_train, X_test, y_train, y_test)

features_train: 63280 rows, 122 columns
features_test:  22544 rows, 122 columns

labels_train: 63280 rows, 1 column
labels_test:  22544 rows, 1 column

Frequency and Distribution of labels
         atakcat  %_train  atakcat  %_test
atakcat                                   
benign     33672    53.21     9711   43.08
dos        23066    36.45     7458   33.08
probe       5911     9.34     2421   10.74
r2l          575     0.91     2754   12.22
u2r           56     0.09      200    0.89


---
## 4. BASELINE MODEL COMPARISON

### Linear Algorithms to Evaluate:
1. **Linear Discriminant Analysis (LDA)** - Dimensionality reduction + classification
2. **Logistic Regression** - Probabilistic linear classifier
3. **Ridge Classifier** - L2-regularized linear classifier

We will compare all three baselines and select the best one for optimization.

### 4.1 Baseline 1: Linear Discriminant Analysis (LDA)

In [20]:
print("="*60)
print("BASELINE 1: LINEAR DISCRIMINANT ANALYSIS (LDA)")
print("="*60)

# Create baseline model with default parameters
lda_baseline = LinearDiscriminantAnalysis()
print("Model:", lda_baseline)
print("\nDefault Parameters:", lda_baseline.get_params())

# Time the fit only, so the reported "Training Time" excludes inference
trs = time()
lda_baseline.fit(X_train, y_train)
lda_train_time = time() - trs

# Predict on the held-out test split (not timed)
y_pred_lda = lda_baseline.predict(X_test)

print(f"\nTraining Time: {lda_train_time:.2f} seconds\n")
# Trailing ';' stops the notebook echoing show_metrics' return value ('~~~~')
show_metrics(y_test, y_pred_lda, class_labels);

BASELINE 1: LINEAR DISCRIMINANT ANALYSIS (LDA)
Model: LinearDiscriminantAnalysis()

Default Parameters: {'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}

Training Time: 1.67 seconds

              pred:benign  pred:dos  pred:probe  pred:r2l  pred:u2r
train:benign         9308        85         280        22        16
train:dos            1327      5607         524         0         0
train:probe           497       176        1748         0         0
train:r2l            2079         0          16       649        10
train:u2r             155         0           0        10        35

~~~~
      benign :  FPR = 0.316   FNR = 0.041
         dos :  FPR = 0.017   FNR = 0.248
       probe :  FPR = 0.041   FNR = 0.278
         r2l :  FPR = 0.002   FNR = 0.764
         u2r :  FPR = 0.001   FNR = 0.825

   macro avg :  FPR = 0.075   FNR = 0.431
weighted avg :  FPR = 0.058   FNR = 0.231

~~~~
    

'~~~~'

In [21]:
# Summary metrics of the LDA baseline on the test split
lda_metrics = dict(
    accuracy=accuracy_score(y_test, y_pred_lda),
    f1_weighted=f1_score(y_test, y_pred_lda, average='weighted'),
    f1_macro=f1_score(y_test, y_pred_lda, average='macro'),
    mcc=matthews_corrcoef(y_test, y_pred_lda),
    train_time=lda_train_time,
)
print("LDA Baseline Metrics:", lda_metrics)

LDA Baseline Metrics: {'accuracy': 0.7694730305180979, 'f1_weighted': 0.749670783119208, 'f1_macro': 0.5990038319555226, 'mcc': 0.6643994296610017, 'train_time': 1.671619176864624}


### 4.2 Baseline 2: Logistic Regression

In [22]:
print("="*60)
print("BASELINE 2: LOGISTIC REGRESSION")
print("="*60)

# Baseline model: library defaults except for a higher iteration cap and
# balanced class weights (severe class imbalance, U2R: 0.09% of training rows)
lr_baseline = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
print("Model:", lr_baseline)
print("\nKey Parameters:")
print(f"  - max_iter: 1000 (increased for convergence)")
print(f"  - class_weight: balanced (handles imbalance)")

# Time the fit only, so the reported "Training Time" excludes inference
trs = time()
lr_baseline.fit(X_train, y_train)
lr_train_time = time() - trs

# Predict on the held-out test split (not timed)
y_pred_lr = lr_baseline.predict(X_test)

print(f"\nTraining Time: {lr_train_time:.2f} seconds\n")
# Trailing ';' stops the notebook echoing show_metrics' return value ('~~~~')
show_metrics(y_test, y_pred_lr, class_labels);

BASELINE 2: LOGISTIC REGRESSION
Model: LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

Key Parameters:
  - max_iter: 1000 (increased for convergence)
  - class_weight: balanced (handles imbalance)

Training Time: 18.18 seconds

              pred:benign  pred:dos  pred:probe  pred:r2l  pred:u2r
train:benign         8499       101         667       361        83
train:dos             901      6125          37       345        50
train:probe            84        74        2177        25        61
train:r2l             817         5           2      1482       448
train:u2r               7         0           0        11       182

~~~~
      benign :  FPR = 0.141   FNR = 0.125
         dos :  FPR = 0.012   FNR = 0.179
       probe :  FPR = 0.035   FNR = 0.101
         r2l :  FPR = 0.037   FNR = 0.462
         u2r :  FPR = 0.029   FNR = 0.090

   macro avg :  FPR = 0.051   FNR = 0.191
weighted avg :  FPR = 0.045   FNR = 0.181

~~~~
              precision    r

'~~~~'

In [23]:
# Summary metrics of the Logistic Regression baseline on the test split
lr_metrics = dict(
    accuracy=accuracy_score(y_test, y_pred_lr),
    f1_weighted=f1_score(y_test, y_pred_lr, average='weighted'),
    f1_macro=f1_score(y_test, y_pred_lr, average='macro'),
    mcc=matthews_corrcoef(y_test, y_pred_lr),
    train_time=lr_train_time,
)
print("Logistic Regression Baseline Metrics:", lr_metrics)

Logistic Regression Baseline Metrics: {'accuracy': 0.8190649396735273, 'f1_weighted': 0.8242514208265618, 'f1_macro': 0.702187885106248, 'mcc': 0.738369795439805, 'train_time': 18.179671049118042}


### 4.3 Baseline 3: Ridge Classifier

In [24]:
print("="*60)
print("BASELINE 3: RIDGE CLASSIFIER")
print("="*60)

# Baseline model: library defaults (alpha=1.0) plus balanced class weights
ridge_baseline = RidgeClassifier(class_weight='balanced', random_state=42)
print("Model:", ridge_baseline)
print("\nKey Parameters:")
print(f"  - alpha: 1.0 (default regularization)")
print(f"  - class_weight: balanced")

# Time the fit only, so the reported "Training Time" excludes inference
trs = time()
ridge_baseline.fit(X_train, y_train)
ridge_train_time = time() - trs

# Predict on the held-out test split (not timed)
y_pred_ridge = ridge_baseline.predict(X_test)

print(f"\nTraining Time: {ridge_train_time:.2f} seconds\n")
# Trailing ';' stops the notebook echoing show_metrics' return value ('~~~~')
show_metrics(y_test, y_pred_ridge, class_labels);

BASELINE 3: RIDGE CLASSIFIER
Model: RidgeClassifier(class_weight='balanced', random_state=42)

Key Parameters:
  - alpha: 1.0 (default regularization)
  - class_weight: balanced

Training Time: 0.58 seconds

              pred:benign  pred:dos  pred:probe  pred:r2l  pred:u2r
train:benign         8242        87         617       610       155
train:dos            1195      5534          20       142       567
train:probe            37       168        1898        29       289
train:r2l             766         1          23      1530       434
train:u2r               0         0           1        16       183

~~~~
      benign :  FPR = 0.156   FNR = 0.151
         dos :  FPR = 0.017   FNR = 0.258
       probe :  FPR = 0.033   FNR = 0.216
         r2l :  FPR = 0.040   FNR = 0.444
         u2r :  FPR = 0.065   FNR = 0.085

   macro avg :  FPR = 0.062   FNR = 0.231
weighted avg :  FPR = 0.057   FNR = 0.229

~~~~
              precision    recall  f1-score   support

      benign      0.80

'~~~~'

In [25]:
# Summary metrics of the Ridge baseline on the test split
ridge_metrics = dict(
    accuracy=accuracy_score(y_test, y_pred_ridge),
    f1_weighted=f1_score(y_test, y_pred_ridge, average='weighted'),
    f1_macro=f1_score(y_test, y_pred_ridge, average='macro'),
    mcc=matthews_corrcoef(y_test, y_pred_ridge),
    train_time=ridge_train_time,
)
print("Ridge Classifier Baseline Metrics:", ridge_metrics)

Ridge Classifier Baseline Metrics: {'accuracy': 0.7712473385379702, 'f1_weighted': 0.7894896189683033, 'f1_macro': 0.6452765153006731, 'mcc': 0.6755067909070548, 'train_time': 0.5796947479248047}


### 4.4 Baseline Comparison Summary

In [26]:
# One row per baseline, one column per stored metric
baseline_rows = [
    ('LDA', lda_metrics),
    ('Logistic Regression', lr_metrics),
    ('Ridge Classifier', ridge_metrics),
]
baseline_comparison = pd.DataFrame({
    'Algorithm': [name for name, _ in baseline_rows],
    'Accuracy': [m['accuracy'] for _, m in baseline_rows],
    'F1 (Weighted)': [m['f1_weighted'] for _, m in baseline_rows],
    'F1 (Macro)': [m['f1_macro'] for _, m in baseline_rows],
    'MCC': [m['mcc'] for _, m in baseline_rows],
    'Train Time (s)': [m['train_time'] for _, m in baseline_rows],
})

print("\n" + "="*70)
print("BASELINE COMPARISON: LINEAR CLASSIFIERS")
print("="*70)
print(baseline_comparison.to_string(index=False))


BASELINE COMPARISON: LINEAR CLASSIFIERS
          Algorithm  Accuracy  F1 (Weighted)  F1 (Macro)      MCC  Train Time (s)
                LDA  0.769473       0.749671    0.599004 0.664399        1.671619
Logistic Regression  0.819065       0.824251    0.702188 0.738370       18.179671
   Ridge Classifier  0.771247       0.789490    0.645277 0.675507        0.579695


In [27]:
# Calculate MCC per class for each baseline
print("\n" + "="*70)
print("MCC PER ATTACK CLASS")
print("="*70)

def calculate_mcc_per_class(y_true, y_pred, classes):
    """Calculate MCC for each class (one-vs-rest).

    Args:
        y_true: True labels (any array-like).
        y_pred: Predicted labels, aligned with ``y_true``.
        classes: Iterable of class labels to score.

    Returns:
        dict mapping each class label to the MCC of the binary problem
        "sample belongs to this class" vs "sample belongs to any other class".
    """
    # Coerce to arrays so `== cls` is element-wise for plain lists too
    # (a list compared with a string yields a single bool, not a mask).
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    return {cls: matthews_corrcoef(true_arr == cls, pred_arr == cls) for cls in classes}

lda_mcc_class = calculate_mcc_per_class(y_test, y_pred_lda, class_labels)
lr_mcc_class = calculate_mcc_per_class(y_test, y_pred_lr, class_labels)
ridge_mcc_class = calculate_mcc_per_class(y_test, y_pred_ridge, class_labels)

mcc_per_class_df = pd.DataFrame({
    'Attack Class': class_labels,
    'LDA': [lda_mcc_class[c] for c in class_labels],
    'Logistic Reg': [lr_mcc_class[c] for c in class_labels],
    'Ridge': [ridge_mcc_class[c] for c in class_labels]
})
print(mcc_per_class_df.to_string(index=False))


MCC PER ATTACK CLASS
Attack Class      LDA  Logistic Reg    Ridge
      benign 0.647364      0.729828 0.689253
         dos 0.787598      0.848406 0.780853
       probe 0.663910      0.801121 0.733110
         r2l 0.447782      0.549787 0.554618
         u2r 0.313794      0.440362 0.308046


In [28]:
# Bias-variance decomposition of each baseline configuration.
# Fresh, unfitted estimators with the same settings as the baselines are
# handed to the mylib helper.
print("\n" + "="*70)
print("BIAS-VARIANCE DECOMPOSITION (5-fold bootstrap)")
print("="*70)

lda_for_bv = LinearDiscriminantAnalysis()
lr_for_bv = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
ridge_for_bv = RidgeClassifier(class_weight='balanced', random_state=42)

print("\nLDA:")
bias_var_metrics(X_train, X_test, y_train, y_test, lda_for_bv, folds=5)

print("\nLogistic Regression:")
bias_var_metrics(X_train, X_test, y_train, y_test, lr_for_bv, folds=5)

print("\nRidge Classifier:")
bias_var_metrics(X_train, X_test, y_train, y_test, ridge_for_bv, folds=5)


BIAS-VARIANCE DECOMPOSITION (5-fold bootstrap)

LDA:
   Average bias: 0.241
   Average variance: 0.010
   Average expected loss: 0.238  "Goodness": 0.762


Logistic Regression:


KeyboardInterrupt: 

In [None]:
# Pick the baseline with the highest overall MCC from the comparison table
best_idx = baseline_comparison['MCC'].idxmax()
best_baseline_name = baseline_comparison.loc[best_idx, 'Algorithm']
best_baseline_mcc = baseline_comparison['MCC'].max()

print("\n" + "="*70)
print("BEST BASELINE SELECTION")
print("="*70)
print(f"\nBest Baseline: {best_baseline_name}")
print(f"MCC Score: {best_baseline_mcc:.4f}")
for justification_line in (
    "\nJustification: MCC is selected as the primary metric because it is",
    "more informative for imbalanced datasets (Chicco & Jurman, 2020).",
    "U2R class has only 0.09% samples, making accuracy misleading.",
):
    print(justification_line)

---
## 5. OPTIMISATION STRATEGY 1: Hyperparameter Tuning

### Hyperparameter Justification Table

| Parameter | Values Tested | Justification | Reference |
|-----------|---------------|---------------|-----------|
| solver | svd, lsqr, eigen | SVD is stable for most cases; lsqr/eigen allow shrinkage | Hastie et al. (2009) |
| shrinkage | None, auto, 0.1, 0.5, 0.9 | Regularization to prevent overfitting on high-dim data | Ledoit & Wolf (2004) |
| tol | 1e-4 (default, not tuned) | Singular-value threshold used for rank estimation by the SVD solver | scikit-learn defaults |

**References:**
- Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical Learning.
- Ledoit, O., & Wolf, M. (2004). A well-conditioned estimator for large-dimensional covariance matrices.

In [None]:
print("="*60)
print("HYPERPARAMETER TUNING: LDA")
print("="*60)

# Candidate solver/shrinkage pairs. 'svd' is tried without shrinkage only,
# because shrinkage is available with the 'lsqr' and 'eigen' solvers.
configs = (
    [{'solver': 'svd', 'shrinkage': None}]
    + [{'solver': 'lsqr', 'shrinkage': s} for s in (None, 'auto', 0.1, 0.5, 0.9)]
    + [{'solver': 'eigen', 'shrinkage': s} for s in (None, 'auto', 0.1, 0.5)]
)

print("Testing configurations with 5-fold Stratified Cross-Validation...\n")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

tuning_results = []
for config in configs:
    try:
        # Candidates are ranked by weighted F1 across the stratified folds
        # (training data only; the test split is not touched here).
        scores = cross_val_score(
            LinearDiscriminantAnalysis(**config),
            X_train,
            y_train,
            cv=skf,
            scoring='f1_weighted',
            n_jobs=-1,
        )
        mean_f1, std_f1 = scores.mean(), scores.std()
        tuning_results.append({'config': config, 'mean_score': mean_f1, 'std_score': std_f1})
        print(f"{config} -> F1: {mean_f1:.4f} (+/- {std_f1:.4f})")
    except Exception as e:
        # A failing candidate is reported and left out of the results
        print(f"{config} -> Error: {e}")

In [None]:
# The winning configuration is the one with the highest mean CV score
best_result = max(tuning_results, key=lambda result: result['mean_score'])
best_mean, best_std = best_result['mean_score'], best_result['std_score']
print(f"\nBest Configuration: {best_result['config']}")
print(f"Best CV F1 Score: {best_mean:.4f} (+/- {best_std:.4f})")

---
## 6. OPTIMISATION STRATEGY 2: Feature Selection via Correlation Analysis

Feature selection reduces dimensionality and can improve model performance by removing noisy features.

**Method:** Correlation-based feature filtering
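- Calculate the absolute Pearson correlation between each feature and the label-encoded target (the integer codes of the classes carry no natural order, so this is only a rough relevance filter)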
- Select features with correlation above threshold

**Reference:** Chandrashekar, G., & Sahin, F. (2014). A survey on feature selection methods.

In [None]:
# Turn the class names into integer codes so a correlation can be computed
y_encoded = LabelEncoder().fit_transform(y_train)

# Training features plus the encoded target as an extra 'target' column
corr_df = X_train.assign(target=y_encoded)

# Absolute correlation of every feature with the target, strongest first
target_corr = corr_df.corr()['target']
correlations = target_corr.drop('target').abs().sort_values(ascending=False)
print("Top 20 features correlated with target:")
print(correlations.head(20))

In [None]:
# Visualize top correlations (for appendix)
plt.figure(figsize=(12, 8))
top_features = correlations.head(25)
sns.barplot(x=top_features.values, y=top_features.index, palette='viridis')
plt.title('Top 25 Features by Correlation with Target')
plt.xlabel('Absolute Correlation')
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/linear_feature_correlation.png', dpi=150)
plt.show()

In [None]:
# Keep the features whose absolute correlation exceeds the threshold
threshold = 0.1
selected_features = correlations[correlations > threshold].index.tolist()

n_original = X_train.shape[1]
n_selected = len(selected_features)
reduction_pct = (n_original - n_selected) / n_original * 100

print(f"\nFeature Selection Results:")
print(f"  - Original features: {n_original}")
print(f"  - Selected features: {n_selected}")
print(f"  - Reduction: {reduction_pct:.1f}%")
print(f"  - Threshold: {threshold}")

In [None]:
# Restrict both splits to the selected feature columns
X_train_reduced = X_train.loc[:, selected_features]
X_test_reduced = X_test.loc[:, selected_features]
print(f"Reduced feature set: {X_train_reduced.shape[1]} features")

---
## 7. OPTIMISED MODEL

In [None]:
# Create optimised model with best parameters and reduced features
optimised_model = LinearDiscriminantAnalysis(**best_result['config'])

print("="*60)
print("OPTIMISED MODEL EVALUATION")
print("="*60)
print(f"Parameters: {best_result['config']}")
print(f"Features: {len(selected_features)} (reduced from {X_train.shape[1]})")

# Time the fit only, so the reported "Training Time" excludes inference
trs = time()
optimised_model.fit(X_train_reduced, y_train)
opt_train_time = time() - trs

# Predict on the held-out test split (not timed)
y_pred_optimised = optimised_model.predict(X_test_reduced)

print(f"\nTraining Time: {opt_train_time:.2f} seconds\n")
# Trailing ';' stops the notebook echoing show_metrics' return value
show_metrics(y_test, y_pred_optimised, class_labels);

In [None]:
# Bias-Variance Decomposition for optimised model
# A fresh, unfitted LDA built from the selected configuration is passed in,
# together with the reduced feature sets used by the optimised model.
print("\nBias-Variance Decomposition (Optimised):")
bias_var_metrics(X_train_reduced, X_test_reduced, y_train, y_test, 
                 LinearDiscriminantAnalysis(**best_result['config']), folds=5)

In [None]:
# Summary metrics of the optimised model on the test split
optimised_metrics = dict(
    accuracy=accuracy_score(y_test, y_pred_optimised),
    f1_weighted=f1_score(y_test, y_pred_optimised, average='weighted'),
    f1_macro=f1_score(y_test, y_pred_optimised, average='macro'),
    mcc=matthews_corrcoef(y_test, y_pred_optimised),
    train_time=opt_train_time,
)
print("Optimised Metrics:", optimised_metrics)

# One-vs-rest MCC of the optimised model for every class
opt_mcc_class = calculate_mcc_per_class(y_test, y_pred_optimised, class_labels)
print("\nMCC per class (Optimised):")
for cls in opt_mcc_class:
    print(f"  {cls}: {opt_mcc_class[cls]:.4f}")

---
## 8. COMPARISON: Baseline vs Optimised Model

In [None]:
# The LDA baseline is the reference here because the optimised model is an LDA.
# Note that for the 'Train Time (s)' row a negative difference means faster.
metric_keys = ['accuracy', 'f1_weighted', 'f1_macro', 'mcc', 'train_time']
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 (Weighted)', 'F1 (Macro)', 'MCC', 'Train Time (s)'],
    'Baseline': [lda_metrics[key] for key in metric_keys],
    'Optimised': [optimised_metrics[key] for key in metric_keys],
})
delta = comparison_df['Optimised'] - comparison_df['Baseline']
comparison_df['Improvement'] = delta
comparison_df['Improvement %'] = (comparison_df['Improvement'] / comparison_df['Baseline'] * 100).round(2)

print("\n" + "="*60)
print("PERFORMANCE COMPARISON: BASELINE vs OPTIMISED")
print("="*60)
print(comparison_df.to_string(index=False))

In [None]:
# Per-class MCC of the LDA baseline next to the optimised model
baseline_mcc_values = [lda_mcc_class[label] for label in class_labels]
optimised_mcc_values = [opt_mcc_class[label] for label in class_labels]
mcc_comparison_df = pd.DataFrame({
    'Attack Class': class_labels,
    'Baseline': baseline_mcc_values,
    'Optimised': optimised_mcc_values,
})
mcc_comparison_df['Improvement'] = mcc_comparison_df['Optimised'] - mcc_comparison_df['Baseline']

print("\n" + "="*60)
print("MCC PER CLASS: BASELINE vs OPTIMISED")
print("="*60)
print(mcc_comparison_df.to_string(index=False))

In [None]:
# Visualize comparison (for appendix)
fig, ax = plt.subplots(figsize=(10, 6))

# Training time is on a different scale than the scores, so it is left out
metrics_to_plot = comparison_df[comparison_df['Metric'] != 'Train Time (s)']
x = np.arange(len(metrics_to_plot))
width = 0.35

bars1 = ax.bar(x - width/2, metrics_to_plot['Baseline'], width, label='Baseline', color='steelblue')
bars2 = ax.bar(x + width/2, metrics_to_plot['Optimised'], width, label='Optimised', color='darkorange')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Linear Discriminant Analysis: Baseline vs Optimised')
ax.set_xticks(x)
ax.set_xticklabels(metrics_to_plot['Metric'])
ax.legend()
ax.set_ylim(0, 1.1)

# Add value labels above every bar of both series
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width()/2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/linear_baseline_vs_optimised.png', dpi=150)
plt.show()

In [None]:
# Confusion Matrix Comparison (for appendix)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Baseline confusion matrix (rows/columns follow the class_labels order)
cm_baseline = confusion_matrix(y_test, y_pred_lda, labels=class_labels)
disp1 = ConfusionMatrixDisplay(confusion_matrix=cm_baseline, display_labels=class_labels)
disp1.plot(ax=axes[0], cmap='Blues', values_format='d')
axes[0].set_title('Baseline Model')

# Optimised confusion matrix
cm_optimised = confusion_matrix(y_test, y_pred_optimised, labels=class_labels)
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_optimised, display_labels=class_labels)
disp2.plot(ax=axes[1], cmap='Oranges', values_format='d')
axes[1].set_title('Optimised Model')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/linear_confusion_matrices.png', dpi=150)
plt.show()

In [None]:
# Confusion matrices as labelled tables (for main report)
cm_baseline_df = pd.DataFrame(cm_baseline, index=class_labels, columns=class_labels)
cm_optimised_df = pd.DataFrame(cm_optimised, index=class_labels, columns=class_labels)

for heading, table in (
    ("\nConfusion Matrix - Baseline (Table format):", cm_baseline_df),
    ("\nConfusion Matrix - Optimised (Table format):", cm_optimised_df),
):
    print(heading)
    print(table)

---
## 9. Summary and Conclusions

In [None]:
print("="*70)
print("SUMMARY: LINEAR CLASSIFIER FOR INTRUSION DETECTION")
print("="*70)

# Report the best baseline from the comparison table instead of a fixed
# string, so the summary cannot contradict the measured MCC ranking.
best_by_mcc = baseline_comparison.loc[baseline_comparison['MCC'].idxmax(), 'Algorithm']

print("\n1. CLASSIFIER CATEGORY: Linear")
print("   Algorithms Evaluated: LDA, Logistic Regression, Ridge Classifier")
print(f"   Best Baseline (highest MCC): {best_by_mcc}")
print("   Algorithm Optimised: Linear Discriminant Analysis (LDA)")

print("\n2. CLASSIFICATION TYPE: Multi-class (5 categories)")
print("   Classes: benign, dos, probe, r2l, u2r")

print("\n3. BASELINE COMPARISON:")
print(baseline_comparison.to_string(index=False))

n_original = X_train.shape[1]
n_selected = len(selected_features)

print("\n4. OPTIMISATION STRATEGIES APPLIED:")
print("   a) Hyperparameter Tuning with 5-fold Cross-Validation")
print(f"      - Best solver: {best_result['config']['solver']}")
print(f"      - Best shrinkage: {best_result['config'].get('shrinkage', 'None')}")
print("   b) Feature Selection via Correlation Analysis")
print(f"      - Original features: {n_original}")
print(f"      - Selected features: {n_selected}")
print(f"      - Feature reduction: {((n_original - n_selected) / n_original * 100):.1f}%")

# Training time is skipped: a relative "improvement" is not meaningful for it here
print("\n5. PERFORMANCE IMPROVEMENT:")
for _, row in comparison_df.iterrows():
    if row['Metric'] != 'Train Time (s)':
        print(f"   {row['Metric']}: {row['Baseline']:.4f} -> {row['Optimised']:.4f} ({row['Improvement %']:+.2f}%)")

print("\n" + "="*70)

In [None]:
# Save results for group comparison
n_features_original = X_train.shape[1]
n_features_selected = len(selected_features)

results_dict = {
    'classifier': 'Linear Discriminant Analysis',
    'category': 'Linear',
    'classification_type': 'multi-class',
    'classes': class_labels,
    'baseline_comparison': baseline_comparison.to_dict('records'),
    'baseline_metrics': lda_metrics,
    'optimised_metrics': optimised_metrics,
    'baseline_mcc_per_class': lda_mcc_class,
    'optimised_mcc_per_class': opt_mcc_class,
    'optimisation_strategies': ['Hyperparameter Tuning', 'Feature Selection (Correlation)'],
    'best_params': best_result['config'],
    'n_features_original': n_features_original,
    'n_features_selected': n_features_selected,
    'feature_reduction_pct': round((n_features_original - n_features_selected) / n_features_original * 100, 1)
}

# Save to file. open() does not create missing directories, so make sure the
# output folder exists; default=str stringifies anything json cannot encode.
results_path = '../results/linear_lda_results.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w') as f:
    json.dump(results_dict, f, indent=2, default=str)
# Report the path that was actually written
print(f"Results saved to: {results_path}")

In [None]:
# Save comparison tables as CSV for report.
# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs('../results', exist_ok=True)
baseline_comparison.to_csv('../results/linear_baseline_comparison.csv', index=False)
comparison_df.to_csv('../results/linear_baseline_vs_optimised.csv', index=False)
mcc_comparison_df.to_csv('../results/linear_mcc_per_class.csv', index=False)
print("CSV files saved for report tables.")