# Individual Assignment: Linear Classifier
## Network Intrusion Detection using Linear Discriminant Analysis (LDA)

**Classifier Category:** Linear  
**Algorithm:** Linear Discriminant Analysis  
**Dataset:** NSL-KDD (Boosted Train + Preprocessed Test)

---
## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import warnings
# Silence every Python warning for the rest of the session so cell output stays
# compact. NOTE(review): this also hides deprecation warnings from the libraries
# used below.
warnings.filterwarnings('ignore')

import os
# Directory that holds the CSV files; it is joined with the file names via
# os.path.join when the datasets are loaded.
data_path = 'data'



In [2]:
# Import helper functions
from helpers import show_labels_dist, show_metrics, bias_var_metrics

IndentationError: expected an indented block after 'if' statement on line 3 (2680926479.py, line 6)

---
## 2. Load Dataset

In [None]:
# Read the two CSV splits found under data_path -- the boosted training set
# and the preprocessed test set -- and report the shape of each once loaded.
data_file = os.path.join(data_path, 'NSL_boosted-2.csv')
train_df = pd.read_csv(data_file)
print('Train Dataset: {} rows, {} columns'.format(*train_df.shape))

data_file = os.path.join(data_path, 'NSL_ppTest.csv')
test_df = pd.read_csv(data_file)
print('Test Dataset: {} rows, {} columns'.format(*test_df.shape))

In [None]:
# Preview the first rows of the training frame (shown as the cell's output)
train_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training frame
train_df.info()

---
## 3. Data Preparation

In [None]:
# Compare the numeric (float64/int64) column names of the two splits and
# report any column that exists in one split but not in the other.
numeric_kinds = ['float64', 'int64']
trnn = train_df.select_dtypes(include=numeric_kinds).columns
tstn = test_df.select_dtypes(include=numeric_kinds).columns
trndif, tstdif = np.setdiff1d(trnn, tstn), np.setdiff1d(tstn, trnn)

for message, missing in (
    ("Numeric features in train_set not in test_set: ", trndif),
    ("Numeric features in test_set not in train_set: ", tstdif),
):
    # Print the literal 'None' when the difference is empty
    print(message, missing if len(missing) else 'None')

In [None]:
# List the object-dtype (categorical) columns of each split for a visual check
trnn, tstn = (
    frame.select_dtypes(include=['object']).columns
    for frame in (train_df, test_df)
)
print("Categorical features in train:", list(trnn))
print("Categorical features in test:", list(tstn))

In [None]:
# Count missing cells over the whole frame (per-column counts summed again)
train_missing = train_df.isna().sum().sum()
test_missing = test_df.isna().sum().sum()
print('Missing Values - Train Set:', train_missing)
print('Missing Values - Test Set:', test_missing)

In [None]:
# Stack the training rows on top of the test rows so both splits go through
# the same preprocessing; the original row labels of each frame are kept.
frames = [train_df, test_df]
combined_df = pd.concat(frames)
print('Combined Dataset: {} rows, {} columns'.format(*combined_df.shape))

In [None]:
# Tally how often each raw label and each attack category occurs overall
label_counts = combined_df['label'].value_counts()
category_counts = combined_df['atakcat'].value_counts()

print("Label distribution in combined dataset:")
print(label_counts)
print("\nAttack category distribution:")
print(category_counts)

In [None]:
# Choose the classification target.
#   twoclass=True  -> binary target: 'normal' stays, every other label becomes 'attack'
#   twoclass=False -> multi-class target taken from 'atakcat' (renamed to 'label')
twoclass = True

if twoclass:
    raw_labels = combined_df['label']
    # where() keeps values for which the condition holds and substitutes the rest
    labels_df = raw_labels.where(raw_labels == 'normal', 'attack')
else:
    labels_df = combined_df['atakcat'].rename('label')

# Remove both target columns from the feature frame in a single call
combined_df.drop(columns=['label', 'atakcat'], inplace=True)

In [None]:
# Collect the object-dtype columns that will be one-hot encoded
categori = combined_df.select_dtypes(include=['object']).columns
category_cols = list(categori)
print("Categorical columns to encode:", category_cols)

In [None]:
# Expand each categorical column into indicator columns; the remaining
# columns pass through unchanged.
features_df = pd.get_dummies(data=combined_df, columns=category_cols)
print('Features after encoding: {} columns'.format(len(features_df.columns)))

In [None]:
# Names of the float64/int64 columns of the pre-encoding frame; these are the
# columns that get rescaled later.
numeric_kinds = ['float64', 'int64']
numeri = combined_df.select_dtypes(include=numeric_kinds).columns
print("Numeric columns for scaling:", list(numeri))

In [None]:
# Undo the concatenation: the first len(train_df) rows are the training split,
# the remainder is the test split. Every piece gets a fresh 0..n-1 index.
n_train = len(train_df)

X_train = features_df.iloc[:n_train].reset_index(drop=True)
X_test = features_df.iloc[n_train:].reset_index(drop=True)

y_train = labels_df.iloc[:n_train].reset_index(drop=True)
y_test = labels_df.iloc[n_train:].reset_index(drop=True)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

In [None]:
# Rescale every numeric column with a MinMaxScaler that is fitted on the
# training values only and then applied to both splits.
from sklearn.preprocessing import MinMaxScaler

for column in numeri:
    # scikit-learn expects a 2-D array, hence the single-column reshape
    train_values = X_train[column].to_numpy().reshape(-1, 1)
    column_scaler = MinMaxScaler().fit(train_values)
    X_train[column] = column_scaler.transform(train_values)

    test_values = X_test[column].to_numpy().reshape(-1, 1)
    X_test[column] = column_scaler.transform(test_values)

print("Scaling completed using MinMaxScaler (0-1 range)")

In [None]:
# Keep independent copies of the scaled splits before any feature reduction
X_train_original, X_test_original, y_train_original = (
    X_train.copy(),
    X_test.copy(),
    y_train.copy(),
)

---
## 4. BASELINE MODEL: Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Baseline: LDA constructed with no arguments, i.e. library defaults
baseline_model = LinearDiscriminantAnalysis()
default_params = baseline_model.get_params()

print("Baseline Model:", baseline_model)
print("\nDefault Parameters:", default_params)

In [None]:
# Show label distribution
# Project helper imported from helpers; it receives all four train/test pieces.
# NOTE(review): presumably reports class counts per split -- confirm against
# helpers.py, which is not visible here.
show_labels_dist(X_train, X_test, y_train, y_test)

In [None]:
# Fit the baseline LDA on the full feature set and score it on the test split
banner = "=" * 60
print(banner)
print("BASELINE MODEL EVALUATION")
print(banner)

# NOTE: the timed span covers fit() and predict() together, so the figure
# printed as "Training Time" includes inference on the test set.
trs = time()
y_pred_baseline = baseline_model.fit(X_train, y_train).predict(X_test)
tre = time() - trs

print(f"Training Time: {tre:.2f} seconds\n")
show_metrics(y_test, y_pred_baseline, baseline_model.classes_)

In [None]:
# Bias-Variance Decomposition for baseline
# A fresh, unfitted LDA is handed to the project helper rather than the
# already-fitted baseline_model.
# NOTE(review): `folds=20` presumably sets the number of resampling rounds --
# confirm against helpers.bias_var_metrics.
print("\nBias-Variance Decomposition (Baseline):")
bias_var_metrics(X_train, X_test, y_train, y_test, LinearDiscriminantAnalysis(), folds=20)

In [None]:
# Collect the baseline test-set scores in a dict for the later comparison
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Precision, recall and F1 are computed with 'attack' as the positive class
attack_scorers = (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)

baseline_metrics = {'accuracy': accuracy_score(y_test, y_pred_baseline)}
for metric_name, scorer in attack_scorers:
    baseline_metrics[metric_name] = scorer(y_test, y_pred_baseline, pos_label='attack')
baseline_metrics['mcc'] = matthews_corrcoef(y_test, y_pred_baseline)

print("Baseline Metrics Stored:", baseline_metrics)

---
## 5. OPTIMISATION STRATEGY 1: Hyperparameter Tuning with Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define parameter grid for LDA
# This dict documents the search space only: it is not passed to GridSearchCV
# anywhere in the visible notebook, because the full cross-product would pair
# solver='svd' with shrinkage values that solver does not accept.
param_grid = {
    'solver': ['svd', 'lsqr', 'eigen'],
    'shrinkage': [None, 'auto', 0.1, 0.5, 0.9],  # Only for lsqr and eigen
}

# Note: shrinkage only works with 'lsqr' or 'eigen' solvers
# The valid combinations are therefore enumerated by hand in the next cell.

print("Hyperparameter Grid Search for LDA")
print("Parameters to tune: solver, shrinkage")

In [None]:
# Cross-validated comparison of every valid LDA solver/shrinkage pairing
from sklearn.model_selection import cross_val_score

# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers, so 'svd' is
# evaluated once without it. The other two solvers sweep the same shrinkage
# values as param_grid, which also covers eigen/0.9 that the hand-written list
# had left out.
shrinkage_values = [None, 'auto', 0.1, 0.5, 0.9]
configs = [{'solver': 'svd', 'shrinkage': None}]
configs += [
    {'solver': solver, 'shrinkage': shrinkage}
    for solver in ('lsqr', 'eigen')
    for shrinkage in shrinkage_values
]

# One entry per configuration that completed: config, mean and std of the
# fold scores.
results = []

print("Testing configurations with 5-fold Cross-Validation...\n")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for config in configs:
    try:
        model = LinearDiscriminantAnalysis(**config)
        # error_score='raise' turns a failing fold into an exception handled
        # below. With the default, the fold would silently score NaN and the
        # NaN mean would be stored in `results`, corrupting any later
        # max()-based selection.
        scores = cross_val_score(
            model, X_train, y_train,
            cv=skf, scoring='f1_weighted', n_jobs=-1, error_score='raise',
        )
        results.append({
            'config': config,
            'mean_score': scores.mean(),
            'std_score': scores.std()
        })
        print(f"{config} -> F1: {scores.mean():.4f} (+/- {scores.std():.4f})")
    except Exception as e:
        # Deliberate best-effort: a configuration that cannot be fitted is
        # reported and skipped so the remaining ones are still evaluated.
        print(f"{config} -> Error: {e}")

In [None]:
# Find best configuration
# Ignore entries whose mean score is NaN: NaN never compares greater or smaller,
# so leaving them in would make the result of max() depend on list order.
scored_results = [r for r in results if not np.isnan(r['mean_score'])]
if not scored_results:
    # Fail with an explicit message instead of max()'s bare ValueError
    raise RuntimeError("No LDA configuration produced a valid cross-validation score")

best_result = max(scored_results, key=lambda x: x['mean_score'])
print(f"\nBest Configuration: {best_result['config']}")
print(f"Best CV F1 Score: {best_result['mean_score']:.4f}")

---
## 6. OPTIMISATION STRATEGY 2: Feature Selection via Correlation Analysis

In [None]:
# Rank every training feature by the absolute value of its correlation with
# the integer-encoded target.
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integers so they can enter the correlation matrix
y_encoded = LabelEncoder().fit_transform(y_train)

# Copy of the training features with the encoded target appended as 'target'
corr_df = X_train.assign(target=y_encoded)

# Take the 'target' column of the correlation matrix, drop its self-correlation
# entry, and sort by strength regardless of sign.
target_column = corr_df.corr()['target']
correlations = target_column.drop('target').abs().sort_values(ascending=False)

print("Top 20 features correlated with target:")
print(correlations.head(20))

In [None]:
# Horizontal bar chart of the 25 strongest feature/target correlations
top_features = correlations.head(25)

plt.figure(figsize=(12, 8))
sns.barplot(x=top_features.to_numpy(), y=top_features.index, palette='viridis')
plt.title('Top 25 Features by Correlation with Target')
plt.xlabel('Absolute Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Keep the features whose absolute correlation is strictly above the threshold
threshold = 0.1
keep_mask = (correlations > threshold).to_numpy()
selected_features = correlations.index[keep_mask].tolist()
print(f"\nSelected {len(selected_features)} features with correlation > {threshold}")
print(selected_features[:10], "...")

In [None]:
# Restrict both splits to the selected feature columns (same order in each)
X_train_reduced = X_train.loc[:, selected_features]
X_test_reduced = X_test.loc[:, selected_features]
print(f"Reduced feature set: {X_train_reduced.shape[1]} features")

---
## 7. OPTIMISED MODEL

In [None]:
# Build the LDA with the best cross-validated configuration, fit it on the
# reduced feature set and score it on the reduced test split.
optimised_model = LinearDiscriminantAnalysis(**best_result['config'])

banner = "=" * 60
print(banner)
print("OPTIMISED MODEL EVALUATION")
print(banner)
print(f"Parameters: {best_result['config']}")
print(f"Features: {len(selected_features)} (reduced from {X_train.shape[1]})")

# NOTE: the timed span covers fit() and predict() together, so the figure
# printed as "Training Time" includes inference on the test set.
trs = time()
y_pred_optimised = optimised_model.fit(X_train_reduced, y_train).predict(X_test_reduced)
tre = time() - trs

print(f"\nTraining Time: {tre:.2f} seconds\n")
show_metrics(y_test, y_pred_optimised, optimised_model.classes_)

In [None]:
# Bias-Variance Decomposition for optimised model
# A fresh, unfitted LDA with the selected configuration is handed to the
# project helper together with the reduced feature splits.
# NOTE(review): `folds=20` presumably sets the number of resampling rounds --
# confirm against helpers.bias_var_metrics.
print("\nBias-Variance Decomposition (Optimised):")
bias_var_metrics(X_train_reduced, X_test_reduced, y_train, y_test, 
                 LinearDiscriminantAnalysis(**best_result['config']), folds=20)

In [None]:
# Collect the optimised model's test-set scores, mirroring the baseline dict.
# Precision, recall and F1 are computed with 'attack' as the positive class.
attack_scorers = (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)

optimised_metrics = {'accuracy': accuracy_score(y_test, y_pred_optimised)}
for metric_name, scorer in attack_scorers:
    optimised_metrics[metric_name] = scorer(y_test, y_pred_optimised, pos_label='attack')
optimised_metrics['mcc'] = matthews_corrcoef(y_test, y_pred_optimised)

print("Optimised Metrics:", optimised_metrics)

---
## 8. COMPARISON: Baseline vs Optimised Model

In [None]:
# Side-by-side table of baseline and optimised scores with absolute and
# relative (percent of baseline) differences.
metric_keys = ['accuracy', 'precision', 'recall', 'f1', 'mcc']

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'MCC'],
    'Baseline': [baseline_metrics[key] for key in metric_keys],
    'Optimised': [optimised_metrics[key] for key in metric_keys],
})

absolute_gain = comparison_df['Optimised'] - comparison_df['Baseline']
comparison_df['Improvement'] = absolute_gain
comparison_df['Improvement %'] = (absolute_gain / comparison_df['Baseline'] * 100).round(2)

banner = "=" * 60
print("\n" + banner)
print("PERFORMANCE COMPARISON: BASELINE vs OPTIMISED")
print(banner)
print(comparison_df.to_string(index=False))

In [None]:
# Grouped bar chart: one pair of bars (baseline, optimised) per metric
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison_df['Metric']))
width = 0.35
half = width / 2

bars1 = ax.bar(x - half, comparison_df['Baseline'], width, label='Baseline', color='steelblue')
bars2 = ax.bar(x + half, comparison_df['Optimised'], width, label='Optimised', color='darkorange')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Linear Discriminant Analysis: Baseline vs Optimised')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Metric'])
ax.legend()
ax.set_ylim(0, 1.1)

# Write each bar's value just above its top edge (baseline bars first)
for bar in list(bars1) + list(bars2):
    height = bar.get_height()
    centre = bar.get_x() + bar.get_width()/2
    ax.annotate(f'{height:.3f}', xy=(centre, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices of both models drawn next to each other
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Rows/columns follow each model's own classes_ ordering
cm_baseline = confusion_matrix(y_test, y_pred_baseline, labels=baseline_model.classes_)
cm_optimised = confusion_matrix(y_test, y_pred_optimised, labels=optimised_model.classes_)

panels = (
    (axes[0], cm_baseline, baseline_model.classes_, 'Blues', 'Baseline Model'),
    (axes[1], cm_optimised, optimised_model.classes_, 'Oranges', 'Optimised Model'),
)
for panel_ax, matrix, class_names, colour_map, panel_title in panels:
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names)
    display.plot(ax=panel_ax, cmap=colour_map)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# ROC Curve Comparison
from sklearn.metrics import roc_curve, auc

# 'attack' is the positive class for the ROC analysis. predict_proba returns
# one column per entry of model.classes_, in that order, and classes_ is
# sorted -- so a hard-coded column 1 would be P('normal') and would invert
# both curves. Look the column up by label instead.
positive_label = 'attack'
base_col = list(baseline_model.classes_).index(positive_label)
opt_col = list(optimised_model.classes_).index(positive_label)

# Probability of the positive class for every test row
y_prob_baseline = baseline_model.predict_proba(X_test)[:, base_col]
y_prob_optimised = optimised_model.predict_proba(X_test_reduced)[:, opt_col]

# Convert labels to binary (1 = attack, 0 = anything else)
y_test_binary = (y_test == positive_label).astype(int)

# Calculate ROC curves
fpr_base, tpr_base, _ = roc_curve(y_test_binary, y_prob_baseline)
fpr_opt, tpr_opt, _ = roc_curve(y_test_binary, y_prob_optimised)

auc_base = auc(fpr_base, tpr_base)
auc_opt = auc(fpr_opt, tpr_opt)

plt.figure(figsize=(8, 6))
plt.plot(fpr_base, tpr_base, 'b-', label=f'Baseline (AUC = {auc_base:.4f})')
plt.plot(fpr_opt, tpr_opt, 'r-', label=f'Optimised (AUC = {auc_opt:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison - Linear Discriminant Analysis')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

---
## 9. Summary and Conclusions

In [None]:
# Text summary of the whole experiment: chosen hyperparameters, feature
# reduction, per-metric change and ROC-AUC before/after.
rule = "=" * 70
print(rule)
print("SUMMARY: LINEAR DISCRIMINANT ANALYSIS FOR INTRUSION DETECTION")
print(rule)
print("\n1. CLASSIFIER CATEGORY: Linear")
print("   Algorithm: Linear Discriminant Analysis (LDA)")
print("\n2. OPTIMISATION STRATEGIES APPLIED:")
print("   a) Hyperparameter Tuning with Cross-Validation")
print(f"      - Best solver: {best_result['config']['solver']}")
print(f"      - Best shrinkage: {best_result['config']['shrinkage']}")
print("   b) Feature Selection via Correlation Analysis")
print(f"      - Original features: {X_train.shape[1]}")
print(f"      - Selected features: {len(selected_features)}")
print(f"      - Feature reduction: {((X_train.shape[1] - len(selected_features)) / X_train.shape[1] * 100):.1f}%")
print("\n3. PERFORMANCE IMPROVEMENT:")
# One line per metric, read from the comparison table row by row
for row in comparison_df.to_dict('records'):
    print(f"   {row['Metric']}: {row['Baseline']:.4f} -> {row['Optimised']:.4f} ({row['Improvement %']:+.2f}%)")
print(f"\n4. ROC-AUC: {auc_base:.4f} -> {auc_opt:.4f}")
print("\n" + rule)

In [None]:
# Save results for group comparison
# Everything needed to compare this classifier with the others in the group:
# both metric dicts, both AUCs, the chosen hyperparameters and feature counts.
results_dict = {
    'classifier': 'Linear Discriminant Analysis',
    'category': 'Linear',
    'baseline_metrics': baseline_metrics,
    'optimised_metrics': optimised_metrics,
    'baseline_auc': auc_base,
    'optimised_auc': auc_opt,
    'optimisation_strategies': ['Hyperparameter Tuning', 'Feature Selection (Correlation)'],
    'best_params': best_result['config'],
    'n_features_original': X_train.shape[1],
    'n_features_selected': len(selected_features)
}

# Save to file
import json
import os

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists first; exist_ok keeps re-runs from failing.
os.makedirs('results', exist_ok=True)
with open('results/linear_lda_results.json', 'w') as f:
    json.dump(results_dict, f, indent=2)
print("Results saved to: results/linear_lda_results.json")