<a href="https://colab.research.google.com/github/prksh830/Healthcare/blob/main/oos_gnn(0203).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Read the wsn-ds.csv dataset into a DataFrame
data = pd.read_csv('/mnt/data/wsn-ds.csv')

# Report the per-class row counts of the raw data (prior to any resampling)
print(
    "Class distribution before SMOTE:",
    data['Attack_type'].value_counts(),
    sep="\n",
)

# Visualise the same counts as a bar chart
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='Attack_type')
plt.title('Class Distribution Before SMOTE')
plt.show()

# Prepare features and target
features = data.drop('Attack_type', axis=1)
y = data['Attack_type']

# Split FIRST, before anything is fitted. The imputer, the scaler and SMOTE all
# learn statistics from the rows they see; fitting them on the full dataset
# (as was done previously) leaks information about the test rows into training
# and inflates every test metric. `stratify=y` keeps the class proportions of
# this imbalanced dataset identical in both portions.
features_train, features_test, y_train_orig, y_test_orig = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values: column means are learned from the training rows only
# and then applied unchanged to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train_orig = imputer.fit_transform(features_train)
X_test_orig = imputer.transform(features_test)

# Feature scaling: same rule, fit on train, apply to test.
scaler = StandardScaler()
X_train_orig = scaler.fit_transform(X_train_orig)
X_test_orig = scaler.transform(X_test_orig)

# Keep `X` available as the imputed + scaled full feature matrix for any code
# that still refers to it; it is produced with the train-fitted transformers,
# so no test-set statistics are involved.
X = scaler.transform(imputer.transform(features))

# Confusion matrices before SMOTE using RandomForest
rf_before = RandomForestClassifier(n_estimators=100, random_state=42)
rf_before.fit(X_train_orig, y_train_orig)
y_train_pred_orig = rf_before.predict(X_train_orig)
y_test_pred_orig = rf_before.predict(X_test_orig)

print("\nConfusion Matrix - Training (Before SMOTE):")
print(confusion_matrix(y_train_orig, y_train_pred_orig))

print("\nConfusion Matrix - Testing (Before SMOTE):")
print(confusion_matrix(y_test_orig, y_test_pred_orig))

# Apply SMOTE to the TRAINING portion only. Oversampling the whole dataset and
# splitting afterwards puts synthetic points, interpolated from training
# neighbours, into the test set, so the test score no longer measures
# performance on real, unseen traffic.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_orig, y_train_orig)

# Show class distribution after SMOTE (this is the balanced training set)
print("\nClass distribution after SMOTE:")
print(Counter(y_resampled))

# Bar chart after SMOTE
plt.figure(figsize=(8, 6))
sns.countplot(x=y_resampled)
plt.title('Class Distribution After SMOTE')
plt.show()

# Final train/test sets used by every model below:
#   - training data = SMOTE-balanced training portion
#   - test data     = untouched, real (still imbalanced) held-out portion
X_train, y_train = X_resampled, y_resampled
X_test, y_test = X_test_orig, y_test_orig

# Base learners that are combined into the OOS-GNN stand-in ensemble
clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf3 = KNeighborsClassifier(n_neighbors=5)

# Announce which learners take part in the ensemble
print("\nIndividual classifiers used in the OOS-GNN ensemble:")
print(
    "- Random Forest",
    "- Gradient Boosting",
    "- K-Nearest Neighbors",
    sep="\n",
)

# Soft-voting ensemble (OOS-GNN mock): averages the members' class probabilities
oos_gnn = VotingClassifier(
    estimators=[('rf', clf1), ('gb', clf2), ('knn', clf3)],
    voting='soft',
    n_jobs=-1,
)

# Train the ensemble and record the wall-clock fitting time
start_time = time.time()
oos_gnn.fit(X_train, y_train)
train_time = time.time() - start_time

# Evaluate on the data the ensemble was trained on
y_train_pred = oos_gnn.predict(X_train)
train_cm = confusion_matrix(y_train, y_train_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Evaluate on the held-out data; only the predict call is timed
start_time = time.time()
y_test_pred = oos_gnn.predict(X_test)
test_time = time.time() - start_time
test_cm = confusion_matrix(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Summary: accuracies and timings first, then both confusion matrices
print(f"\nOOS-GNN Training Accuracy: {train_accuracy:.4f}")
print(f"OOS-GNN Testing Accuracy: {test_accuracy:.4f}")
print(f"Training Time: {train_time:.2f}s, Testing Time: {test_time:.2f}s")

print("\nConfusion Matrix - Training (OOS-GNN):", train_cm, sep="\n")

print("\nConfusion Matrix - Testing (OOS-GNN):", test_cm, sep="\n")

# Comparison with other methods
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(eval_metric='mlogloss')
}

# Encode the class labels as consecutive integers 0..n_classes-1 (sorted label
# order). Current XGBoost releases reject any other label encoding (e.g. string
# class names), and using one encoding for every model keeps the columns of
# predict_proba aligned with the one-hot matrix built below.
# NOTE: the models in `models` therefore predict integer codes; use
# `class_labels[code]` to map a prediction back to the original label.
class_labels = np.unique(y_train)
y_train_enc = pd.Categorical(y_train, categories=class_labels).codes.astype(int)
y_test_enc = pd.Categorical(y_test, categories=class_labels).codes.astype(int)
if (y_test_enc < 0).any():
    # pd.Categorical marks values outside `categories` with code -1
    raise ValueError("y_test contains class labels that never occur in y_train")

# One-hot form of the test labels, one column per class in `class_labels`
# order. Built from the codes so a column exists for every training class even
# if that class happens to be absent from the test set.
y_test_onehot = np.eye(len(class_labels), dtype=int)[y_test_enc]

accuracies = {}
times = {}
rocs = {}
probas = {}  # cached test-set probabilities, reused by the ROC plot below

for name, model in models.items():
    start_time = time.time()
    model.fit(X_train, y_train_enc)
    elapsed_time = time.time() - start_time
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test_enc, y_pred)
    y_proba = model.predict_proba(X_test)
    # Macro average of the per-class (one-vs-rest) AUCs
    roc = roc_auc_score(y_test_onehot, y_proba, average='macro')

    accuracies[name] = acc
    times[name] = elapsed_time
    rocs[name] = roc
    probas[name] = y_proba
    print(f"{name} Accuracy: {acc:.4f}, Time: {elapsed_time:.2f}s, ROC AUC: {roc:.4f}")
    print(f"Confusion Matrix - {name}:\n{confusion_matrix(y_test_enc, y_pred)}\n")

# ROC Comparison Plot
# Each curve is the micro-average: all (sample, class) pairs are pooled into a
# single binary problem. The legend reports the area under THAT curve, so the
# number matches the line drawn (the macro AUC printed above is a different
# average and generally differs).
plt.figure(figsize=(10, 6))
for name, y_proba in probas.items():
    fpr, tpr, _ = roc_curve(y_test_onehot.ravel(), y_proba.ravel())
    micro_auc = roc_auc_score(y_test_onehot, y_proba, average='micro')
    plt.plot(fpr, tpr, label=f"{name} (micro-avg AUC = {micro_auc:.2f})")
plt.title('ROC Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Compare the test accuracy of every benchmark model in a single bar chart
model_names = list(accuracies)
model_scores = [accuracies[model_name] for model_name in model_names]

plt.figure(figsize=(8, 6))
sns.barplot(x=model_names, y=model_scores)
plt.title('Classifier Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.show()
