<a href="https://colab.research.google.com/github/sheelapravalika/AI_CS_mokshasaireddy005/blob/main/Untitled10_fixed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sheelapravalika/AI_CS_mokshasaireddy005/blob/main/Untitled10.ipynb)

In [None]:
!pip install -U datasets huggingface_hub fsspec pandas scikit-learn matplotlib seaborn numpy sentence-transformers xgboost

from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from scipy.stats import mode
import warnings
warnings.filterwarnings('ignore')

Collecting fsspec
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)


In [None]:
#✅ STEP 2: Load and Prepare Data

# Load dataset ("binary" configuration) and materialise the train split as a DataFrame
dataset = load_dataset("sonnh-tech1/cic-ids-2017", "binary")
df = dataset["train"].to_pandas()

# Encode label as integers; `le` is kept so class names can be recovered later
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

# Numeric features only (the encoded label is removed if it was selected)
X = df.select_dtypes(include=['int64', 'float64']).drop(columns=['Label'], errors='ignore')
y = df['Label']

# Decide the train/test rows BEFORE any statistics are computed.
# Splitting row positions with the same test_size / stratify / random_state
# that STEP 4 uses keeps both splits on the same rows.
row_positions = np.arange(len(X))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, stratify=y, random_state=42
)

# Normalize: fit on training rows only so test-set statistics do not leak
# into the features the models are trained on, then transform every row.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Add Gaussian noise from a seeded generator so a re-run reproduces the same data
noise_rng = np.random.default_rng(42)
X_scaled += noise_rng.normal(0, 0.01, X_scaled.shape)

# Train/test split (row selection computed above)
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Cheap invariants: every row lands in exactly one partition
assert len(X_train) + len(X_test) == len(X_scaled)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/92.8M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/108M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2522362 [00:00<?, ? examples/s]

In [None]:
#✅ STEP 3: Model Definitions (Weakened)
# All three models are deliberately small (few/shallow trees, one tiny hidden
# layer, few iterations); they are only constructed here and fitted in STEP 5.

# Random Forest (simplified)
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_leaf=30,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# ANN (simplified)
ann = MLPClassifier(
    hidden_layer_sizes=(16,),
    activation='relu',
    max_iter=20,
    solver='adam',
    random_state=42
)

# XGBoost (simplified)
# `tree_method='gpu_hist'` is deprecated since XGBoost 2.0; the supported
# spelling is the `hist` tree method plus an explicit `device`.
# `use_label_encoder` is dropped because it is deprecated and no longer needed.
# NOTE(review): on a CPU-only runtime set device='cpu' — confirm how the
# installed XGBoost version behaves when no GPU is visible.
gb = XGBClassifier(
    n_estimators=3,
    max_depth=1,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    eval_metric='logloss',
    random_state=42
)



NameError: name 'RandomForestClassifier' is not defined

In [None]:
#✅ STEP 4: SLM (Sentence Embedding + Logistic Regression)

# Sentence Embedding model
model_slm = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# Convert X_scaled into sentence-style text and embed it chunk by chunk.
# Rendering every row to strings at once creates one Python string per cell
# plus one long string per row, which can exhaust memory on large inputs.
# Each chunk produces exactly the same text as rendering the whole matrix,
# and the embeddings are written into a preallocated array.
# NOTE(review): each row becomes a long run of float literals; the model's
# maximum sequence length probably truncates it, so only the leading features
# may influence the embedding — confirm against model_slm.max_seq_length.
SLM_CHUNK_ROWS = 50_000
n_rows = X_scaled.shape[0]
X_embed = np.empty(
    (n_rows, model_slm.get_sentence_embedding_dimension()), dtype=np.float32
)
for start in range(0, n_rows, SLM_CHUNK_ROWS):
    stop = min(start + SLM_CHUNK_ROWS, n_rows)
    text_chunk = pd.DataFrame(X_scaled[start:stop]).astype(str).agg(' '.join, axis=1)
    X_embed[start:stop] = model_slm.encode(text_chunk.tolist(), show_progress_bar=False)
    # Single self-overwriting status line instead of one progress bar per chunk
    print(f"Embedded {stop:,}/{n_rows:,} rows", end='\r')
print()

# SLM split (same test_size / stratify / random_state as the tabular split)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_embed, y, test_size=0.2, stratify=y, random_state=42
)

# Lightweight classifier
slm_model = LogisticRegression(max_iter=100, class_weight='balanced')
slm_model.fit(X_train_s, y_train_s)

In [None]:
#✅ STEP 5: Train All Models

# Fit the three tabular models on the scaled training features.
rf.fit(X_train, y_train)
ann.fit(X_train, y_train)
gb.fit(X_train, y_train)
# slm_model is not refitted here: STEP 4 already fits it on
# (X_train_s, y_train_s), and a second fit on the same data would only
# repeat that work.

In [None]:
#✅ STEP 6: Predict and Fuse

rf_pred = rf.predict(X_test)
ann_pred = ann.predict(X_test)
gb_pred = gb.predict(X_test)
slm_pred = slm_model.predict(X_test_s)

# Stack predictions: one row per model, one column per test sample
all_preds = np.vstack([rf_pred, ann_pred, gb_pred, slm_pred])

# The probability columns of every model must describe the same classes in
# the same order for the averaged probabilities below to be meaningful.
classes = rf.classes_
assert all(np.array_equal(m.classes_, classes) for m in (ann, gb, slm_model))

# Votes per class, shape (n_classes, n_samples)
vote_counts = np.stack([(all_preds == c).sum(axis=0) for c in classes])
top_votes = vote_counts.max(axis=0)

# Mean predicted probability across the four models, shape (n_samples, n_classes)
mean_proba = np.mean(
    [
        rf.predict_proba(X_test),
        ann.predict_proba(X_test),
        gb.predict_proba(X_test),
        slm_model.predict_proba(X_test_s),
    ],
    axis=0,
)

# Majority vote. With four voters a tie is possible; a plain mode would always
# resolve it to the lowest label. Instead, only classes holding the top vote
# count stay eligible, and among those the highest mean probability wins.
# When there is a single top class this is exactly the majority vote.
eligible_scores = np.where(vote_counts == top_votes, mean_proba.T, -np.inf)
fusion_pred = classes[eligible_scores.argmax(axis=0)]


In [None]:
# Evaluate the fused predictions against the held-out labels.
acc = accuracy_score(y_test, fusion_pred)
f1 = f1_score(y_test, fusion_pred, average='weighted')

print(f"✅ Fusion Accuracy: {acc:.4f}")
print(f"✅ Weighted F1 Score: {f1:.4f}\n")

# Class names recovered from the label encoder fitted in STEP 2; the encoded
# labels are 0..k-1, so passing them explicitly fixes the matrix row/column
# order to match the tick labels below.
class_names = [str(c) for c in le.classes_]
conf_mat = confusion_matrix(
    y_test, fusion_pred, labels=np.arange(len(class_names))
)

print("📄 Classification Report:\n", classification_report(y_test, fusion_pred))
print("📊 Confusion Matrix:\n", conf_mat)

# Plot (the matrix computed above is reused rather than recomputed)
fig, ax = plt.subplots()
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Fusion Model Confusion Matrix")
plt.show()


In [None]:
# Utility cell (Colab only): upload a notebook, strip the `widgets` metadata
# from it, and download the cleaned copy. Outputs and execution counts are kept.
import nbformat
from pathlib import Path

# Step 1: Upload your broken notebook again
from google.colab import files
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when nothing was uploaded
if not uploaded:
    raise ValueError("No file was uploaded; nothing to clean.")

# The first uploaded file is the notebook to clean
filename = next(iter(uploaded))

# Step 2: Load and clean
with open(filename, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

# ✅ Step 3: Clean ONLY metadata.widgets (KEEP outputs and execution_count)
# Popping in place keeps each cell's metadata object and all its other keys.
for cell in nb.cells:
    cell.get('metadata', {}).pop('widgets', None)

# ✅ Step 4: Clear notebook-level metadata.widgets (if present)
nb.get('metadata', {}).pop('widgets', None)

# Step 5: Save cleaned notebook
# Build the output name from the stem so it always differs from the input,
# even when the upload lacks an .ipynb extension or contains ".ipynb" more
# than once; the uploaded file is therefore never overwritten.
source_path = Path(filename)
cleaned_filename = str(source_path.with_name(f"{source_path.stem}_fixed.ipynb"))
with open(cleaned_filename, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print(f"✅ Cleaned notebook saved (outputs preserved) as {cleaned_filename}")
files.download(cleaned_filename)