In [None]:
# === A1 & A2: COMBINED STACKING CLASSIFIER AND PIPELINE IMPLEMENTATION ===
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# --- 2. Data Loading and Splitting ---
# Load the embedding table. The code below only requires a 'label' column;
# every other column is treated as a feature.
try:
    df = pd.read_csv('embedded_dataset_deberta.csv')
except FileNotFoundError:
    print("Error: 'embedded_dataset_deberta.csv' not found. Using placeholder data for demonstration.")

    # Use a seeded generator so the placeholder data (and every score derived
    # from it) is identical on each "Restart & Run All", consistent with the
    # random_state=42 used for the split below.
    rng = np.random.default_rng(42)
    data = {f'feature_{i}': rng.random(200) for i in range(10)}
    data['label'] = rng.integers(0, 3, 200)  # three classes: 0, 1, 2
    df = pd.DataFrame(data)

# Rows without a target cannot be used for supervised training.
df = df.dropna(subset=['label'])
X = df.drop('label', axis=1)
y = df['label']

# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Data variables (X_train, X_test, etc.) defined.")


# --- 3. A1: Stacking Classifier Training ---
# Level-0 learners. Both use the same number of trees and the same seed;
# only the maximum tree depth differs.
shared_params = dict(n_estimators=50, random_state=42)
base_estimators = [
    ('rf', RandomForestClassifier(max_depth=10, **shared_params)),
    ('gb', GradientBoostingClassifier(max_depth=3, **shared_params)),
]
# Level-1 learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=500)

# Assemble the stack (stack_clf); n_jobs=-1 requests all available cores.
stack_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    n_jobs=-1,
)

print("Training Stacking Classifier (A1)... This will take some time.")
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_a1 = stack_clf.fit(X_train, y_train).predict(X_test)

# Evaluate A1 on the held-out test split.
acc_a1 = accuracy_score(y_true=y_test, y_pred=y_pred_a1)
print(f"A1 Stacking Classifier Accuracy: {acc_a1:.4f}")


# --- 4. A2: Pipeline Implementation and Training ---
# Pipeline.fit() fits its steps in place (it does not clone them), so passing
# stack_clf directly would retrain the A1 model on scaled features and
# overwrite it. clone() hands the pipeline an unfitted copy with identical
# hyper-parameters, so the A1 model stays intact and A1 vs. A2 is a fair
# comparison of "without scaling" vs. "with scaling".
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),      # Preprocessing step: scales the data
    ('stacking', clone(stack_clf))     # Classifier step: fresh, unfitted copy of the stack
])

print("Training Pipeline (A2)...")
# Fit pipeline: fits the StandardScaler on X_train, then trains the cloned
# stacking classifier from scratch on the scaled training data.
pipeline.fit(X_train, y_train)

# Predict using pipeline (X_test is scaled with the statistics learned from X_train)
pipeline_pred = pipeline.predict(X_test)

# Evaluate A2 (Final Accuracy with Scaling)
pipeline_acc = accuracy_score(y_test, pipeline_pred)
print(f"A2 Pipeline Accuracy (with Scaling): {pipeline_acc:.4f}")

Data variables (X_train, X_test, etc.) defined.
Training Stacking Classifier (A1)... This will take some time.
A1 Stacking Classifier Accuracy: 0.5732
Training Pipeline (A2)...


In [None]:
# === A3: LIME Explanation ===


import numpy as np
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt 

# 1. Create LIME explainer object
# LIME is initialized using the defined X_train to learn the feature distribution.
# random_state pins LIME's random sampling so the explanation is reproducible.
explainer = LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns.tolist(),
    class_names=[str(c) for c in np.unique(y_train)],
    mode='classification',
    random_state=42
)


def predict_proba_with_names(rows):
    """Return the pipeline's class probabilities for LIME's perturbed samples.

    LIME calls the prediction function with a bare 2-D numpy array, while the
    pipeline is fitted on a DataFrame. Re-attaching the training column names
    keeps the input consistent with what the pipeline saw during fit and
    avoids scikit-learn's "X does not have valid feature names" warning.

    Parameters
    ----------
    rows : array-like of shape (n_samples, n_features)
        Samples with columns in the same order as ``X_train``.

    Returns
    -------
    Whatever ``pipeline.predict_proba`` returns for those samples.
    """
    return pipeline.predict_proba(pd.DataFrame(rows, columns=X_train.columns))


# 2. Define the instance to explain (the first sample in the test set)
i = 0  # index of test sample
test_instance = X_test.iloc[i].to_numpy()

# 3. Generate the LIME explanation
# top_labels=1 makes LIME explain the class with the highest predicted
# probability for this row. Without it LIME explains class index 1 only,
# regardless of which class the model actually predicts.
exp = explainer.explain_instance(
    data_row=test_instance,
    predict_fn=predict_proba_with_names,
    num_features=10,  # Show the top 10 most influential features
    top_labels=1
)
explained_label = exp.top_labels[0]  # index of the class being explained

# 4. Report the outcome and show the visual explanation
print(f"\n--- LIME EXPLANATION FOR TEST SAMPLE #{i} ---")
# Predict the class for the report. iloc[[i]] returns a one-row DataFrame that
# keeps the original column names and dtypes.
predicted_class = pipeline.predict(X_test.iloc[[i]])[0]
print(f"Predicted Class: {predicted_class}")
print(f"True Class: {y_test.iloc[i]}")


exp.show_in_notebook(show_table=True)


print("\nTop 5 Features Influencing Prediction:")
# as_list() yields (feature description, weight) pairs for the requested
# label; only the description is printed here.
for feature_desc, _weight in exp.as_list(label=explained_label)[:5]:
    print(f"- {feature_desc}")