Dataset link: https://drive.google.com/drive/folders/1fcTn5rc-CcqX40JzVq4qwPkeRo3_fzGa?usp=sharing

### 1) Perform a counterfactual explanation on the Iris dataset. Analyze how the model’s prediction would change by modifying certain feature values, and explain which feature alterations have the most significant impact on the outcome.

Install Required Libraries

In [None]:
#!pip install scikit-learn pandas matplotlib seaborn --quiet


Load and Preprocess the Dataset

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the Iris CSV and drop its 'Id' column in one step.
df = pd.read_csv('Iris.csv').drop(columns=['Id'])

# Fit the encoder on the species column, then overwrite that column with the
# integer codes. `le` is kept so the codes can be decoded back to names later.
le = LabelEncoder().fit(df['Species'])
df['Species'] = le.transform(df['Species'])

df.head()


Train a Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Separate the encoded target column from the feature columns.
y = df['Species']
X = df.drop(columns='Species')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a seeded random forest with otherwise default settings on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


Choose a Sample and Predict Original Class

In [None]:
# Explain the first row of the test split. The list index ([[0]]) keeps the
# result a one-row DataFrame, so it can be passed straight to model.predict().
query_instance = X_test.iloc[[0]]
original_pred = model.predict(query_instance)[0]

# Report the encoded class together with its decoded species name.
original_label = le.inverse_transform([original_pred])[0]
print("Original Class:", original_pred, "-", original_label)
query_instance


Generate Manual Counterfactuals (Systematic Perturbation)

In [None]:
import numpy as np

# Number of evenly spaced values tried for each feature
steps = 20

# Maps feature name -> {'values': swept values, 'predictions': predicted classes}
feature_changes = {}

# For every feature, sweep it across the range observed in X while the other
# features of the query instance stay fixed, and record the model's predictions.
for feature in X.columns:
    values = np.linspace(X[feature].min(), X[feature].max(), steps)

    # Build all perturbed copies of the query row at once and predict them in a
    # single call, instead of calling model.predict() separately for each value.
    perturbed = pd.concat([query_instance] * steps, ignore_index=True)
    perturbed[feature] = values
    predictions = list(model.predict(perturbed))

    feature_changes[feature] = {
        'values': values,
        'predictions': predictions
    }


Visualize Counterfactual Impact (Prediction vs. Feature Value)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one panel per perturbed feature on a 2x2 grid:
# swept feature value on the x-axis, predicted class on the y-axis.
plt.figure(figsize=(12, 8))
for panel, (feature, sweep) in enumerate(feature_changes.items(), start=1):
    ax = plt.subplot(2, 2, panel)
    sns.scatterplot(x=sweep['values'], y=sweep['predictions'], ax=ax)
    ax.set_title(f"{feature} vs. Predicted Class")
    ax.set_xlabel(feature)
    ax.set_ylabel("Predicted Class")

plt.tight_layout()
plt.show()


Identify Most Impactful Features

In [None]:
# For each feature, count the perturbed values whose predicted class differs
# from the prediction for the unmodified query instance.
impact_score = {
    feature: sum(np.array(sweep['predictions']) != original_pred)
    for feature, sweep in feature_changes.items()
}

# Order the features from most to fewest changed predictions.
ranked = sorted(impact_score.items(), key=lambda item: item[1], reverse=True)
sorted_impact = dict(ranked)

print("Feature Impact Scores (higher = more influence):")
for feature, score in sorted_impact.items():
    print(f"- {feature}: {score} transitions")


Visualize Most Influential Features (Bar Plot)

In [None]:
# Bar chart of the impact scores, in the order stored in sorted_impact.
impact_features = list(sorted_impact)
impact_counts = [sorted_impact[name] for name in impact_features]

plt.figure(figsize=(8, 4))
sns.barplot(x=impact_features, y=impact_counts)
plt.title("Feature Impact on Class Change")
plt.xlabel("Feature")
plt.ylabel("Number of Prediction Changes")
plt.show()


Heatmap of Prediction Changes

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Stack the recorded predictions into a (features x steps) grid:
# one row per feature, one column per perturbation step.
heatmap_data = [feature_changes[feature]['predictions'] for feature in X.columns]
heatmap_array = np.array(heatmap_data)

# Colour each cell by the class predicted at that step.
plt.figure(figsize=(10, 5))
sns.heatmap(
    heatmap_array,
    cmap="viridis",
    xticklabels=False,
    yticklabels=list(X.columns),
)
plt.title("Prediction Change Heatmap Across Feature Perturbations")
plt.xlabel("Steps")
plt.ylabel("Features")
plt.show()


Line Plot of Predictions per Feature

In [None]:
# Line plot per feature: how the predicted class moves as the feature is swept.
plt.figure(figsize=(12, 8))
for panel, (feature, sweep) in enumerate(feature_changes.items(), start=1):
    ax = plt.subplot(2, 2, panel)
    ax.plot(sweep['values'], sweep['predictions'], marker='o')
    ax.set_title(f"{feature} Influence on Prediction")
    ax.set_xlabel(feature)
    ax.set_ylabel("Predicted Class")
    ax.grid(True)

plt.tight_layout()
plt.show()


Feature Perturbation Transition Steps Plot

In [None]:
# For each feature, find the first step of its perturbation sweep at which the
# predicted class differs from the model's prediction for the unmodified query
# instance (original_pred).
transition_steps = []

for feature, data in feature_changes.items():
    preds = np.array(data['predictions'])
    # Compare against the query's own prediction rather than preds[0]: preds[0]
    # is the prediction at the feature's minimum value, which may already be a
    # different class from the original one.
    changed_steps = np.flatnonzero(preds != original_pred)
    if changed_steps.size > 0:
        # A step of 0 means the class already differs at the feature's minimum.
        transition_steps.append({'Feature': feature, 'Step': changed_steps[0]})

# Plot the first changing step per feature; features that never change the
# prediction are simply absent from the chart.
if transition_steps:
    transition_df = pd.DataFrame(transition_steps)

    plt.figure(figsize=(8, 4))
    sns.barplot(data=transition_df,hue='Feature',x='Feature', y='Step', palette='coolwarm',legend=False)
    plt.title("Feature Perturbation Step Causing First Class Change")
    plt.ylabel("Step Index (Lower = More Sensitive)")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("No class transitions detected during feature perturbation.")


Parallel Coordinates Plot (Optional - Multifeature View)

In [None]:
from pandas.plotting import parallel_coordinates

# Collect the original query row plus one variant per feature, where that
# feature alone is raised by 20% of its range in X.
variants = [query_instance]
for feature in X.columns:
    bumped = query_instance.copy()
    feature_range = X[feature].max() - X[feature].min()
    bumped[feature] = bumped[feature] + feature_range * 0.2
    variants.append(bumped)
combined_df = pd.concat(variants, axis=0)

# Predict every row, then swap the numeric class codes for species names.
predicted_codes = model.predict(combined_df)
combined_df['Prediction'] = predicted_codes
combined_df['Prediction'] = le.inverse_transform(combined_df['Prediction'])

# Mark the first row as the original and all remaining rows as modified.
n_modified = len(combined_df) - 1
combined_df['Type'] = ['Original'] + ['Modified'] * n_modified

# One line per row, coloured by predicted species. The 'Type' column is left
# out of the plotted frame, so it does not appear as an axis.
plt.figure(figsize=(10, 6))
plot_frame = combined_df.drop(columns=['Type'])
parallel_coordinates(
    plot_frame,
    class_column='Prediction',
    color=('#1f77b4', '#ff7f0e', '#2ca02c'),
)
plt.title("Parallel Coordinates Plot: Feature Impact on Prediction")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict the held-out test rows.
y_pred = model.predict(X_test)

# Pin the row/column order of the matrix to the classes the model was trained
# on, and derive the tick labels from those same codes. Without `labels`, the
# matrix only contains classes present in y_test/y_pred, so it could be smaller
# than (or ordered differently from) the display labels.
class_codes = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_codes)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.inverse_transform(class_codes),
)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()


Feature Importance Plot

In [None]:
import numpy as np

# Rank the forest's built-in feature importances from largest to smallest.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
ranked_names = X.columns[indices].tolist()
ranked_scores = importances[indices]

plt.figure(figsize=(8, 4))
sns.barplot(x=ranked_names, y=ranked_scores)
plt.title("Feature Importances (Random Forest)")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()


Pair Plot (Colored by Predicted Class)

In [None]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Copy the test features and attach the decoded true and predicted species.
viz_df = X_test.assign(**{
    'True': le.inverse_transform(y_test),
    'Predicted': le.inverse_transform(y_pred),
})

# Hide only the "figure layout has changed to tight" warning while plotting.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message=".*figure layout has changed to tight.*")

    # Lower-triangle pair plot, coloured by the predicted species.
    sns.set(style="ticks")
    g = sns.pairplot(viz_df, hue='Predicted', corner=True, palette='Set1')
    g.fig.suptitle("Pair Plot by Predicted Class", y=1.02)

    plt.show()


Decision Boundary (2D Projection)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Choose two features for the 2D projection (any two columns of X)
feature1 = 'SepalLengthCm'  # Change as needed
feature2 = 'SepalWidthCm'   # Change as needed

# Build a 100x100 mesh covering both features' observed ranges, padded by 1
x_min, x_max = X[feature1].min() - 1, X[feature1].max() + 1
y_min, y_max = X[feature2].min() - 1, X[feature2].max() + 1

xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))

# One row per mesh point, holding the two plotted features
grid_points = np.c_[xx.ravel(), yy.ravel()]
grid_df = pd.DataFrame(grid_points, columns=[feature1, feature2])

# Hold every feature that is NOT plotted at its mean value. Deriving the list
# from X.columns (instead of naming the petal columns) keeps this correct when
# feature1/feature2 are changed.
for other in X.columns:
    if other not in (feature1, feature2):
        grid_df[other] = X[other].mean()

# Restore the column order used for training; the model was fitted on a
# DataFrame, so scikit-learn checks feature names and their order at predict time.
grid_df = grid_df[list(X.columns)]

# Predict the class at every mesh point and reshape back to the mesh
predictions = model.predict(grid_df)
Z = predictions.reshape(xx.shape)

# Shade the predicted class regions
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)

# Overlay all rows of X (train and test), coloured by encoded species
sns.scatterplot(x=X[feature1], y=X[feature2], hue=y, palette="deep", s=100, edgecolor="k")

# Highlight the query instance
sns.scatterplot(x=query_instance[feature1], y=query_instance[feature2], color="black", marker="*", s=200, label="Query Instance")

# Labeling the plot
plt.title(f"Decision Boundary for {feature1} vs {feature2}")
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.legend()
plt.tight_layout()
plt.show()


### 2) Conduct a feature importance analysis on the Diabetes dataset. Use an appropriate technique to evaluate the contribution of each feature to the model's predictions, and identify which features are the most influential in determining the outcome.

Install necessary libraries

In [None]:
#!pip install pandas matplotlib seaborn scikit-learn --quiet


Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import warnings
# Silence every warning from this point on.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# modelling libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')


Load and Inspect the Dataset

In [None]:
# Load dataset
df = pd.read_csv('diabetes.csv')

# Display column names, dtypes and non-null counts. DataFrame.info() prints its
# report itself and returns None, so it is called directly; wrapping it in
# print() would add a stray "None" line to the output.
df.info()

# Check for missing values per column
print("\nMissing values:\n", df.isnull().sum())

# Display first few rows of the dataset
df.head()


Prepare Features and Target Variable

In [None]:
# The target is the 'Outcome' column; every other column is used as a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Reserve 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


Train the Random Forest Model

In [None]:
# Fit a seeded random forest, with otherwise default settings, on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)


Feature Importance Visualization

In [None]:
# Get the forest's built-in feature importances
importances = model.feature_importances_
feature_names = X.columns

# Tabulate them, most important feature first
feature_imp_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot Feature Importance.
# `palette` is only applied together with `hue`: passing it alone is deprecated
# in seaborn 0.13. Mapping hue to the y variable with legend=False keeps the
# per-bar colours without adding a redundant legend.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feature_imp_df,
    palette='viridis',
    legend=False,
)
plt.title('Feature Importance - Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout()
plt.show()


Display Top 3 Important Features

In [None]:
# feature_imp_df is already sorted by descending importance, so its first
# three rows are the three most important features.
top_features = feature_imp_df.iloc[:3]
top_features


Confusion Matrix and Classification Report

In [None]:
# Predict on test data
y_pred = model.predict(X_test)

# Confusion matrix. `labels` pins the row/column order to model.classes_, the
# same array used for the tick labels, so the two can never get out of step
# (for example if one class were absent from the test split).
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred))


SHAP (Optional Advanced Feature Importance) - Not Needed for This Analysis

In [None]:
#!pip install shap

SHAP Analysis for Feature Contribution

Correlation Heatmap

In [None]:
# Pairwise correlations between all columns of df (features and Outcome).
corr = df.corr()

# Annotated heatmap of the correlation table, values shown to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()


Pairplot of Top 3 Important Features

In [None]:
# Names of the three most important features, as a plain list.
top_3_features = list(top_features['Feature'])

# Pairwise scatter plots of those features, coloured by Outcome.
pairplot_columns = [*top_3_features, 'Outcome']
sns.pairplot(df[pairplot_columns], hue='Outcome', palette='Set2')
plt.suptitle("Pairplot of Top 3 Features by Importance", y=1.02)
plt.show()


Boxplots of Top 3 Features

In [None]:
# Boxplots showing how each of the top 3 features is distributed per Outcome.
plt.figure(figsize=(15, 5))
for i, feature in enumerate(top_3_features):
    plt.subplot(1, 3, i+1)
    # `palette` is only applied together with `hue`: passing it alone is
    # deprecated in seaborn 0.13. Mapping hue to the x variable with
    # legend=False keeps the per-class colours without a redundant legend.
    sns.boxplot(x='Outcome', y=feature, hue='Outcome', data=df, palette='Set1', legend=False)
    plt.title(f'{feature} vs Outcome')

# Adjust spacing once, after all panels exist, rather than on every iteration.
plt.tight_layout()
plt.show()


Pairwise Correlation Matrix

In [None]:
# Grid of pairwise plots over every column of df, coloured by Outcome.
# (This is a pair plot of the raw values, not a table of correlations.)
scatter_style = dict(alpha=0.7)
sns.pairplot(df, hue='Outcome', palette='Set2', plot_kws=scatter_style)
plt.suptitle("Pairwise Correlation Matrix", y=1.02)
plt.show()


 ROC Curve for Model Evaluation 

In [None]:
from sklearn.metrics import roc_curve, auc

# Compute the ROC curve from predicted probabilities, not hard class labels.
# With 0/1 predictions there is only one usable threshold, so the "curve"
# collapses to a single operating point and the reported AUC is misleading.
# Column 1 of predict_proba is the probability of model.classes_[1]
# (assumes Outcome is coded 0/1 with 1 as the positive class -- confirm).
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()


Feature Distribution for Top 3 Features

In [None]:
# Histogram with a KDE overlay for each of the top 3 features, side by side.
plt.figure(figsize=(15, 5))
for panel, feature in enumerate(top_3_features, start=1):
    ax = plt.subplot(1, 3, panel)
    sns.histplot(df[feature], kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.tight_layout()
plt.show()
