In [3]:
# ===============================
# LAB ASSIGNMENT 7-1
# Counterfactual Explanations
# ===============================

# STEP 1: Install DiCE
!pip install dice-ml --quiet

# STEP 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import dice_ml

# ===============================
# PART 1: Load & Preprocess Dataset
# ===============================

# Load dataset (take only 200 rows for fast execution)
df = pd.read_csv("/content/drive/MyDrive/aiml projects/lung_cancer_dataset.csv").head(200)

print("Dataset Shape:", df.shape)
print(df.head())

# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())

# Drop rows with missing values (or you can use fillna)
df = df.dropna()

# Encode categorical variables if any
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Separate features and target
X = df.drop(columns=['lung_cancer'])  # assuming 'LUNG_CANCER' is target
y = df['lung_cancer']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ===============================
# PART 2: Train Models
# ===============================

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, name):
    print(f"\n{name} Results:")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1-score :", f1_score(y_true, y_pred))

# Evaluate both models
evaluate_model(y_test, y_pred_lr, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

# ===============================
# PART 3: Counterfactual Generation (DiCE)
# ===============================

# Prepare data for DiCE (using training data)
d = dice_ml.Data(dataframe=pd.concat([pd.DataFrame(X_train, columns=X.columns), y_train.reset_index(drop=True)], axis=1),
                 continuous_features=X.columns.tolist(),
                 outcome_name='lung_cancer')

# Wrap trained model for DiCE
m = dice_ml.Model(model=log_reg, backend="sklearn")

# Create DiCE explainer
exp = dice_ml.Dice(d, m)

# Pick one test instance that was predicted as negative (0)
test_instance = X_test[y_pred_lr == 0]
if len(test_instance) > 0:
    test_instance = pd.DataFrame(test_instance[0].reshape(1, -1), columns=X.columns)

    # Generate 3 counterfactuals
    dice_exp = exp.generate_counterfactuals(test_instance, total_CFs=3, desired_class=1)

    # Show results
    dice_exp.visualize_as_dataframe()
else:
    print("No negative predictions found in test set to generate counterfactuals.")

# ===============================
# PART 4: Reflection
# ===============================

print("\nAnalysis: Counterfactual explanations show minimal feature changes needed to flip the decision.")
print("These can guide doctors/patients to understand risk factors realistically.")
print("Changing distance metrics (Euclidean vs Manhattan) may affect which features change first.")


Dataset Shape: (200, 11)
   patient_id  age  gender  pack_years radon_exposure asbestos_exposure  \
0      100000   69    Male   66.025244           High                No   
1      100001   32  Female   12.780800           High                No   
2      100002   89  Female    0.408278         Medium               Yes   
3      100003   78  Female   44.065232            Low                No   
4      100004   38  Female   44.432440         Medium               Yes   

  secondhand_smoke_exposure copd_diagnosis alcohol_consumption family_history  \
0                        No            Yes            Moderate             No   
1                       Yes            Yes            Moderate            Yes   
2                       Yes            Yes                 NaN             No   
3                       Yes             No            Moderate             No   
4                        No            Yes                 NaN            Yes   

  lung_cancer  
0          No  
1    

100%|██████████| 1/1 [00:00<00:00,  6.51it/s]

Query instance (original outcome : 0)





Unnamed: 0,patient_id,age,gender,pack_years,radon_exposure,asbestos_exposure,secondhand_smoke_exposure,copd_diagnosis,alcohol_consumption,family_history,lung_cancer
0,-0.68077,-1.522492,-1.028991,-0.114418,1.166479,-0.917663,0.891556,-1.074172,-1.154701,-0.930949,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,patient_id,age,gender,pack_years,radon_exposure,asbestos_exposure,secondhand_smoke_exposure,copd_diagnosis,alcohol_consumption,family_history,lung_cancer
0,-0.68077,1.20923,-1.028992,-0.114418,0.14137,-0.917663,0.891556,-1.074172,-1.154701,-0.930949,1
1,-0.68077,1.280836,-1.028992,-0.114418,1.16648,-0.917663,0.891556,-1.074172,-1.154701,-0.930949,1
2,-0.68077,1.553506,-1.028992,-0.114418,1.16648,-0.917663,0.891556,-1.074172,-1.154701,-0.930949,1



Analysis: Counterfactual explanations show minimal feature changes needed to flip the decision.
These can guide doctors/patients to understand risk factors realistically.
Changing distance metrics (Euclidean vs Manhattan) may affect which features change first.
