## **IMPORTS**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, mean_squared_error

import kagglehub
from kagglehub import KaggleDatasetAdapter

## **DATA LOADING**


In [None]:
# Name of the CSV file inside the Kaggle dataset that we want to read
file_path = "lung_cancer_dataset.csv"

# Fetch the latest version of the dataset and read the file into a
# pandas DataFrame through the kagglehub pandas adapter
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "mikeytracegod/lung-cancer-risk-dataset",
    path=file_path,
)

df.head(3)

## **DATA WRANGLING**

First, we need to do some quick preprocessing.

In [None]:
# Convert categorical variables to quantitative
# Integer code for every label of every text-valued column. The two ordered
# columns (radon_exposure, alcohol_consumption) keep their natural order.
YES_NO_CODES = {'Yes': 1, 'No': 0}
CATEGORY_CODES = {
    'lung_cancer': YES_NO_CODES,
    'gender': {'Male': 1, 'Female': 0},
    'radon_exposure': {'Low': 0, 'Medium': 1, 'High': 2},
    'asbestos_exposure': YES_NO_CODES,
    'secondhand_smoke_exposure': YES_NO_CODES,
    'copd_diagnosis': YES_NO_CODES,
    'alcohol_consumption': {'None': 0, 'Moderate': 1, 'Heavy': 2},
    'family_history': YES_NO_CODES,
}


def encode_categoricals(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with the text columns replaced by integer codes.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing every column listed in ``CATEGORY_CODES``.

    Returns
    -------
    pd.DataFrame
        A new frame; ``frame`` itself is not modified. Labels that are not
        listed in ``CATEGORY_CODES`` become NaN, as with a plain ``Series.map``.

    Notes
    -----
    Columns that are already numeric are left alone, so running this cell a
    second time is a no-op instead of turning every encoded value into NaN.
    """
    encoded = frame.copy()
    for column, codes in CATEGORY_CODES.items():
        values = encoded[column]
        if pd.api.types.is_numeric_dtype(values):
            # Already encoded by an earlier run of this cell.
            continue
        if column == 'alcohol_consumption':
            # The "no alcohol" level may arrive either as the literal text
            # 'None' or as a missing value (the CSV reader can parse 'None'
            # as NaN); both are encoded as 0.
            values = values.fillna('None')
        encoded[column] = values.map(codes)
    return encoded


df = encode_categoricals(df)

df.head(3)

Now, we will do our train/test split and build the model.

In [None]:
# Use every column as a predictor except the response (lung_cancer) and the
# row identifier (patient_id); the original column order is preserved
features = [
    name for name in df.columns
    if name not in ("lung_cancer", "patient_id")
]

# Hold out a test set (scikit-learn's default split size, fixed seed)
train_x, test_x, train_y, test_y = train_test_split(
    df[features],
    df["lung_cancer"],
    random_state=1,
)

# Build a 200-tree random forest and fit it on the training portion;
# fit() returns the estimator itself, so mymodel and clf are the same object
clf = RandomForestClassifier(n_estimators=200, random_state=1)
clf.fit(train_x, train_y)
mymodel = clf

## **PERMUTATION IMPORTANCE (PIMP)**

Before we calculate the permutation importance for every feature, I want to show how it works manually for one feature.

In [None]:
# Reference accuracy of the fitted forest on the untouched test set
base_acc = accuracy_score(y_true=test_y, y_pred=clf.predict(test_x))
base_acc

In [None]:
# Work on an independent (deep) copy so the real test set stays intact
permuted_x = test_x.copy(deep=True)
permuted_x.head(3)

In [None]:
# Shuffle the 'pack_years' feature
# A seeded generator keeps the demo reproducible, and assigning the permuted
# array back to the column avoids mutating `.values` in place (that only works
# when pandas hands out a writable view of the frame's data, which is not the
# case under Copy-on-Write). Drawing from test_x makes re-running this cell
# produce the same shuffle every time.
shuffle_rng = np.random.default_rng(1)
permuted_x['pack_years'] = shuffle_rng.permutation(test_x['pack_years'].to_numpy())

permuted_x.head(3)

In [None]:
# Score the same model on the test set with the shuffled column
permuted_acc = accuracy_score(y_true=test_y, y_pred=clf.predict(permuted_x))
print(f"Permuted accuracy: {permuted_acc}")

# The permutation importance is the drop in accuracy caused by the shuffle
importance = base_acc - permuted_acc
print(f"Permutation importance: {importance:.4f}")

Now, we'll run the function to compute the importance for every feature.

In [None]:
# Baseline performance on the untouched test set
y_pred = clf.predict(test_x)
baseline_acc = accuracy_score(test_y, y_pred)
print(f"Baseline accuracy: {baseline_acc:.3f}")

# Permutation importance: each feature is shuffled n_repeats times and the
# drop in accuracy is recorded. random_state pins the shuffles so the table
# is identical on every Restart & Run All.
perm = permutation_importance(
    clf,
    test_x,
    test_y,
    n_repeats=3,
    scoring='accuracy',
    random_state=1,
)

# One row per feature, most important first
imp_df = pd.DataFrame({
    'feature': test_x.columns,
    'importance_mean': perm.importances_mean,
    'importance_std': perm.importances_std
}).sort_values('importance_mean', ascending=False)

imp_df.head(3)

In [None]:
# Diverging colormap: red for negative, white for zero, green for positive
red_white_green = LinearSegmentedColormap.from_list(
    "red_white_green",
    ["red", "white", "green"],
)

# Symmetric colour limits so that an importance of exactly 0 lands on white
max_abs_importance = imp_df["importance_mean"].abs().max()

styled_df = imp_df.style.background_gradient(
    cmap=red_white_green,
    subset=["importance_mean"],
    vmin=-max_abs_importance,
    vmax=max_abs_importance,
)
styled_df

The table is sorted by `importance_mean`, so the features toward the top are the most important and those toward the bottom are the least important. The `importance_std` column on the right measures how much the drop in accuracy varied from one shuffle to the next across the repeats. A negative value means the model happened to be slightly more accurate on the shuffled data than on the real data; this usually happens by chance when the feature contributes little to the predictions.