# Human Machine Interaction & Bias Mitigation

## Imports

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

from tpot import TPOTClassifier
import shap
import lime
import lime.lime_tabular

from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds


## Data Loading and Initial Analysis
- Loading the dataset using Pandas and performing an initial analysis to understand the basic properties. 
- Using descriptive statistics and visualizations to identify distributions, detect missing values, and spot outliers.
- Identifying the datatypes of the columns

In [8]:
# Load the patient dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv('patient_data.csv')

In [9]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,rs1047763,rs9282541,rs3827760,rs4988235,rs1801133,rs9374842,BMI,CardiovascularDisease
0,0,0,0,0,0,0,28.607859,0
1,0,0,1,1,0,0,26.651948,0
2,1,1,1,0,0,1,31.885502,0
3,0,0,1,0,0,0,29.353686,0
4,1,1,0,0,0,0,33.630251,0
5,0,0,0,0,0,0,28.243031,0
6,1,0,1,0,0,0,21.634838,0
7,1,1,1,1,0,0,36.809607,1
8,0,0,0,0,0,1,23.471339,0
9,0,0,0,0,1,1,23.231168,0


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,rs1047763,rs9282541,rs3827760,rs4988235,rs1801133,rs9374842,BMI,CardiovascularDisease
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,0.433333,0.326667,0.49,0.316667,0.286667,0.276667,28.899291,0.113333
std,0.496364,0.469778,0.500735,0.465953,0.45296,0.448098,5.17193,0.317529
min,0.0,0.0,0.0,0.0,0.0,0.0,13.798057,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,25.292649,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,29.185791,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,32.13121,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,44.188743,1.0


In [11]:
# Show the dtype pandas inferred for each column.
df.dtypes

rs1047763                  int64
rs9282541                  int64
rs3827760                  int64
rs4988235                  int64
rs1801133                  int64
rs9374842                  int64
BMI                      float64
CardiovascularDisease      int64
dtype: object

## Initial Data Cleaning
Getting an overview of the problems in the data using Microsoft's Data Wrangler extension for VS Code, then fixing them.

- Preparing Data
- Encoding categorical variables as needed, preparing data for Model 

NOT including any ML-method of imputation in this step as I haven't performed the train-test split yet and fitting the imputer on the whole df could risk data leakage -> will do this after EDA/Train-Test-Split.

In [None]:
# to be implemented after using DataWrangler VS Code extension

## Exploratory Data Analysis before Imputation 
Getting an overview of the data -> how is the data spread out. 
For each column:
- Histogram: display the frequency of data points within specified bins, providing a visual representation of the distribution of a dataset.
- Density Plot:  visualize the distribution of data by estimating the probability density function, showing where values are concentrated -> represent probability distribution

### Histograms / Density Plots

In [None]:
# seaborn is also imported in the Imports cell; repeated here.
import seaborn as sns

# Set the style of seaborn plots (applies to every figure drawn afterwards)
sns.set_theme(style="whitegrid")

# Per-column distribution plots plus printed summary statistics
def plot_distributions(df):
    """Plot a histogram and a density plot side by side for every column.

    For each column of ``df`` one figure with two panels is shown, followed
    by the column's ``describe()`` summary, skewness and kurtosis printed to
    standard output.

    Args:
        df: DataFrame whose columns are plotted one after another.

    Returns:
        None. Figures are displayed and statistics are printed.
    """
    for name in df.columns:
        values = df[name]
        fig, (hist_ax, kde_ax) = plt.subplots(1, 2, figsize=(10, 5))

        # Left panel: raw counts in 30 bins, without a KDE overlay.
        sns.histplot(values, kde=False, bins=30, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {name}')
        hist_ax.set_xlabel(name)
        hist_ax.set_ylabel('Frequency')

        # Right panel: filled kernel density estimate.
        sns.kdeplot(values, fill=True, ax=kde_ax)
        kde_ax.set_title(f'Density Plot of {name}')
        kde_ax.set_xlabel(name)
        kde_ax.set_ylabel('Density')

        fig.tight_layout()
        plt.show()

        # Numeric summary printed below the figure.
        print(f'Statistics for {name}:')
        print(values.describe())
        print('\nSkewness:', values.skew())
        print('\nKurtosis:', values.kurtosis())
        print('\n')

# Plot distributions of every column of the full DataFrame
plot_distributions(df)

### Correlation Matrix

In [None]:
# Pairwise correlations between all columns of df
corr_matrix = df.corr()

# Draw the matrix as an annotated heatmap with square cells
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar=True,
    square=True,
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

## Train-Test Split
Using the train_test_split from sklearn.model_selection to split the data
- split into train and interim test set
- split interim test set into val and test set
- export test set to csv
- delete interim_test_set and test_set from notebook -> helps ensure prevention of data leakage

In [None]:
# First split: 70-30 train-test split, with interim test set
# random_state is fixed so the shuffle is reproducible.
# NOTE(review): the split is not stratified; the positive class is only ~11%
# of the rows (see df.describe()), so consider passing
# stratify=df['CardiovascularDisease'] - confirm before changing.
train_set, interim_test_set = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Separate the training features from the label.
# The label column of df is 'CardiovascularDisease' (see df.dtypes); the
# placeholder name 'targetvariable' is not a column and raises a KeyError.
X_train = train_set.drop('CardiovascularDisease', axis=1)
y_train = train_set['CardiovascularDisease']

In [None]:
# Second split: 50-50 validation-test split
# Halves the interim test set, i.e. 15% of df each for validation and test.
val_set, test_set = train_test_split(interim_test_set, test_size=0.5, random_state=42)

In [None]:
# Separate the validation features from the label.
# These must come from val_set: building them from train_set would make every
# "validation" score a score on the training rows. The label column is
# 'CardiovascularDisease' (see df.dtypes).
X_val = val_set.drop('CardiovascularDisease', axis=1)
y_val = val_set['CardiovascularDisease']

In [None]:
# Export the test set to a CSV file
# index=False leaves the DataFrame index out of the file.
test_set.to_csv('test_set.csv', index=False)

# Drop both held-out frames from the notebook namespace so later cells
# cannot use them by accident.
del test_set, interim_test_set

## Imputation

In [None]:
def evaluate_imputation(X, y, imputer, model):
    """Score an imputer by the cross-validated MSE of a downstream model.

    The imputer, a ``StandardScaler`` and ``model`` are chained into one
    pipeline, so all three are re-fitted inside every fold.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values passed to ``cross_val_score``.
        imputer: Imputer used as the first pipeline step.
        model: Estimator used as the final pipeline step.

    Returns:
        Mean squared error averaged over 5 shuffled folds.
    """
    steps = [
        ('imputer', imputer),
        ('scaler', StandardScaler()),  # scaling happens after imputation
        ('regressor', model),
    ]
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    neg_mse_per_fold = cross_val_score(
        Pipeline(steps), X, y, scoring='neg_mean_squared_error', cv=folds
    )

    # The scorer reports negated MSE; flip the sign before averaging.
    return np.mean(-neg_mse_per_fold)

In [None]:
# Candidate 1: k-nearest-neighbours imputer with k=5
knn_imputer = KNNImputer(n_neighbors=5)

# Candidate 2: iterative imputer driven by a small random forest
rf_imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    random_state=42,
)

# Downstream regression model used to score both candidates
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Cross-validated MSE with the KNN imputer
mse_knn = evaluate_imputation(X_train, y_train, knn_imputer, model)
print(f"KNN Imputer MSE: {mse_knn}")

# Cross-validated MSE with the random-forest IterativeImputer
mse_rf = evaluate_imputation(X_train, y_train, rf_imputer, model)
print(f"Random Forest Imputer MSE: {mse_rf}")

# Lower MSE wins; a tie goes to the random-forest imputer
if mse_knn < mse_rf:
    best_imputer = 'KNN'
else:
    best_imputer = 'Random Forest'
print(f"Best imputer selected based on cross-validated MSE: {best_imputer}")

### Fitting the selected Imputer on the Training & Validation Set

In [None]:
### TODO - Implement the best imputer - delete other imputer

In [None]:
# Fit KNN imputer on the training data
# The imputer is fitted on X_train only and then applied to X_val, so no
# validation rows influence the imputation.
# NOTE(review): the IterativeImputer cell assigns the same X_train_imputed /
# X_val_imputed names; run only the cell for the selected imputer.
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(X_train)
X_val_imputed = knn_imputer.transform(X_val)

In [None]:
# Fit Iterative Imputer using RandomForest on the training data
# The imputer is fitted on X_train only and then applied to X_val, so no
# validation rows influence the imputation.
# NOTE(review): the KNN imputer cell assigns the same X_train_imputed /
# X_val_imputed names; run only the cell for the selected imputer.
rf_imputer = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, random_state=42), random_state=42)
X_train_imputed = rf_imputer.fit_transform(X_train)
X_val_imputed = rf_imputer.transform(X_val)

## Model Selection & Training with TPOT
Using TPOTClassifier (Tree-based Pipeline Optimization Tool) to automate the selection and training of the best predictive model based on the cleaned training dataset. This tool explores various models and hyperparameter settings to find the optimal solution. As TPOT can also impute missing values, I am trying out these two things:
- TPOT Classifier on Data without Imputation
- TPOT Classifier on Data with Imputation from previous step

The goal is to find out which of the two variants scores better.



### TPOT with dataframe with NaN

In [None]:
# Instantiate and train the TPOT classifier on the un-imputed training data
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, scoring='accuracy', random_state=42)
tpot.fit(X_train, y_train)

# Evaluate the classifier on the validation set
print("Validation Accuracy: ", tpot.score(X_val, y_val))

# Export the best pipeline under its own file name: the run on the imputed
# data exports to 'tpot_best_model.py', which would otherwise overwrite this
# file and lose one side of the comparison.
tpot.export('tpot_best_model_raw.py')

### TPOT with imputed dataframes

In [None]:
# Run the TPOT search on the imputed training features
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    scoring='accuracy',
    random_state=42,
)
tpot.fit(X_train_imputed, y_train)

# Score the best pipeline on the imputed validation features
val_accuracy = tpot.score(X_val_imputed, y_val)
print("Validation Accuracy: ", val_accuracy)

# Write the best pipeline out as a Python script
tpot.export('tpot_best_model.py')

## Model Evaluation
In this step, I analyse the output of the best model from the TPOT Classifier using various metrics and visualize the results.

### Accuracy

In [None]:
# Evaluate the best pipeline found by TPOT.
# Until this point `model` is the RandomForestRegressor from the imputer
# comparison; it is never fitted outside cross-validation and is not a
# classifier, so it cannot be evaluated here.
# `tpot` was last fitted on the imputed training data, hence predictions are
# made on the imputed validation features.
model = tpot.fitted_pipeline_
y_pred = model.predict(X_val_imputed)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")

### Classification Report

In [None]:
# Text report built from the validation labels and the predictions in y_pred.
class_report = classification_report(y_val, y_pred)
print("Classification Report:\n", class_report)

### Confusion Matrix

In [None]:
# Confusion matrix of validation labels against predictions
conf_matrix = confusion_matrix(y_val, y_pred)

# Annotated heatmap with integer cell counts
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Neg', 'Predicted Pos'],
    yticklabels=['Actual Neg', 'Actual Pos'],
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('Actual Labels')
ax.set_title('Confusion Matrix')
plt.show()

#### ROC-AUC Score

In [None]:
# ROC-AUC needs class probabilities: the final estimator of the TPOT pipeline
# must implement predict_proba and the target must be binary.
# `model` is bound to the fitted TPOT pipeline here because the
# RandomForestRegressor from the imputer comparison has no predict_proba.
# `tpot` was last fitted on the imputed training data, hence X_val_imputed.
model = tpot.fitted_pipeline_
y_prob = model.predict_proba(X_val_imputed)[:, 1]  # probability of the positive class
roc_auc = roc_auc_score(y_val, y_prob)
print(f"ROC-AUC Score: {roc_auc:.2f}")

## Model Explanation

### Global Methods

#### SHAP Summary Plot

#### Permutation Feature Importance

In [None]:
# Permutation importance on the validation set, 10 shuffles per feature.
# NOTE(review): `model` must be a fitted estimator at this point - confirm
# which object it is bound to before running this cell.
result = permutation_importance(
    model, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1
)
# Feature order from least to most important (by mean importance)
sorted_idx = result.importances_mean.argsort()

# One horizontal box per feature, showing the spread over the repeats
fig, ax = plt.subplots(figsize=(10, 7))
ax.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=X_val.columns[sorted_idx],
)
ax.set_title('Permutation Importance of each feature')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

### Local Methods

#### LIME

#### SHAP Values

## Bias Identification and Mitigation
I am using Fairlearn, a library that was initially developed by researchers at Microsoft. It is now maintained as an open-source project to aid in assessing and mitigating fairness issues in AI. 

#### SMOTE (Oversampling)
Oversamples the minority class with synthetic samples.

In [None]:
# Counter is not imported anywhere else in the notebook; without this import
# the print statements below raise a NameError.
from collections import Counter

# Applying SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Samples per class before and after resampling
print(f'Original dataset shape {Counter(y_train)}')
print(f'Resampled dataset shape {Counter(y_smote)}')

#### NearMiss (Undersampling)
Undersamples the majority class, keeping only the majority instances that are closest to instances of the minority class.

In [None]:
# Counter is not imported anywhere else in the notebook; without this import
# the print statements below raise a NameError.
from collections import Counter

# Applying NearMiss
nm = NearMiss(version=1)
X_nm, y_nm = nm.fit_resample(X_train, y_train)

# Samples per class before and after resampling
print(f'Original dataset shape {Counter(y_train)}')
print(f'Resampled dataset shape {Counter(y_nm)}')

#### Class-weight Balancing
Balances the weight of the individual classes