<a href="https://colab.research.google.com/github/sanalpillai/Data-Cleaning-Feature-Selection-Modeling-and-Interpretability/blob/main/Data_Cleaning_Feature_Selection_Modeling_and_Interpretability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import shap
import h2o
from h2o.automl import H2OAutoML
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Cell 2: Data Loading and Initial Cleaning
# Read the CSV straight from the GitHub raw URL, then merge selected category
# labels: in 'Status' the value 'CL' is folded into 'C', and in 'Edema' the
# value 'S' is folded into 'Y'. All other values are left untouched.
data_url = "https://raw.githubusercontent.com/sanalpillai/Data-Cleaning-Feature-Selection-Modeling-and-Interpretability/main/Dataset/cleaned_data_cirrhosis.csv"
category_merges = {
    'Status': {'CL': 'C'},
    'Edema': {'S': 'Y'},
}

data = pd.read_csv(data_url)
for column_name, value_map in category_merges.items():
    data[column_name] = data[column_name].replace(value_map)

In [None]:
# Cell 3: Handling Missing Values and Encoding
# Impute every column in place on `data`:
#   * object (string) columns  -> most frequent value (mode)
#   * all other columns        -> column mean
# The filled Series is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `data[column]`: that chained in-place form is
# deprecated in pandas 2.x and does not modify `data` under copy-on-write.
for column in data.columns:
    if data[column].dtype == 'object':
        fill_value = data[column].mode()[0]
    else:
        fill_value = data[column].mean()  # Replace with median if skewed
    data[column] = data[column].fillna(fill_value)

# One-Hot Encoding for categorical *feature* columns only.
# 'Status' is the modelling target used by the AutoML cell below; encoding it
# would replace it with 'Status_*' indicator columns and the target lookup
# would fail, so it is deliberately left as a single label column.
categorical_features = [
    column
    for column in data.select_dtypes(include=['object', 'category']).columns
    if column != 'Status'
]
data = pd.get_dummies(data, columns=categorical_features)

In [None]:
# Cell 4: Feature Normalization
# Standardise (zero mean, unit variance) every column whose dtype matches
# 'float64' or 'int'; columns of any other dtype are left as they are.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed in the AutoML cell — confirm this is intended.
numeric_columns = data.select_dtypes(include=['float64', 'int']).columns

scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

In [None]:
# Cell 5: Initializing H2O and Running AutoML
# Starts (or connects to) a local H2O cluster, uploads the prepared pandas
# frame, splits it 75/25, and trains an AutoML run on the training part.
h2o.init(max_mem_size="4G")  # Adjust memory size according to your system's capacity
h2o_df = h2o.H2OFrame(data)

# Specify the target up front so it can be validated before any training.
target = "Status"  # Change according to your dataset
if target not in h2o_df.columns:
    # Fail early with an actionable message instead of a failure inside AutoML.
    raise ValueError(
        f"Target column '{target}' not found in the data; "
        "make sure it was not one-hot encoded or dropped during preprocessing."
    )

# H2O decides between classification and regression from the response type:
# the target must be a factor (enum) for AutoML to train classifiers.
h2o_df[target] = h2o_df[target].asfactor()

# Split the data into training and test sets (seeded for reproducibility)
train, test = h2o_df.split_frame(ratios=[.75], seed=42)

# Every column except the target is used as a predictor
features = [x for x in train.columns if x != target]

# AutoML Configuration and Execution: stops at 20 models or 600 seconds,
# whichever limit is reached first.
aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=600)
aml.train(x=features, y=target, training_frame=train)

In [None]:
# Cell 6: Model Interpretability with SHAP
# `best_model` remains the AutoML leader (top row of the leaderboard).
best_model = h2o.get_model(aml.leaderboard[0, "model_id"])

# SHAP contributions are computed with `predict_contributions`. The leader is
# frequently a StackedEnsemble, which cannot produce contributions without a
# background frame, so explain the highest-ranked tree-based model instead.
# AutoML model ids start with the algorithm name (e.g. "GBM_1_AutoML_...").
# If no tree-based model was trained, fall back to the leader.
tree_model_prefixes = ("GBM", "XGBoost", "DRF", "XRT")
leaderboard_ids = aml.leaderboard["model_id"].as_data_frame()["model_id"].tolist()
explain_model_id = next(
    (model_id for model_id in leaderboard_ids if model_id.startswith(tree_model_prefixes)),
    leaderboard_ids[0],
)
explain_model = h2o.get_model(explain_model_id)

# Use the hold-out H2OFrame produced by the train/test split above; the
# original code referenced an `X_test` that was never defined.
test_h2o = test
shap_values = explain_model.predict_contributions(test_h2o)

# Convert SHAP values to a pandas DataFrame for easier manipulation
shap_values_df = shap_values.as_data_frame()

# The contributions frame carries an extra 'BiasTerm' column (the model's
# baseline). shap.summary_plot needs one SHAP column per feature, so drop it.
contribution_df = shap_values_df.drop(columns=["BiasTerm"], errors="ignore")

# Build the feature matrix with exactly the same columns, in the same order,
# as the contributions. Columns that do not exist in the test frame (e.g. if
# the model expanded a categorical into per-level columns — verify for your
# model) become NaN and are only used for point colouring in the plot.
X_test = test_h2o.as_data_frame().reindex(columns=contribution_df.columns)

shap.summary_plot(
    contribution_df.to_numpy(),
    features=X_test,
    feature_names=list(X_test.columns),
)