In [14]:
import pandas as pd
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read the source table from disk
file_path = '/Users/amirsedaghatdoost/Library/CloudStorage/OneDrive-TexasA&MUniversity/Papers/Paper01/Analysis_2/Average depthwise properties.csv'
data = pd.read_csv(file_path)

# Columns modelled as responses, and columns left out of the analysis entirely
target_variables = ['DOC', 'DON', 'NO3', 'NH4', 'Fe(II)', 'Fe(III)', 'Mn (IV)', 'SO4']
excluded_variables = ['Site', 'Hm', 'Sigma', 'EC', 'Ca', 'Mg', 'K', 'Mn', 'Zn']

# Every remaining column is a predictor; cast the predictor matrix to float32.
# astype raises if a remaining column cannot be converted to a number.
predictors = data.drop(columns=[*target_variables, *excluded_variables]).astype('float32')

# Cast the target columns to float32 as well, writing them back into the loaded frame
data[target_variables] = data[target_variables].astype('float32')

# Display labels used in the plots: raw column name -> label (matplotlib mathtext
# where a subscript is needed). Columns without an entry keep their raw name.
updated_names = {
    # Labels for predictor columns
    "Feox": "Fe$_{ox}$",
    "Theta": "Soil moisture",
    "Ks": "K$_{s}$",
    "Bulk Density": "bulk density",
    "Alox": "Al$_{ox}$",
    "ThetaS": "Ɵ$_{s}$",
    "ThetaR": "Ɵ$_{r}$",
    "Alpha": "α",
    # Labels for target columns (looked up when titling each plot)
    "DOC": "DOC",
    "DON": "DON",
    "NO3": "NO$_{3}$",
    "NH4": "NH$_{4}$",
    # NOTE(review): the "Mn (IV)" column is displayed as "Mn(III)" — confirm this relabelling is intended.
    "Mn (IV)": "Mn(III)",
    "SO4": "SO$_{4}$",
}

# Iterating a DataFrame yields its column labels, in column order
predictor_names_updated = [updated_names.get(column, column) for column in predictors]

# Convert predictors to a NumPy array; its column order matches predictor_names_updated
X = predictors.to_numpy()

# Figure size in inches (width, height) shared by every summary plot
FIGURE_SIZE = (10, 6)

# SHAP analysis for each target variable: fit one random forest per target and
# draw a SHAP summary plot of the predictors' contributions to that target.
for target in target_variables:
    y = data[target].values

    # Hold out 20% of the rows; the same fixed seed is used for every target.
    # NOTE(review): X_test / y_test are not used anywhere in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Random Forest Regressor (seeded for reproducibility)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Use SHAP TreeExplainer; SHAP values are computed on the training rows
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)

    # Generate SHAP summary plot.
    # summary_plot defaults to plot_size="auto", which resizes the current
    # figure and discards the figsize set here, so the size is passed explicitly.
    plt.figure(figsize=FIGURE_SIZE)
    shap.summary_plot(
        shap_values,
        X_train,
        feature_names=predictor_names_updated,
        plot_size=FIGURE_SIZE,
        show=False,  # keep the figure open so the title and axis label can be set
    )
    plt.title(f"SHAP Summary Plot for {updated_names.get(target, target)}", fontsize=14)
    plt.xlabel("SHAP value", fontsize=12)
    plt.tight_layout()
    plt.show()


TypeError: Cannot cast array data from dtype('int64') to dtype('int32') according to the rule 'safe'