In [None]:
# Packages
import os
import sys

import esda
import libpysal
import numpy as np
import pandas as pd
import PyGRF
import shap
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Read the modeling table from disk and report how many rows were loaded.
data_311 = pd.read_csv(filepath_or_buffer="../02 Data/modeling_grf.csv")
print("len(data_311): ", data_311.shape[0])

In [None]:
# function for standardizing variables
def standarize_data(data, stats):
    """Z-score ``data`` with precomputed statistics.

    Parameters
    ----------
    data
        Values to scale. The notebook passes a pandas DataFrame whose columns
        are the features.
    stats
        Object indexable by ``'mean'`` and ``'std'``. The notebook passes
        ``DataFrame.describe().transpose()``, so both lookups yield a Series
        indexed by feature name and the arithmetic aligns on column labels.

    Returns
    -------
    ``(data - mean) / std``. A standard deviation of exactly zero is treated
    as 1, so a constant column is scaled to 0 instead of NaN/inf.
    """
    std = stats['std']
    # `std == 0` is True (i.e. 1) exactly where the spread is zero, so adding
    # it swaps those zeros for 1 and leaves every other value (NaN included)
    # untouched. Works for scalars, arrays and label-indexed Series alike.
    safe_std = std + (std == 0)
    return (data - stats['mean']) / safe_std

def pygrf_311_predict(X, local_weight=0.14163829762242533):
    """Single-argument prediction wrapper around the fitted ``pygrf_311`` model.

    The wrapper exists so that the model can be handed to ``shap.Explainer``
    as a plain ``f(X)`` callable.

    Relies on two module-level names that must be defined before the call:
    ``pygrf_311`` (the fitted model) and ``X_columns`` (the feature names).

    Parameters
    ----------
    X
        Table containing every column in ``X_columns`` plus ``'lon'`` and
        ``'lat'``; it is indexed with column-name lists, so it must support
        DataFrame-style column selection.
    local_weight
        Forwarded unchanged to ``pygrf_311.predict`` as ``local_weight``.
        The default is the value used in the cross-validation loop.

    Returns
    -------
    The first element of the tuple returned by ``pygrf_311.predict`` (the
    loop unpacks that element as ``predict_combined``).
    """
    coords_2 = X[['lon', 'lat']]
    X_2 = X[X_columns]
    predictions = pygrf_311.predict(X_2, coords_2, local_weight=local_weight)
    return predictions[0]

In [None]:
# Select the independent (explanatory) variables: every column except
# 'Geography', the coordinates and the dependent variable
# ('311_index_per_property', used as y below).
columns_to_exclude = ['Geography', 'lon', 'lat', '311_index_per_property']
X_columns = [column for column in data_311.columns if column not in columns_to_exclude]

# Value passed to PyGRF's predict() as `local_weight`. pygrf_311_predict uses
# the same number for the SHAP runs, so keep the two in sync.
LOCAL_WEIGHT = 0.14163829762242533

# Every per-fold CSV is written into this folder; create it up front so that
# to_csv() does not fail on a fresh checkout.
output_dir = "../02 Data/SHAP result"
os.makedirs(output_dir, exist_ok=True)

K_fold = KFold(n_splits=10, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(K_fold.split(data_311)):
    print("fold:", i)
    print("train:", train_index)
    print("TEST:", test_index)

    # get the training and test data in each fold, and keep a copy of both on disk
    X_train_all, X_test_all = data_311.iloc[train_index], data_311.iloc[test_index]
    X_train_all.to_csv("{}/{}_X_train_all.csv".format(output_dir, i))
    X_test_all.to_csv("{}/{}_X_test_all.csv".format(output_dir, i))
    y_train, y_test = X_train_all['311_index_per_property'], X_test_all['311_index_per_property']
    X_train = X_train_all[X_columns]
    X_test = X_test_all[X_columns]
    xy_coord = X_train_all[['lon', 'lat']]
    coords_test = X_test_all[['lon', 'lat']]

    # standardize the independent variables; the mean/std come from the
    # training rows only and are applied to both the training and test rows
    training_stat = X_train.describe().transpose()
    X_scaled_train = standarize_data(X_train, training_stat)
    X_scaled_test = standarize_data(X_test, training_stat)
    # SHAP input: the scaled features with the unscaled coordinates appended,
    # because pygrf_311_predict pulls 'lon'/'lat' out of the same table
    X_scaled_train_2 = pd.merge(X_scaled_train, xy_coord, left_index=True, right_index=True, how="left")

    # modeling
    # NOTE: the model must be bound to the module-level name `pygrf_311`,
    # since pygrf_311_predict looks it up by that name.
    pygrf_311 = PyGRF.PyGRFBuilder(n_estimators=130, max_features=22, max_depth = 10, band_width=15, train_weighted=True, predict_weighted=True, bootstrap=True,
                          resampled=True, random_state=42)
    pygrf_311.fit(X_scaled_train, y_train, xy_coord[['lon','lat']])
    predict_combined, predict_global, predict_local = pygrf_311.predict(X_scaled_test, coords_test[['lon','lat']], local_weight=LOCAL_WEIGHT)
    r2 = r2_score(y_test, predict_combined)
    print("R2: ", r2)

    # SHAP, computed on this fold's training rows with the same rows as background data
    # NOTE(review): algorithm="tree" selects SHAP's tree explainer, which is
    # documented for tree-model objects, while the model given here is a plain
    # Python callable. Confirm this runs as intended; "permutation" or "exact"
    # are the model-agnostic choices if it does not.
    explainer = shap.Explainer(model = pygrf_311_predict, masker = X_scaled_train_2, algorithm = "tree", seed = 45)
    shap_values = explainer(X_scaled_train_2)

    # Persist the three parts of the explanation. index=False drops the row
    # labels, so rows line up positionally with {i}_X_train_all.csv.
    shap_values_values = pd.DataFrame(shap_values.values, columns=X_scaled_train_2.columns)
    shap_values_values.to_csv("{}/{}_shap_values_values.csv".format(output_dir, i), index=False)
    shap_values_base_values = pd.DataFrame(shap_values.base_values)
    shap_values_base_values.to_csv("{}/{}_shap_values_base_values.csv".format(output_dir, i), index=False)
    shap_values_data = pd.DataFrame(shap_values.data, columns=X_scaled_train_2.columns)
    shap_values_data.to_csv("{}/{}_shap_values_data.csv".format(output_dir, i), index=False)