In [1]:
# This notebook fits a Geographical Random Forest (GRF) using the sociodemographic variables.
# Dataset 3 (LA) is used as the worked example.

In [2]:
# Packages
import pandas as pd
import numpy as np
import geopandas as gpd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import statsmodels.api as sm

from scipy.spatial import distance

# Geographical RandomForest

In [3]:
class GeographicalRandomForest:
    """Geographical Random Forest (GRF) regressor.

    One global random forest is fitted on all training rows, and one local
    random forest is fitted per training location using the ``band_width``
    spatially nearest training rows (Euclidean distance on the coordinates).
    A prediction blends both: ``local_weight * local + (1 - local_weight) * global``.
    """

    def __init__(self, ntree, mtry, band_width, local_weight, local_model_num=1, bootstrap=False, random_seed=42):
        """Store the hyper-parameters; no model is trained here.

        Parameters
        ----------
        ntree : number of trees of the global forest (``n_estimators``).
        mtry : ``max_features`` passed to every forest.
        band_width : number of nearest training rows used to fit each local forest.
        local_weight : weight of the local prediction in the combined prediction.
        local_model_num : how many nearest local models are averaged per test row (default 1).
        bootstrap : ``bootstrap`` flag passed to every forest.
        random_seed : ``random_state`` passed to every forest.

        Raises
        ------
        ValueError
            If ``band_width`` or ``local_model_num`` is smaller than 1.
        """
        # Fail fast: both values are used as neighbour counts and a value
        # below 1 can only lead to an empty neighbourhood later on.
        if band_width < 1:
            raise ValueError("band_width must be at least 1")
        if local_model_num < 1:
            raise ValueError("local_model_num must be at least 1")

        self.ntree = ntree
        self.mtry = mtry
        self.band_width = band_width
        self.local_weight = local_weight
        self.local_model_num = local_model_num
        self.bootstrap = bootstrap
        self.random_seed = random_seed
        self.global_model = None
        self.local_models = None
        self.train_data_coords = None
        self.distance_matrix = None
        self.train_data_index = None
        self.train_data_columns = None

    @staticmethod
    def _nearest_indices(distance_array, k):
        """Return the positions of the ``k`` smallest distances (unordered)."""
        n = len(distance_array)
        if k >= n:
            # np.argpartition rejects kth == n, and every point is a neighbour anyway.
            return np.arange(n)
        return np.argpartition(distance_array, k)[:k]

    def _local_model_key(self, position):
        """Build the ``"<record id>|<x>|<y>"`` dictionary key of a training row."""
        return (
            str(self.train_data_index.iloc[position])
            + "|" + str(self.train_data_coords[position][0])
            + "|" + str(self.train_data_coords[position][1])
        )

    def fit(self, X_train, y_train, coords, record_index):
        """Fit the global forest and one local forest per training row.

        Parameters
        ----------
        X_train : data frame of the training independent variables.
        y_train : data series of the target dependent variable.
        coords : data frame of the two-dimensional coordinates of the training rows.
        record_index : data series of record identifiers, used to key the local models.
        """
        # Remember identifiers and feature names for prediction and importance reporting.
        self.train_data_index = record_index
        self.train_data_columns = X_train.columns

        # Global model, fitted on every training row.
        rf_global = RandomForestRegressor(
            bootstrap=self.bootstrap,
            n_estimators=self.ntree,
            max_features=self.mtry,
            random_state=self.random_seed,
        )
        rf_global.fit(X_train, y_train)
        self.global_model = rf_global

        self.local_models = {}

        # Pairwise Euclidean distances between the training locations.
        coords_array = np.array(coords, dtype=np.float64)
        self.train_data_coords = coords_array
        self.distance_matrix = distance.cdist(coords_array, coords_array, 'euclidean')

        n_train = len(X_train)
        # A neighbourhood cannot contain more rows than the training set has.
        neighbours = min(self.band_width, n_train)

        # Local forests see only a fraction of the data, so they get a
        # proportionally smaller number of trees (at least one).
        local_tree_size = max(1, int(self.ntree * (neighbours * 1.0 / n_train)))

        for i in range(n_train):
            idx = self._nearest_indices(self.distance_matrix[i], neighbours)

            rf_local = RandomForestRegressor(
                bootstrap=self.bootstrap,
                n_estimators=local_tree_size,
                max_features=self.mtry,
                random_state=self.random_seed,
            )
            rf_local.fit(X_train.iloc[idx], y_train.iloc[idx])

            self.local_models[self._local_model_key(i)] = rf_local

    def predict(self, X_test, coords_test):
        """Predict with the global model, the local models, and their blend.

        Parameters
        ----------
        X_test : data frame of the independent variables of the test rows.
        coords_test : data frame of the two-dimensional coordinates of the test rows.

        Returns
        -------
        tuple
            ``(predict_combined, predict_global, predict_local)`` where the
            combined and local predictions are lists and the global
            prediction is the flattened array returned by the global model.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if self.global_model is None or self.local_models is None:
            raise RuntimeError("The model has not been trained yet; call fit() first.")

        # 1) Global prediction.
        predict_global = self.global_model.predict(X_test).flatten()

        # 2) Local prediction: average the local models attached to the
        #    nearest training locations of every test row.
        coords_test_array = np.array(coords_test, dtype=np.float64)
        distance_matrix_test_to_train = distance.cdist(coords_test_array, self.train_data_coords, 'euclidean')
        n_local = min(self.local_model_num, len(self.train_data_coords))

        predict_local = []
        for i in range(len(X_test)):
            idx = self._nearest_indices(distance_matrix_test_to_train[i], n_local)
            this_row = X_test[i:i+1]

            local_sum = 0
            for this_idx in idx:
                local_model = self.local_models[self._local_model_key(this_idx)]
                local_sum += local_model.predict(this_row).flatten()[0]

            # Divide by the number of models actually used.
            predict_local.append(local_sum * 1.0 / n_local)

        # 3) Weighted blend of local and global predictions.
        predict_combined = [
            local_value * self.local_weight + global_value * (1 - self.local_weight)
            for local_value, global_value in zip(predict_local, predict_global)
        ]

        return predict_combined, predict_global, predict_local   # three types of predictions

    def get_local_feature_importance(self):
        """Return one row of feature importances per local model.

        The first column is named after ``record_index.name`` and holds the
        record identifier as text (taken from the local-model key); the
        remaining columns are the training feature names. Prints a message
        and returns ``None`` when the model has not been trained.
        """
        if self.local_models is None:
            print("The model has not been trained yet...")
            return None

        index_name = self.train_data_index.name
        column_list = [index_name] + list(self.train_data_columns)

        # Collect plain records and build the frame once: DataFrame.append
        # was removed in pandas 2.0 and was quadratic in the number of rows.
        records = []
        for model_key, this_local_model in self.local_models.items():
            # Key layout is "<record id>|<x>|<y>"; split from the right so an
            # identifier that itself contains "|" is kept intact.
            this_row = {index_name: model_key.rsplit("|", 2)[0]}
            this_row.update(zip(self.train_data_columns, this_local_model.feature_importances_))
            records.append(this_row)

        return pd.DataFrame(records, columns=column_list)

# Prepare data

In [4]:
# Sociodemographic table (one row per GEOID); also holds the target column 'obesity_cr'.
X_socio = pd.read_csv("../02 Dataset/05 Dataset 3/Socio_LA.csv")  # input
# Shapefile read for the tract attributes that are joined onto the table below.
ct_shp = gpd.read_file("../02 Dataset/07 Coordinates info for GWR/LA_CDC data_Tract_Ob_pro.shp")  # input
# Cast the join key to int64 before merging on it.
ct_shp['GEOID'] = ct_shp['GEOID'].astype('int64')

# Left join keeps every row of the sociodemographic table.
X_socio_1 = X_socio.merge(ct_shp, on='GEOID', how='left')
# Index by GEOID and discard the geometry column.
X_socio_2 = X_socio_1.set_index('GEOID').drop(columns=['geometry'])
# Split the target off the predictors (pop removes it from X_socio_2).
Y_2 = X_socio_2.pop('obesity_cr')
len(X_socio_2)

947

In [5]:
def standarize_data(data, stats):
    """Standardize ``data`` with precomputed statistics: ``(data - mean) / std``.

    Parameters
    ----------
    data : values to standardize (e.g. a data frame of features).
    stats : object exposing ``stats['mean']`` and ``stats['std']``
        (e.g. the transposed output of ``DataFrame.describe()``).

    Returns
    -------
    The standardized values. Where the standard deviation is exactly 0
    (a constant feature) the divisor is replaced by 1, so the result is the
    centred value instead of NaN/inf.
    """
    std = stats['std']
    if isinstance(std, (pd.Series, pd.DataFrame)):
        # mask keeps the index, so the division still aligns on labels.
        safe_std = std.mask(std == 0, 1.0)
    elif np.ndim(std) == 0:
        safe_std = 1.0 if std == 0 else std
    else:
        safe_std = np.where(np.asarray(std) == 0, 1.0, std)
    return (data - stats['mean']) / safe_std

# GRF 10-fold cross-validation: local_w = 0.5, local_model_num = 40 ☆☆☆

In [6]:
# Out-of-fold predictions and the matching observed values, pooled over all folds.
y_rf_socio_predict = []
y_true = []

# Sociodemographic predictors fed to the model (same list for train and test).
socio_columns = [
    '% Black', '% Ame Indi and AK Native', '% Asian', '% Nati Hawa and Paci Island',
    '% Hispanic or Latino', '% male', '% married',
    '% age 18-29', '% age 30-39', '% age 40-49', '% age 50-59', '% age >=60',
    '% <highschool', 'median income', '% unemployment', '% below poverty line',
    '% food stamp/SNAP', 'median value units built', 'median year units built',
    '% renter-occupied housing units', 'population density',
]
coord_columns = ["Lonpro", "Latpro"]

ten_fold = KFold(n_splits=10, shuffle=True, random_state=42)

for train_rows, test_index in ten_fold.split(X_socio_2):
    print("TEST:", test_index)

    # Positional split of predictors and target.
    X_train_1 = X_socio_2.iloc[train_rows]
    X_test_1 = X_socio_2.iloc[test_index]
    y_train = Y_2.iloc[train_rows]
    y_test = Y_2.iloc[test_index]

    X_train = X_train_1[socio_columns]
    X_test = X_test_1[socio_columns]
    xy_coord = X_train_1[coord_columns]
    coords_test = X_test_1[coord_columns]

    # Record identifiers (the GEOID index) of the training rows, as a Series.
    train_index_1 = X_train.index
    train_index = pd.Series(train_index_1)

    # Standardize both splits with the statistics of the training split only.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    grf = GeographicalRandomForest(890, 10, 377, 0.5, local_model_num=40)  # need to change
    grf.fit(scaled_X_train, y_train, xy_coord, train_index)

    predict_combined, predict_global, predict_local = grf.predict(scaled_X_test, coords_test)
    y_rf_socio_predict.extend(predict_combined)
    y_true.extend(y_test.tolist())

TEST: [ 23  30  39  44  59  63  67  70  72  76  86  88  96 107 120 136 139 165
 168 198 208 209 215 218 247 250 259 260 265 275 280 292 294 298 310 312
 327 331 332 333 363 365 377 388 394 439 449 453 457 464 465 478 481 495
 500 513 518 519 527 541 554 559 569 589 591 599 616 617 643 644 653 658
 673 679 684 695 708 753 762 767 777 778 783 792 804 816 826 850 858 874
 883 908 926 932 942]
TEST: [ 31  33  49  60  65  66  78 110 137 141 158 174 192 199 210 213 231 235
 254 261 266 286 296 302 306 307 309 311 314 316 326 328 334 342 352 361
 371 381 405 423 428 430 433 442 447 482 483 493 507 525 531 543 545 570
 572 576 594 602 605 606 615 625 630 685 689 694 707 732 735 736 737 746
 780 784 788 813 814 817 819 820 835 838 852 854 866 872 886 889 896 904
 910 925 941 944 945]
TEST: [  2   5   7  10  25  29  54  55  77  81  82  84  97 101 109 118 155 196
 204 211 227 228 239 244 281 318 319 321 323 344 346 350 355 357 380 398
 408 411 412 420 424 425 444 451 456 468 494 514 516 526 529 5

In [7]:
# Accuracy over the pooled out-of-fold predictions.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rf_socio_rmse = np.sqrt(mean_squared_error(y_true, y_rf_socio_predict))
rf_socio_r2 = r2_score(y_true, y_rf_socio_predict)
# sociodemographic - estimators
print("RMSE of the RF model with sociodemographic predictors: "+str(rf_socio_rmse))
print("R2 of the RF model with sociodemographic predictors: "+str(rf_socio_r2)) # For R2, I took this one.

RMSE of the RF model with sociodemographic predictors: 1.203717344914799
R2 of the RF model with sociodemographic predictors: 0.9509605556367354
