# XGBoost

In [1]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import os
import xgboost as xgb
from pathlib import Path

In [2]:
# --- Experiment configuration ------------------------------------------------
# City used as the default argument of the loaders/evaluators below and in the
# output file names.
CITY = "palermo"

# Root of the preprocessed inputs; feature and label folders live below it.
data_dir = "../../preprocessed/"
# features_dir = data_dir + "features/features_all/"
separate_features_dir = data_dir + "features/features_separate_cities/"
labels_dir = data_dir + "labels/"

# Selects which loader is called and which output file name is built.
# (Previously assigned twice in this cell; one assignment is enough.)
standardize_features = True

# Number of PCA components; it is part of the feature file name.
PCA_components = 64

# When equal to "GEO", the "centroid" columns are used as features in
# addition to the "PCA" columns.
USE_GEO = "GEO"

# Feature extractor; the file-name selection cell handles "vgg19",
# "resnet50" and "vgg16_4096".
network_type = "vgg16_4096"

cities = ['milano', 'bologna', 'firenze', 'palermo', 'torino']

In [3]:
# Names of every label variable that is scored; the corresponding data frame
# columns carry a "label_" prefix.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [4]:
# Label variables grouped by category; each group gets its own result table.
land_use = [
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "hType_mix",
    "nig_rat_daily",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
]

small_blocks = [
    "avg_block_area",
    "num_intersect",
    "sphi",
]

age_buildings = [
    "bld_avg_age",
    "enterprises_empl_size",
]

concentration = [
    "pop_rat_num",
    "emp_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "den_nres_non-daily",
]

vacuums = [
    "mdist_parks",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
]

## Functions

In [5]:
# Choose the base name of the feature CSV for the configured network.
# The per-city file name is derived from this value when the data is loaded.
if network_type == "vgg19":
    features_file = "Italy_6_cities_vgg19_pca"+str(PCA_components)+"_linear_fc_thirdlast_layer.csv"
elif network_type == "resnet50":
    features_file = "Italy_6_cities_resnet_pca"+str(PCA_components)+"_second_last_layer.csv"
elif network_type == "vgg16_4096":
    # NOTE(review): this name starts with "resnet_pca" although the network is
    # vgg16 -- confirm it matches the file on disk before renaming anything.
    features_file = "Italy_6_cities_resnet_pca" + str(PCA_components) + "_vgg16_4096.csv"
else:
    # Fail here instead of leaving features_file undefined (or stale from an
    # earlier run of this cell) and failing later when the file is read.
    raise ValueError("Unsupported network_type: " + repr(network_type))

In [6]:
def get_normalized_labels_features(city_name=CITY):
    """Load the combined feature/label table of one city.

    The file read is ``features_file`` with its ``.csv`` suffix replaced by
    ``_<city_name>_labels_features.csv``, located in ``separate_features_dir``.
    No scaling is applied here; presumably the file already holds standardized
    features (TODO confirm against the preprocessing step).

    Parameters
    ----------
    city_name : str
        City whose file is loaded. The default is the value ``CITY`` had when
        this cell was executed.

    Returns
    -------
    pandas.DataFrame
        The file contents plus a ``city_image`` column (``<city>_<imageName>``)
        and without the ``imageName``, ``city`` and ``index`` columns.

    Raises
    ------
    KeyError
        If one of the ``city``, ``imageName`` or ``index`` columns is missing.
    """
    csv_name = features_file.replace(".csv", "_" + city_name + "_labels_features.csv")
    df = pd.read_csv(separate_features_dir + csv_name)

    # Column-wise string concatenation replaces the slower row-wise apply.
    df["city_image"] = df["city"] + "_" + df["imageName"]

    return df.drop(columns=["imageName", "city", "index"])

In [180]:
def predict_label_i_KFold(city_name=CITY, label="label_hType_mix"):
    """Cross-validate an XGBoost classifier for one label of the global ``data``.

    Rows whose label equals 1 are discarded. Among the remaining rows, label 0
    is the negative class and every other value the positive class. The classes
    are balanced by random under-sampling, then a 16-tree ``binary:logistic``
    model is evaluated with 5-fold stratified cross-validation.

    Parameters
    ----------
    city_name : str
        Not used by the computation: the function reads the module-level
        ``data`` frame, whichever city that was loaded for. The parameter is
        kept so that existing calls keep working.
    label : str
        Name of the label column in ``data``.

    Returns
    -------
    tuple of (dict, dict)
        Both dicts are keyed by "Accuracy", "Precision", "Recall" and "AUC".
        The first maps each metric to ``(mean, std)`` over the folds, the
        second to the mean only. If ``label`` is not a column of ``data`` every
        value is 0 (respectively ``(0, 0)``).
    """
    metric_names = ("Accuracy", "Precision", "Recall", "AUC")

    try:
        kept_rows = data[data[label] != 1]
    except KeyError:
        # Unknown label column: keep the historical all-zero result so callers
        # that build a table from the return value still work.
        return ({name: (0, 0) for name in metric_names},
                {name: 0 for name in metric_names})

    # 0 stays 0, everything else becomes 1.
    y = (kept_rows[label] != 0).astype(int).values

    if USE_GEO == "GEO":
        feature_cols = [c for c in kept_rows.columns if ("PCA" in c) or ("centroid" in c)]
    else:
        feature_cols = [c for c in kept_rows.columns if "PCA" in c]
    X = kept_rows[feature_cols].values

    rus = RandomUnderSampler(random_state=1)
    # fit_resample replaces fit_sample, which newer imbalanced-learn releases
    # no longer provide; fall back to the old name on old installations.
    resample = getattr(rus, "fit_resample", None) or rus.fit_sample
    X_resampled, y_resampled = resample(X, y)
    # Plain arrays so that the fold indices below are always positional.
    X_resampled = np.asarray(X_resampled)
    y_resampled = np.asarray(y_resampled)

    fold_scores = {name: [] for name in metric_names}

    kf = StratifiedKFold(n_splits=5)
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        clf = xgb.XGBModel(objective='binary:logistic', n_estimators=16)
        clf.fit(X_train, y_train, verbose=False)

        # The raw model output is used for the AUC; it is rounded to obtain the
        # hard 0/1 predictions needed by the other metrics.
        predictions = clf.predict(X_test)
        hard_predictions = predictions.round()

        fold_scores["Accuracy"].append(accuracy_score(y_test, hard_predictions))
        fold_scores["Precision"].append(precision_score(y_test, hard_predictions))
        fold_scores["Recall"].append(recall_score(y_test, hard_predictions))
        fold_scores["AUC"].append(roc_auc_score(y_test, predictions))

    return ({name: (np.mean(fold_scores[name]), np.std(fold_scores[name]))
             for name in metric_names},
            {name: np.mean(fold_scores[name]) for name in metric_names})

## Read in data. Choose standardized or not.

In [181]:
# Load the table that the evaluation function reads as the global `data`.
# NOTE(review): get_labels_features is not defined in the visible part of this
# notebook, so the non-standardized path would fail with a NameError.
data = get_normalized_labels_features() if standardize_features else get_labels_features()
data.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density,city_image
0,-1.106497,-0.504827,-0.512352,-1.168296,0.746623,0.108426,0.904777,-0.87933,-0.142061,0.283723,...,1.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,palermo_S2A_MSIL2A_20180712T095031_N0208_R079_...
1,-0.61291,-1.140416,-1.333048,0.390619,-0.511958,-0.390305,-0.58343,-1.166956,-0.912854,-0.530309,...,1.0,1.0,1.0,0.0,0.0,2.0,1.0,2.0,1.0,palermo_S2A_MSIL2A_20180712T095031_N0208_R079_...
2,-0.246564,1.098204,0.977962,0.630964,0.436412,0.291143,0.473744,0.854861,0.557248,-0.009829,...,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,palermo_S2A_MSIL2A_20180712T095031_N0208_R079_...
3,1.124777,-1.439091,-2.192749,1.12807,-1.644134,-2.320422,-0.403868,-1.219822,-0.829199,-1.621165,...,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,palermo_S2A_MSIL2A_20180712T095031_N0208_R079_...
4,0.055417,0.613356,0.921239,-0.766032,-0.076338,0.113436,-0.642975,-1.016053,0.574855,1.227559,...,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,palermo_S2A_MSIL2A_20180712T095031_N0208_R079_...


In [182]:
# Show the column names of the loaded frame.
data.columns

Index(['PCA0', 'PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5', 'PCA6', 'PCA7', 'PCA8',
       'PCA9', 'PCA10', 'PCA11', 'PCA12', 'PCA13', 'PCA14', 'PCA15', 'PCA16',
       'PCA17', 'PCA18', 'PCA19', 'PCA20', 'PCA21', 'PCA22', 'PCA23', 'PCA24',
       'PCA25', 'PCA26', 'PCA27', 'PCA28', 'PCA29', 'PCA30', 'PCA31', 'PCA32',
       'PCA33', 'PCA34', 'PCA35', 'PCA36', 'PCA37', 'PCA38', 'PCA39', 'PCA40',
       'PCA41', 'PCA42', 'PCA43', 'PCA44', 'PCA45', 'PCA46', 'PCA47', 'PCA48',
       'PCA49', 'PCA50', 'PCA51', 'PCA52', 'PCA53', 'PCA54', 'PCA55', 'PCA56',
       'PCA57', 'PCA58', 'PCA59', 'PCA60', 'PCA61', 'PCA62', 'PCA63',
       'centroid_x', 'centroid_y', 'label_bld_avg_age', 'label_hType_mix',
       'label_num_intersect', 'label_LUM5_single', 'label_RNR_nres',
       'label_mdist_smallparks', 'label_nig_rat_daily', 'label_nig_rat_daily3',
       'label_mdist_nres_daily', 'label_num_community_places',
       'label_num_community_places_poi', 'label_avg_block_area', 'label_sphi',
       'labe

## Predict K-Fold

In [183]:
# Score a single label before running the loop over all labels.
predict_label_i_KFold(CITY, label="label_hType_mix")

({'Accuracy': (0.8414285714285714, 0.10694267012693087),
  'Precision': (0.8380952380952381, 0.10583862271853747),
  'Recall': (0.8618181818181817, 0.15019546493137007),
  'AUC': (0.9152727272727272, 0.06124709471942525)},
 {'Accuracy': 0.8414285714285714,
  'Precision': 0.8380952380952381,
  'Recall': 0.8618181818181817,
  'AUC': 0.9152727272727272})

In [184]:
# Score every label: kfold_SCORES keeps (mean, std) per metric,
# kfold_SCORES2 keeps the mean only.
kfold_SCORES, kfold_SCORES2 = {}, {}
for col in label_columns:
    label = "label_" + col
    kfold_SCORES[label], kfold_SCORES2[label] = predict_label_i_KFold(CITY, label)

In [185]:
# Show the mean score of each metric per label.
kfold_SCORES2

{'label_hType_mix': {'Accuracy': 0.8414285714285714,
  'Precision': 0.8380952380952381,
  'Recall': 0.8618181818181817,
  'AUC': 0.9152727272727272},
 'label_num_intersect': {'Accuracy': 0.873076923076923,
  'Precision': 0.9166666666666667,
  'Recall': 0.8428571428571429,
  'AUC': 0.9682539682539684},
 'label_bld_avg_age': {'Accuracy': 0.9679487179487181,
  'Precision': 0.9428571428571428,
  'Recall': 1.0,
  'AUC': 0.9904761904761905},
 'label_emp_rat_num': {'Accuracy': 0.8307692307692307,
  'Precision': 0.8733333333333333,
  'Recall': 0.8428571428571429,
  'AUC': 0.9476190476190476},
 'label_LUM5_single': {'Accuracy': 0.8075098814229248,
  'Precision': 0.7946428571428571,
  'Recall': 0.8787878787878789,
  'AUC': 0.8767217630853995},
 'label_RNR_nres': {'Accuracy': 0.8679841897233201,
  'Precision': 0.8858974358974357,
  'Recall': 0.8393939393939392,
  'AUC': 0.8979338842975209},
 'label_mdist_smallparks': {'Accuracy': 0.8723809523809523,
  'Precision': 0.885,
  'Recall': 0.85714285714

In [186]:
# One column per label, one row per metric (mean values only).
res = pd.DataFrame(kfold_SCORES2)

In [187]:
# Display the metric-by-label table.
res

Unnamed: 0,label_hType_mix,label_num_intersect,label_bld_avg_age,label_emp_rat_num,label_LUM5_single,label_RNR_nres,label_mdist_smallparks,label_nig_rat_daily,label_nig_rat_daily3,label_mdist_nres_daily,...,label_pop_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density
AUC,0.915273,0.968254,0.990476,0.947619,0.876722,0.897934,0.939286,0.915041,0.963265,0.932653,...,0.864,0.725069,0.983333,0.957143,0.848,0.914286,0.819259,0.925247,0.973669,0.911111
Accuracy,0.841429,0.873077,0.967949,0.830769,0.80751,0.867984,0.872381,0.870563,0.885714,0.83619,...,0.76,0.649802,0.9,0.83956,0.77,0.816484,0.753801,0.826462,0.930154,0.820513
Precision,0.838095,0.916667,0.942857,0.873333,0.794643,0.885897,0.885,0.871667,0.889286,0.872778,...,0.76,0.669286,0.909524,0.865556,0.722611,0.81,0.781883,0.863258,0.903663,0.808333
Recall,0.861818,0.842857,1.0,0.842857,0.878788,0.839394,0.857143,0.885455,0.885714,0.785714,...,0.74,0.678788,0.9,0.82381,0.82,0.804762,0.722222,0.811538,0.969231,0.871429


In [188]:
# Write the metric-by-label table to a CSV whose name encodes the configuration.
if standardize_features:
    out_name = ('../../results/XGBoost/separate_cities/XGBoost'
                + str(PCA_components) + '_' + CITY + '_' + network_type
                + '_' + USE_GEO + '_standardized7s.csv')
else:
    # The original expression had "+\" followed by a leading "+", i.e. a unary
    # plus applied to a str, which raises TypeError as soon as this branch runs.
    out_name = ('../../results/XGBoost/XGBoost' + str(PCA_components)
                + '_' + CITY + '_' + network_type
                + '_' + USE_GEO + '7s.csv')

# to_csv does not create missing directories, so make sure the target exists.
Path(out_name).parent.mkdir(parents=True, exist_ok=True)
res.to_csv(out_name, float_format='%.3f')

In [189]:
# Folder that receives the per-category result files of this configuration.
res_dir = "_".join([
    '../../results/XGBoost/separate_cities/' + CITY,
    network_type,
    str(PCA_components),
    USE_GEO,
])
Path(res_dir).mkdir(parents=True, exist_ok=True)

In [190]:
# Display the metric-by-label table again before slicing it by category.
res

Unnamed: 0,label_hType_mix,label_num_intersect,label_bld_avg_age,label_emp_rat_num,label_LUM5_single,label_RNR_nres,label_mdist_smallparks,label_nig_rat_daily,label_nig_rat_daily3,label_mdist_nres_daily,...,label_pop_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density
AUC,0.915273,0.968254,0.990476,0.947619,0.876722,0.897934,0.939286,0.915041,0.963265,0.932653,...,0.864,0.725069,0.983333,0.957143,0.848,0.914286,0.819259,0.925247,0.973669,0.911111
Accuracy,0.841429,0.873077,0.967949,0.830769,0.80751,0.867984,0.872381,0.870563,0.885714,0.83619,...,0.76,0.649802,0.9,0.83956,0.77,0.816484,0.753801,0.826462,0.930154,0.820513
Precision,0.838095,0.916667,0.942857,0.873333,0.794643,0.885897,0.885,0.871667,0.889286,0.872778,...,0.76,0.669286,0.909524,0.865556,0.722611,0.81,0.781883,0.863258,0.903663,0.808333
Recall,0.861818,0.842857,1.0,0.842857,0.878788,0.839394,0.857143,0.885455,0.885714,0.785714,...,0.74,0.678788,0.9,0.82381,0.82,0.804762,0.722222,0.811538,0.969231,0.871429


In [191]:
# Scores of the land-use variables, in the order given by `land_use`.
land_use_cols = ["label_" + name for name in land_use]
res_land_use = res.loc[:, land_use_cols]

In [192]:
# Scores of the small-blocks variables, in the order given by `small_blocks`.
small_blocks_cols = ["label_" + name for name in small_blocks]
res_small_blocks = res.loc[:, small_blocks_cols]

In [193]:
# Scores of the building-age variables, in the order given by `age_buildings`.
age_buildings_cols = ["label_" + name for name in age_buildings]
res_age_buildings = res.loc[:, age_buildings_cols]

In [194]:
# Scores of the concentration variables, in the order given by `concentration`.
concentration_cols = ["label_" + name for name in concentration]
res_concentration = res.loc[:, concentration_cols]

In [195]:
# Scores of the vacuum variables, in the order given by `vacuums`.
vacuums_cols = ["label_" + name for name in vacuums]
res_vacuums = res.loc[:, vacuums_cols]

In [196]:
# Write one CSV per variable category into res_dir.
# An explicit name -> frame mapping replaces eval() on constructed variable
# names: a misspelt category now fails here, visibly, and no string is executed.
category_results = {
    "land_use": res_land_use,
    "small_blocks": res_small_blocks,
    "age_buildings": res_age_buildings,
    "concentration": res_concentration,
    "vacuums": res_vacuums,
}
for out_cat_name, out_cat_res in category_results.items():
    out_cat_res.to_csv(res_dir + "/res_" + out_cat_name + ".csv",
                       float_format='%.3f')

# LaTeX table of the AUC results for all cities

In [7]:
# Cities included in the per-city AUC tables (same list as in the configuration cell).
cities = ['milano', 'bologna', 'firenze', 'palermo', 'torino']

In [14]:
def predict_label_i_KFold_city(city_name, label):
    """Return the mean cross-validated AUC of one label for one city.

    The city's table is loaded with ``get_normalized_labels_features`` on every
    call. Rows whose label equals 1 are discarded; among the remaining rows,
    label 0 is the negative class and every other value the positive class.
    The classes are balanced by random under-sampling, then a 16-tree
    ``binary:logistic`` XGBoost model is evaluated with 5-fold stratified
    cross-validation.

    Parameters
    ----------
    city_name : str
        City whose feature/label file is loaded.
    label : str
        Name of the label column to predict.

    Returns
    -------
    float or None
        Mean ROC AUC over the five folds, or ``None`` (after printing the
        missing key) when ``label`` is not a column of the city's table.
    """
    data = get_normalized_labels_features(city_name=city_name)

    try:
        kept_rows = data[data[label] != 1]
    except KeyError as e:
        # The label is not available for this city; report it and let the
        # caller store None for this cell of the table.
        print(e)
        return None

    # 0 stays 0, everything else becomes 1.
    y = (kept_rows[label] != 0).astype(int).values

    if USE_GEO == "GEO":
        feature_cols = [c for c in kept_rows.columns if ("PCA" in c) or ("centroid" in c)]
    else:
        feature_cols = [c for c in kept_rows.columns if "PCA" in c]
    X = kept_rows[feature_cols].values

    rus = RandomUnderSampler(random_state=1)
    # fit_resample replaces fit_sample, which newer imbalanced-learn releases
    # no longer provide; fall back to the old name on old installations.
    resample = getattr(rus, "fit_resample", None) or rus.fit_sample
    X_resampled, y_resampled = resample(X, y)
    # Plain arrays so that the fold indices below are always positional.
    X_resampled = np.asarray(X_resampled)
    y_resampled = np.asarray(y_resampled)

    auc = []
    kf = StratifiedKFold(n_splits=5)
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        clf = xgb.XGBModel(objective='binary:logistic', n_estimators=16)
        clf.fit(X_train, y_train, verbose=False)

        # Only the AUC is returned, so the raw model output is scored directly.
        predictions = clf.predict(X_test)
        auc.append(roc_auc_score(y_test, predictions))

    return np.mean(auc)

In [16]:
# One LaTeX table per variable category: rows are the category's variables,
# columns are the cities, cells are the mean cross-validated AUC returned by
# predict_label_i_KFold_city (None when that function returned None).
# A single loop over (category name, variables) replaces five copies of the
# same code that differed only in the variable list and the file name.
auc_table_categories = [
    ("land_use", land_use),
    ("small_blocks", small_blocks),
    ("age_buildings", age_buildings),
    ("concentration", concentration),
    ("vacuums", vacuums),
]

tables_dir = "../../results/writing_tables/"
# to_latex does not create missing directories.
Path(tables_dir).mkdir(parents=True, exist_ok=True)

for category_name, category_vars in auc_table_categories:
    city_label_res = {}
    for city_name in cities:
        city_label_res[city_name] = {}
        for col in category_vars:
            label = "label_" + col
            res_cl = predict_label_i_KFold_city(city_name, label)
            city_label_res[city_name][col] = res_cl
    df_city_label_res = pd.DataFrame(city_label_res)
    df_city_label_res.to_latex(
        tables_dir + "AUC_per_" + category_name + "_var_per_city.tex",
        float_format="{:.2f}".format)

'label_bld_rat_area'
'label_bld_rat_area'
