# XGBoost

In [58]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import os
import xgboost as xgb
from pathlib import Path

In [39]:
CITY = "milano"
data_dir = "../../preprocessed/" 
features_dir = data_dir + "features/" + CITY + "/"
labels_dir = data_dir + "labels/" + CITY + "/"

PCA_components = 32

standardize_features = True

feature_size = 2048

network_type='VGG'

In [40]:
label_columns = ["hType_mix", "num_intersect", "bld_avg_age", "emp_rat_num",\
				"LUM5_single",	"RNR_nres", "mdist_smallparks", "nig_rat_daily",\
				"nig_rat_daily3", "mdist_nres_daily", "num_community_places", \
				"num_community_places_poi", "avg_block_area", "sphi", \
				"enterprises_empl_size", "pop_rat_num",  \
				"emp_rat_pop", "bld_rat_area", "den_nres_daily",\
				"mdist_parks", "den_nres_non-daily", "mdist_railways",\
				"mdist_highways", "mdist_water"]

In [86]:
land_use = [
"LUM5_single","RNR_nres","mdist_smallparks",
"hType_mix", "nig_rat_daily", "mdist_nres_daily",
"num_community_places", "num_community_places_poi"]


small_blocks = [
"avg_block_area","num_intersect", "sphi"]


age_buildings = [
"bld_avg_age","enterprises_empl_size"]

concentration = [
"pop_rat_num","emp_rat_num","emp_rat_pop",
"bld_rat_area","den_nres_daily","den_nres_non-daily"]

vacuums = [
"mdist_parks", "mdist_railways",
"mdist_highways", "mdist_water"]

## Functions

In [41]:
def get_labels_features():
    labels = pd.read_csv(labels_dir + "imagelet_labels_clean.csv")
    features = pd.read_csv(features_dir + "Resnet50/df_ResNet50_feat" + str(feature_size) + \
                "_pca"+str(PCA_components) +".csv")
    data = pd.merge(features,labels, on="name", how="inner")
    return data

In [42]:
def get_normalized_labels_features(network_type=network_type):
    if network_type == 'ResnNet':
        return pd.read_csv(features_dir + "Resnet50/df_ResNet50_feat" + str(feature_size) + \
                    "_pca"+ str(PCA_components) +"_labels_features.csv")
    else:
        return pd.read_csv(features_dir+"VGG16/df_VGG16_feat" + str(feature_size) + \
                    "_pca"+str(PCA_components) + "_labels_features.csv")

In [43]:
def predict_label_i(label="label_hType_mix"):
    
    data2 = data.copy()
    data2 = data2[data2[label] != 1 ]
    
    target = data2[["nameName", label]]
    target[label] = target[label].apply(lambda x: int(x) if x == 0 else 1)
    features = data2[[c for c in data.columns if "PCA" in c]]
    
    X = features.values
    y = target[label].values
    
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_sample(X, y)
    
    high = target[target[label] == 2]
    low = target[target[label] == 0]
    
    clf = xgb.XGBClassifier().fit(X_resampled, y_resampled)
    
    print(clf.score(X_resampled, y_resampled), "Low", len(low), "High", len(high), "NOW", len(X_resampled)/2)

In [44]:
def predict_label_i_KFold(label="label_hType_mix"):
    
    kf = StratifiedKFold(n_splits=5)
    
    
    data2 = data.copy()
    data3 = data2[data2[label] != 1 ].copy()
    
#     target = data2[["name", label]]
    target = data2[label].apply(lambda x: int(x) if x == 0 else 1)
    
    
    features = data2[[c for c in data.columns if "PCA" in c]]
    
    X = features.values
    y = target
    
    rus = RandomUnderSampler(random_state=1)
    X_resampled, y_resampled = rus.fit_sample(X, y)
    
    acc = []
    auc = []
    P = []
    R = []
    
#     print (y_resampled)
#     print (sum(y_resampled),len(y_resampled))
    

    kf.get_n_splits(X_resampled, y_resampled)
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
        param_dist = {'objective':'binary:logistic', 'n_estimators':16}
        clf = xgb.XGBModel(**param_dist)
        
        clf.fit(X_train, y_train,
#                 eval_metric='auc',
                verbose=False)
        
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions.round())
        precision=precision_score(y_test, predictions.round())
        recall=recall_score(y_test, predictions.round())
        roc=roc_auc_score(y_test,predictions)
        
#         print (predictions)
#         print (predictions.round())
#         print (y_test)
#         print (sum(y_test), len(y_test))
        
        acc.append(accuracy)
        auc.append(roc)
        P.append(precision)
        R.append(recall)
        
    return  ( {"Accuracy": (np.mean(acc), np.std(acc)),
            "Precision" : (np.mean(P), np.std(P)),
            "Recall": (np.mean(R), np.std(R)),
            "AUC": (np.mean(auc), np.std(auc))  },
            {"Accuracy": np.mean(acc),
            "Precision" : np.mean(P),
            "Recall": np.mean(R),
            "AUC": np.mean(auc) })

## Read in Data. Chose standardized or not.

In [45]:
if standardize_features:
    data = get_normalized_labels_features()
else:
    data = get_labels_features()
data.head()

Unnamed: 0,index,imageName,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,...,label_pop_rat_num,label_emp_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
0,0,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.513386,-0.352732,0.716721,0.75541,0.647604,-0.517665,0.458571,1.024984,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
1,1,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.773019,-0.161784,-0.752245,-0.568582,-0.140501,0.249631,-0.200838,-0.406981,...,1.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0
2,2,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.305169,0.001131,0.561944,2.024687,0.274702,0.433355,-0.114322,-0.317631,...,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0
3,3,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.011016,-0.778505,1.975514,-1.7861,-0.994235,1.492313,-0.930522,1.202427,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
4,4,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,0.414278,-0.994471,1.94381,-0.590098,-0.26674,0.700525,1.007925,-0.608827,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0


## Predict K-Fold

In [46]:
predict_label_i_KFold(label="label_hType_mix")

({'Accuracy': (0.6214285714285716, 0.10059277341941256),
  'Precision': (0.6232945485886663, 0.106421179676151),
  'Recall': (0.621904761904762, 0.12228363742487959),
  'AUC': (0.6748299319727892, 0.12940556541725984)},
 {'Accuracy': 0.6214285714285716,
  'Precision': 0.6232945485886663,
  'Recall': 0.621904761904762,
  'AUC': 0.6748299319727892})

In [47]:
kfold_SCORES = {}
kfold_SCORES2 = {}
for col in label_columns:
    label = "label_" + col
    (res1, res2) = predict_label_i_KFold(label)
    kfold_SCORES[label] = res1
    kfold_SCORES2[label] = res2

In [48]:
kfold_SCORES2

{'label_hType_mix': {'Accuracy': 0.6214285714285716,
  'Precision': 0.6232945485886663,
  'Recall': 0.621904761904762,
  'AUC': 0.6748299319727892},
 'label_num_intersect': {'Accuracy': 0.7275862068965517,
  'Precision': 0.7135700992555831,
  'Recall': 0.7586206896551724,
  'AUC': 0.8135552913198574},
 'label_bld_avg_age': {'Accuracy': 0.590625,
  'Precision': 0.5848931587072005,
  'Recall': 0.625,
  'AUC': 0.62734375},
 'label_emp_rat_num': {'Accuracy': 0.6488461538461539,
  'Precision': 0.6447463429816371,
  'Recall': 0.6700757575757577,
  'AUC': 0.7300366950757576},
 'label_LUM5_single': {'Accuracy': 0.7776666666666666,
  'Precision': 0.8364102564102565,
  'Recall': 0.7192307692307692,
  'AUC': 0.8641025641025643},
 'label_RNR_nres': {'Accuracy': 0.4840455840455841,
  'Precision': 0.4866666666666667,
  'Recall': 0.5208791208791209,
  'AUC': 0.49171597633136094},
 'label_mdist_smallparks': {'Accuracy': 0.6214078374455733,
  'Precision': 0.6277643046608564,
  'Recall': 0.6133903133903

In [49]:
res = pd.DataFrame(kfold_SCORES2)

In [50]:
res

Unnamed: 0,label_hType_mix,label_num_intersect,label_bld_avg_age,label_emp_rat_num,label_LUM5_single,label_RNR_nres,label_mdist_smallparks,label_nig_rat_daily,label_nig_rat_daily3,label_mdist_nres_daily,...,label_enterprises_empl_size,label_pop_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
AUC,0.67483,0.813555,0.627344,0.730037,0.864103,0.491716,0.630539,0.477686,0.767124,0.803792,...,0.503806,0.754816,0.568331,0.805801,0.846222,0.578547,0.798,0.524691,0.653952,0.645041
Accuracy,0.621429,0.727586,0.590625,0.648846,0.777667,0.484046,0.621408,0.504545,0.691667,0.726667,...,0.517647,0.696552,0.558597,0.748362,0.788588,0.576471,0.753333,0.532862,0.598106,0.5963
Precision,0.623295,0.71357,0.584893,0.644746,0.83641,0.486667,0.627764,0.505271,0.70094,0.730664,...,0.516328,0.697228,0.565324,0.732787,0.788135,0.572222,0.769531,0.533639,0.596627,0.602369
Recall,0.621905,0.758621,0.625,0.670076,0.719231,0.520879,0.61339,0.536364,0.665927,0.723011,...,0.576471,0.696552,0.515692,0.785747,0.792414,0.611765,0.72,0.544709,0.616176,0.579221


In [51]:
if standardize_features:
    out_name = '../../results/XGBoost/XGBoost'\
    +str(PCA_components)+ '_' + network_type + str(feature_size) + '_standardized7s.csv'
else:
    out_name = '../../results/XGBoost/XGBoost' +str(PCA_components)+\
    '_' + network_type + str(feature_size) + '7s.csv'
res.to_csv(out_name, float_format='%.3f')

In [59]:
res_dir = '../../results/XGBoost/' + network_type \
         + str(feature_size)+ '_' + str(PCA_components)
Path(res_dir).mkdir(parents=True, exist_ok=True)

In [62]:
res

Unnamed: 0,label_hType_mix,label_num_intersect,label_bld_avg_age,label_emp_rat_num,label_LUM5_single,label_RNR_nres,label_mdist_smallparks,label_nig_rat_daily,label_nig_rat_daily3,label_mdist_nres_daily,...,label_enterprises_empl_size,label_pop_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
AUC,0.67483,0.813555,0.627344,0.730037,0.864103,0.491716,0.630539,0.477686,0.767124,0.803792,...,0.503806,0.754816,0.568331,0.805801,0.846222,0.578547,0.798,0.524691,0.653952,0.645041
Accuracy,0.621429,0.727586,0.590625,0.648846,0.777667,0.484046,0.621408,0.504545,0.691667,0.726667,...,0.517647,0.696552,0.558597,0.748362,0.788588,0.576471,0.753333,0.532862,0.598106,0.5963
Precision,0.623295,0.71357,0.584893,0.644746,0.83641,0.486667,0.627764,0.505271,0.70094,0.730664,...,0.516328,0.697228,0.565324,0.732787,0.788135,0.572222,0.769531,0.533639,0.596627,0.602369
Recall,0.621905,0.758621,0.625,0.670076,0.719231,0.520879,0.61339,0.536364,0.665927,0.723011,...,0.576471,0.696552,0.515692,0.785747,0.792414,0.611765,0.72,0.544709,0.616176,0.579221


In [87]:
land_use_cols = ["label_"+l for l in land_use]
res_land_use = res[land_use_cols]

In [88]:
small_blocks_cols = ["label_"+l for l in small_blocks]
res_small_blocks = res[small_blocks_cols]

In [93]:
age_buildings_cols = ["label_"+l for l in age_buildings]
res_age_buildings = res[age_buildings_cols]

In [94]:
concentration_cols = ["label_"+l for l in concentration]
res_concentration = res[concentration_cols]

In [95]:
vacuums_cols = ["label_"+l for l in vacuums]
res_vacuums = res[vacuums_cols]

In [99]:
for out_cat_name in ["land_use", "small_blocks", \
                     "age_buildings", "concentration",
                    "vacuums"]:
    eval("res_" + out_cat_name).to_csv(res_dir + "/res_" + out_cat_name+\
                                       ".csv", float_format='%.3f')