# XGBoost ALL

In [1]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from pathlib import Path
import xgboost as xgb

In [2]:
# --- Run configuration ------------------------------------------------------
# Input locations (relative to this notebook).
data_dir = "../../preprocessed/"
features_dir = data_dir + "features/features_all/"
labels_dir = data_dir + "labels/"

# Only read when naming the output CSV in the cells shown in this notebook.
standardize_features = True

# Number of PCA feature columns ("PCA0" .. "PCA<n-1>") to feed the model.
PCA_components = 64

# "GEO" appends the centroid coordinates to the PCA features; any other
# value uses the PCA features alone.
USE_GEO = "GEO"

# Selects which feature CSV get_normalized_labels_features() loads.
network_type = "vgg16_4096"

features_columns = ["PCA" + str(i) for i in range(PCA_components)]
if USE_GEO == "GEO":
    features_columns += ["centroid_x", "centroid_y"]

In [3]:
# Indicator names to evaluate; the matching data columns carry a "label_"
# prefix. "bld_rat_area" is intentionally left out.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [4]:
# Thematic groups of indicators; used further down to split the result
# table into one CSV per group.
land_use = [
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "hType_mix",
    "nig_rat_daily",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
]

small_blocks = ["avg_block_area", "num_intersect", "sphi"]

age_buildings = ["bld_avg_age", "enterprises_empl_size"]

concentration = [
    "pop_rat_num",
    "emp_rat_num",
    "emp_rat_pop",
    "den_nres_daily",
    "den_nres_non-daily",
]

vacuums = [
    "mdist_parks",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
]



## Functions

In [5]:
def get_normalized_labels_features():
    """Load the labels+features CSV for the configured network.

    The file is chosen from the module-level ``network_type`` and
    ``PCA_components`` settings and read from ``features_dir``.

    Returns:
        pandas.DataFrame: the CSV contents with an added ``city_image``
        column (``"<city>_<imageName>"``) and with the ``imageName``,
        ``city`` and ``index`` columns removed.

    Raises:
        ValueError: if ``network_type`` is not one of the supported names
            (previously this surfaced as an unbound-variable error).
        KeyError: if one of the columns to drop is missing from the CSV.
    """
    n_pca = str(PCA_components)
    file_names = {
        "vgg19": "Italy_6_cities_vgg19_pca" + n_pca
                 + "_linear_fc_thirdlast_layer_labels_features.csv",
        "resnet50": "Italy_6_cities_resnet_pca" + n_pca
                    + "_second_last_layer_labels_features.csv",
        # NOTE(review): the vgg16 file name carries a "resnet_pca" prefix;
        # kept exactly as it was -- confirm it matches the file on disk.
        "vgg16_4096": "Italy_6_cities_resnet_pca" + n_pca
                      + "_vgg16_4096_labels_features.csv",
    }
    if network_type not in file_names:
        raise ValueError(
            "Unsupported network_type %r; expected one of %s"
            % (network_type, sorted(file_names))
        )

    df = pd.read_csv(features_dir + file_names[network_type])

    # Vectorised string concatenation instead of a row-wise apply.
    df["city_image"] = df["city"] + "_" + df["imageName"]

    return df.drop(columns=["imageName", "city", "index"])

In [6]:
def predict_label_i(label="label_hType_mix"):
    """Fit an XGBoost classifier for one label and print its training score.

    Rows whose label equals 1 are discarded; of the remaining rows, label 0
    becomes class 0 and everything else becomes class 1. The classes are
    balanced by random under-sampling before fitting.

    Reads the module-level ``data`` frame and ``features_columns`` list.

    Args:
        label: name of the label column in ``data``.

    Returns:
        None. Prints the accuracy on the (resampled) training data itself,
        the number of rows with label 0 ("Low") and label 2 ("High") before
        resampling, and the per-class sample count after resampling ("NOW").
        The printed score is therefore a fit on seen data, not a held-out
        estimate.
    """
    # Boolean indexing already yields a new frame, so `data` is not modified.
    subset = data[data[label] != 1]

    # Count the classes on the raw label values, *before* binarising.
    # Counting label == 2 after the 0/1 mapping always gave 0.
    n_low = int((subset[label] == 0).sum())
    n_high = int((subset[label] == 2).sum())

    X = subset[features_columns].values
    y = (subset[label] != 0).astype(int).values

    rus = RandomUnderSampler(random_state=0)
    # fit_resample replaces fit_sample, which newer imbalanced-learn removed.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    clf = xgb.XGBClassifier().fit(X_resampled, y_resampled)

    print(clf.score(X_resampled, y_resampled), "Low", n_low, "High", n_high, "NOW", len(X_resampled)/2)

In [7]:
def predict_label_i_KFold(label="label_hType_mix"):
    """Cross-validate an XGBoost model for one label on class-balanced data.

    Rows whose label equals 1 are discarded; of the remaining rows, label 0
    becomes class 0 and everything else becomes class 1. The classes are
    balanced by random under-sampling, then scored with 5-fold stratified
    cross-validation.

    Reads the module-level ``data`` frame and ``features_columns`` list.

    Args:
        label: name of the label column in ``data``.

    Returns:
        tuple(dict, dict): two dicts keyed by "Accuracy", "Precision",
        "Recall" and "AUC". The first maps each metric to
        ``(mean, std)`` over the folds, the second to the mean only.
    """
    # Boolean indexing already yields a new frame, so `data` is not modified.
    subset = data[data[label] != 1]

    X = subset[features_columns].values
    # Plain ndarray on purpose: the fold indices below are positional, and a
    # pandas Series would be indexed by label instead.
    y = (subset[label] != 0).astype(int).values

    rus = RandomUnderSampler(random_state=1)
    # fit_resample replaces fit_sample, which newer imbalanced-learn removed.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    kf = StratifiedKFold(n_splits=5)
    param_dist = {'objective':'binary:logistic', 'n_estimators':16}

    acc = []
    auc = []
    P = []
    R = []

    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        clf = xgb.XGBModel(**param_dist)
        clf.fit(X_train, y_train, verbose=False)

        # The predictions are used as continuous scores for AUC and rounded
        # to hard 0/1 labels for the threshold-based metrics.
        scores = clf.predict(X_test)
        hard_labels = scores.round()

        acc.append(accuracy_score(y_test, hard_labels))
        P.append(precision_score(y_test, hard_labels))
        R.append(recall_score(y_test, hard_labels))
        auc.append(roc_auc_score(y_test, scores))

    return  ( {"Accuracy": (np.mean(acc), np.std(acc)),
            "Precision" : (np.mean(P), np.std(P)),
            "Recall": (np.mean(R), np.std(R)),
            "AUC": (np.mean(auc), np.std(auc))  },
            {"Accuracy": np.mean(acc),
            "Precision" : np.mean(P),
            "Recall": np.mean(R),
            "AUC": np.mean(auc) })

## Read in data. Choose standardized or not.

In [8]:
# Load the labels+features table once; predict_label_i and
# predict_label_i_KFold read this module-level `data` frame.
data = get_normalized_labels_features()

## Predict K-Fold

In [9]:
# Try the cross-validated evaluation on a single label before looping over
# all of them; the result is a ((mean, std) dict, mean-only dict) pair.
predict_label_i_KFold(label="label_hType_mix")

({'Accuracy': (0.688126159554731, 0.03768520729574099),
  'Precision': (0.6788409361637113, 0.04917816525830776),
  'Recall': (0.7290612244897959, 0.0705432717285811),
  'AUC': (0.7396151603498542, 0.05626567283763688)},
 {'Accuracy': 0.688126159554731,
  'Precision': 0.6788409361637113,
  'Recall': 0.7290612244897959,
  'AUC': 0.7396151603498542})

In [10]:
# Evaluate every indicator and collect both result flavours, keyed by the
# "label_"-prefixed column name.
kfold_SCORES = {}   # label -> {metric: (mean, std)}
kfold_SCORES2 = {}  # label -> {metric: mean}
for col in label_columns:
    label = "label_" + col
    scores_with_std, scores_mean = predict_label_i_KFold(label)
    kfold_SCORES[label] = scores_with_std
    kfold_SCORES2[label] = scores_mean

In [11]:
# Display the mean-only scores per label.
kfold_SCORES2

{'label_hType_mix': {'Accuracy': 0.688126159554731,
  'Precision': 0.6788409361637113,
  'Recall': 0.7290612244897959,
  'AUC': 0.7396151603498542},
 'label_num_intersect': {'Accuracy': 0.8631701631701633,
  'Precision': 0.8760989010989011,
  'Recall': 0.8448043184885291,
  'AUC': 0.9393231599709331},
 'label_bld_avg_age': {'Accuracy': 0.8219298245614036,
  'Precision': 0.8269358974358975,
  'Recall': 0.8078014184397164,
  'AUC': 0.8795970005910165},
 'label_emp_rat_num': {'Accuracy': 0.8663458968190323,
  'Precision': 0.8928442971756926,
  'Recall': 0.8345665961945032,
  'AUC': 0.9386769261025616},
 'label_LUM5_single': {'Accuracy': 0.5861589566929133,
  'Precision': 0.5804067687133644,
  'Recall': 0.6393353174603175,
  'AUC': 0.6272716703869048},
 'label_RNR_nres': {'Accuracy': 0.6160153256704981,
  'Precision': 0.6112233700084333,
  'Recall': 0.6821537290715373,
  'AUC': 0.6645421106037543},
 'label_mdist_smallparks': {'Accuracy': 0.681790556334736,
  'Precision': 0.6806577480490524

In [12]:
# Dict-of-dicts -> table: one column per label, one row per metric.
res = pd.DataFrame(kfold_SCORES2)

In [13]:
# Display the metric-by-label result table.
res

Unnamed: 0,label_hType_mix,label_num_intersect,label_bld_avg_age,label_emp_rat_num,label_LUM5_single,label_RNR_nres,label_mdist_smallparks,label_nig_rat_daily,label_nig_rat_daily3,label_mdist_nres_daily,...,label_enterprises_empl_size,label_pop_rat_num,label_emp_rat_pop,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density
AUC,0.739615,0.939323,0.879597,0.938677,0.627272,0.664542,0.755798,0.714207,0.948173,0.92141,...,0.707001,0.841716,0.714745,0.913712,0.570972,0.934537,0.727704,0.787895,0.747421,0.954816
Accuracy,0.688126,0.86317,0.82193,0.866346,0.586159,0.616015,0.681791,0.661016,0.904022,0.830949,...,0.646995,0.761412,0.645004,0.852632,0.550725,0.874493,0.66342,0.718553,0.667179,0.877867
Precision,0.678841,0.876099,0.826936,0.892844,0.580407,0.611223,0.680658,0.66838,0.913488,0.856673,...,0.645549,0.775334,0.668783,0.863439,0.554896,0.876696,0.673781,0.695874,0.669575,0.910286
Recall,0.729061,0.844804,0.807801,0.834567,0.639335,0.682154,0.688529,0.66115,0.891805,0.798462,...,0.670069,0.738161,0.580278,0.836842,0.524638,0.879791,0.637377,0.782986,0.671779,0.839112


In [14]:
# Write the full metric-by-label table to a CSV named after the run
# configuration.
# NOTE(review): in the cells shown here `standardize_features` only changes
# this file name; no cell visibly standardizes the features -- confirm the
# input CSV really matches the flag.
out_name = ('../../results/XGBoost/ALL_XGBoost' + str(PCA_components)
            + '_' + network_type + '_' + USE_GEO
            + ('_standardized7s.csv' if standardize_features else '7s.csv'))

# to_csv does not create missing folders, so make sure the target exists
# (this cell runs before any other cell creates the results directory).
Path(out_name).parent.mkdir(parents=True, exist_ok=True)
res.to_csv(out_name, float_format='%.3f')

In [15]:
# Per-configuration output folder for the per-group CSVs. Kept as a plain
# string because a later cell builds file paths from it by concatenation.
res_dir = '../../results/XGBoost/ALL_' + network_type + '_' + USE_GEO +"_" + str(PCA_components)
# parents/exist_ok: create intermediate folders and tolerate re-running.
Path(res_dir).mkdir(parents=True, exist_ok=True)

In [16]:
# Result columns carry a "label_" prefix; pick out the land-use group.
land_use_cols = ["label_" + name for name in land_use]
res_land_use = res.loc[:, land_use_cols]

In [17]:
# Result columns carry a "label_" prefix; pick out the small-blocks group.
small_blocks_cols = ["label_" + name for name in small_blocks]
res_small_blocks = res.loc[:, small_blocks_cols]

In [18]:
# Result columns carry a "label_" prefix; pick out the building-age group.
age_buildings_cols = ["label_" + name for name in age_buildings]
res_age_buildings = res.loc[:, age_buildings_cols]

In [19]:
# Result columns carry a "label_" prefix; pick out the concentration group.
concentration_cols = ["label_" + name for name in concentration]
res_concentration = res.loc[:, concentration_cols]

In [20]:
# Result columns carry a "label_" prefix; pick out the vacuums group.
vacuums_cols = ["label_" + name for name in vacuums]
res_vacuums = res.loc[:, vacuums_cols]

In [21]:
# Write one CSV per indicator group into `res_dir`.
# The groups are looked up through an explicit (name, members) table rather
# than eval() on a constructed variable name, so this cell depends only on
# `res`, `res_dir` and the group lists themselves.
for out_cat_name, group_members in [
        ("land_use", land_use),
        ("small_blocks", small_blocks),
        ("age_buildings", age_buildings),
        ("concentration", concentration),
        ("vacuums", vacuums)]:
    # Result columns are the indicator names with a "label_" prefix.
    group_cols = ["label_" + name for name in group_members]
    res[group_cols].to_csv(res_dir + "/res_" + out_cat_name + ".csv",
                           float_format='%.3f')