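# BASELINE

Single-feature baseline: the binarized `label_pop_rat_num` column is used directly as the prediction for a binarized target label (default `label_activity_density`), and is scored with accuracy, precision, recall and AUC over stratified 5-fold splits for each city.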

In [49]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import os
from pathlib import Path
import xgboost as xgb

In [120]:
# ---- Which data to evaluate ------------------------------------------
# City used by default when a function is called without an explicit city.
CITY = "torino"
# Every city the sweep iterates over; "all" selects the pooled CSV.
cities = ["milano", "bologna", "firenze", "palermo", "torino", "all"]

# ---- Feature-extraction settings -------------------------------------
# Selects which feature file name is built in the features_file cell.
network_type = "vgg16_4096"
# Number of PCA components; it is embedded in the feature file names.
PCA_components = 64
USE_GEO = "GEO"
standardize_features = True

# ---- Input locations (relative to this notebook) ---------------------
data_dir = "../../preprocessed/"
labels_dir = data_dir + "labels/"
features_dir = data_dir + "features/features_all/"
separate_features_dir = data_dir + "features/features_separate_cities/"

In [121]:
# Base names of the 25 label columns (order preserved from the original list).
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [122]:
# Thematic groupings of label names; every entry also appears in label_columns.
land_use = [
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "hType_mix",
    "nig_rat_daily",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
]

small_blocks = ["avg_block_area", "num_intersect", "sphi"]

age_buildings = ["bld_avg_age", "enterprises_empl_size"]

concentration = [
    "pop_rat_num",
    "emp_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "den_nres_non-daily",
]

vacuums = ["mdist_parks", "mdist_railways", "mdist_highways", "mdist_water"]

## Functions

In [118]:
# Build the name of the PCA-reduced feature CSV for the configured network.
# NOTE(review): the vgg16_4096 file name contains "resnet"; it is kept as-is
# because the same name is used for the pooled file in
# get_normalized_labels_features -- confirm against the files on disk.
if network_type == "vgg19":
    features_file = "Italy_6_cities_vgg19_pca" + str(PCA_components) + "_linear_fc_thirdlast_layer.csv"
elif network_type == "resnet50":
    features_file = "Italy_6_cities_resnet_pca" + str(PCA_components) + "_second_last_layer.csv"
elif network_type == "vgg16_4096":
    features_file = "Italy_6_cities_resnet_pca" + str(PCA_components) + "_vgg16_4096.csv"
else:
    # Without this branch an unknown value would leave features_file undefined
    # (or silently stale from an earlier run of this cell).
    raise ValueError(
        "Unsupported network_type {!r}; expected 'vgg19', 'resnet50' or 'vgg16_4096'".format(network_type)
    )

In [124]:
def get_normalized_labels_features(city_name=CITY):
    """Load the combined labels/features table for one city or for all cities.

    Parameters
    ----------
    city_name : str
        City whose CSV is read from ``separate_features_dir``. The special
        value ``"all"`` reads the pooled CSV from ``features_dir`` instead.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with a ``city_image`` column (``"<city>_<imageName>"``)
        added and the ``imageName``, ``city`` and ``index`` columns removed.
        No normalisation is applied here despite the function name.

    Raises
    ------
    KeyError
        If the CSV lacks one of the ``city``, ``imageName`` or ``index`` columns.
    """
    if city_name == "all":
        # NOTE(review): the pooled file name is hard-coded to the vgg16_4096
        # variant and ignores network_type -- confirm this is intended.
        csv_path = features_dir + "Italy_6_cities_resnet_pca" + str(PCA_components) + "_vgg16_4096_labels_features.csv"
    else:
        csv_path = separate_features_dir + \
            features_file.replace(".csv", "_" + city_name + "_labels_features.csv")

    df = pd.read_csv(csv_path)

    # Column-wise concatenation gives the same strings as a row-wise apply,
    # but is much faster and also works when the frame has no rows.
    df["city_image"] = df["city"] + "_" + df["imageName"]

    # The source columns are no longer needed once city_image exists.
    return df.drop(columns=["imageName", "city", "index"])

In [125]:
def baseline(city_name=CITY, label="label_activity_density", predictor="label_pop_rat_num"):
    """Score a single-column baseline that predicts ``label`` from ``predictor``.

    No model is fitted: the binarized ``predictor`` column is used directly as
    the prediction for the binarized ``label`` column, and the scores are
    computed on the test part of each stratified fold.

    Parameters
    ----------
    city_name : str
        City passed to ``get_normalized_labels_features`` (``"all"`` for the
        pooled table).
    label : str
        Target column. Rows where it equals 1 are discarded; of the remaining
        rows, 0 stays 0 and every other value becomes 1.
    predictor : str
        Column used as the prediction, binarized the same way (0 stays 0,
        anything else becomes 1). Rows are filtered on ``label`` only, so
        predictor values equal to 1 are kept and mapped to 1.

    Returns
    -------
    tuple of (dict, dict)
        First dict maps ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"AUC"`` to ``(mean, std)`` over the folds; the second maps the same
        keys to the mean only.
    """
    data = get_normalized_labels_features(city_name)

    # Drop rows whose target equals 1 so that only two groups remain.
    kept = data[data[label] != 1]

    # Plain NumPy arrays: positional fold indices are then always valid, even
    # with imbalanced-learn versions that hand back pandas objects.
    y = (kept[label] != 0).astype(int).values
    X = (kept[predictor] != 0).astype(int).values.reshape(-1, 1)

    # Balance the two classes. fit_resample replaces fit_sample, which has
    # been removed from imbalanced-learn.
    rus = RandomUnderSampler(random_state=1)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    X_resampled = np.asarray(X_resampled)
    y_resampled = np.asarray(y_resampled)

    kf = StratifiedKFold(n_splits=5)

    acc = []
    auc = []
    P = []
    R = []

    # Only the test indices matter because nothing is trained.
    for _, test_index in kf.split(X_resampled, y_resampled):
        y_test = y_resampled[test_index]
        # Flatten the (n, 1) feature column into the 1-D shape metrics expect;
        # values are already 0/1 so no rounding is needed.
        predictions = X_resampled[test_index].ravel()

        acc.append(accuracy_score(y_test, predictions))
        P.append(precision_score(y_test, predictions))
        R.append(recall_score(y_test, predictions))
        auc.append(roc_auc_score(y_test, predictions))

    per_metric = {"Accuracy": acc, "Precision": P, "Recall": R, "AUC": auc}
    with_std = {name: (np.mean(vals), np.std(vals)) for name, vals in per_metric.items()}
    mean_only = {name: np.mean(vals) for name, vals in per_metric.items()}
    return (with_std, mean_only)

In [126]:
# Run the baseline once for the configured CITY and show both result dicts.
baseline(CITY, label="label_activity_density")

({'Accuracy': (0.8918128654970762, 0.05935024307071474),
  'Precision': (0.9127777777777778, 0.07737195714912747),
  'Recall': (0.8711111111111111, 0.07875575620741931),
  'AUC': (0.8933333333333333, 0.05904591224705078)},
 {'Accuracy': 0.8918128654970762,
  'Precision': 0.9127777777777778,
  'Recall': 0.8711111111111111,
  'AUC': 0.8933333333333333})

## Calculated Baselines

In [133]:
# Single-city run for Palermo; the loop further down repeats this call for every entry in `cities`.
baseline("palermo", label="label_activity_density")

({'Accuracy': (0.9346153846153846, 0.03278644064754981),
  'Precision': (0.8892857142857142, 0.05578749768504755),
  'Recall': (1.0, 0.0),
  'AUC': (0.9333333333333333, 0.033333333333333305)},
 {'Accuracy': 0.9346153846153846,
  'Precision': 0.8892857142857142,
  'Recall': 1.0,
  'AUC': 0.9333333333333333})

In [128]:
# Per-city results: kfold_SCORES holds (mean, std) pairs, kfold_SCORES2 means only.
kfold_SCORES, kfold_SCORES2 = {}, {}
for city in cities:
    # baseline returns the two dicts in exactly this order.
    kfold_SCORES[city], kfold_SCORES2[city] = baseline(city, label="label_activity_density")

In [129]:
# Show the mean-only scores collected for each city.
kfold_SCORES2

{'milano': {'Accuracy': 0.8529100529100528,
  'Precision': 0.8183823529411764,
  'Recall': 0.9274725274725274,
  'AUC': 0.8521978021978022},
 'bologna': {'Accuracy': 0.7833333333333333,
  'Precision': 0.8666666666666666,
  'Recall': 0.6666666666666667,
  'AUC': 0.7833333333333334},
 'firenze': {'Accuracy': 0.6904411764705882,
  'Precision': 0.6783116883116883,
  'Recall': 0.7305555555555555,
  'AUC': 0.6888888888888889},
 'palermo': {'Accuracy': 0.9346153846153846,
  'Precision': 0.8892857142857142,
  'Recall': 1.0,
  'AUC': 0.9333333333333333},
 'torino': {'Accuracy': 0.8918128654970762,
  'Precision': 0.9127777777777778,
  'Recall': 0.8711111111111111,
  'AUC': 0.8933333333333333},
 'all': {'Accuracy': 0.8269981288425555,
  'Precision': 0.8095988897231134,
  'Recall': 0.8526427061310782,
  'AUC': 0.827061310782241}}

In [130]:
# Tabulate the means: one column per city, one row per metric.
res = pd.DataFrame(kfold_SCORES2)

In [131]:
# Display the summary table.
res

Unnamed: 0,milano,bologna,firenze,palermo,torino,all
AUC,0.852198,0.783333,0.688889,0.933333,0.893333,0.827061
Accuracy,0.85291,0.783333,0.690441,0.934615,0.891813,0.826998
Precision,0.818382,0.866667,0.678312,0.889286,0.912778,0.809599
Recall,0.927473,0.666667,0.730556,1.0,0.871111,0.852643


In [132]:
out_name = '../../results/baseline/vitality_pop_density.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
Path(out_name).parent.mkdir(parents=True, exist_ok=True)

# Metrics are written with three decimals.
res.to_csv(out_name, float_format='%.3f')