# Logistic Regression

In [57]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [80]:
# Locations of the preprocessed inputs, relative to this notebook.
data_dir = "../../preprocessed/"
features_dir = f"{data_dir}features/features_all/"
labels_dir = f"{data_dir}labels/"

# Names of the PCA feature columns: PCA0 ... PCA<PCA_components - 1>.
PCA_components = 32
features_columns = [f"PCA{idx}" for idx in range(PCA_components)]

# Selects which feature CSV read_labels_features() loads ("vgg19" or "resnet50").
network_type = "resnet50"

In [81]:
# Target names without the "label_" prefix; the prefix is added where the
# columns are looked up. "bld_rat_area" is intentionally left out of the list.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]  # , "bld_rat_area"

## Functions

In [82]:
def read_labels_features(city_name = None):
    """Load the combined PCA-feature / label table for the configured network.

    The CSV is chosen by the module-level ``network_type`` and read from
    ``features_dir``. A ``city_image`` column of the form
    ``"<city>_<imageName>"`` is added, and the ``imageName``, ``city`` and
    ``index`` columns are dropped.

    Parameters
    ----------
    city_name : str, optional
        When given, only rows whose ``city`` column equals this value are kept.
        ``None`` (the default) keeps every row.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the ``city_image`` column added.

    Raises
    ------
    ValueError
        If ``network_type`` is neither ``"vgg19"`` nor ``"resnet50"``.
    KeyError
        If the CSV lacks one of the ``city``, ``imageName`` or ``index`` columns.
    """
    feature_files = {
        "vgg19": "Italy_6_cities_vgg19_pca32_linear_fc_thirdlast_layer_labels_features.csv",
        "resnet50": "Italy_6_cities_resnet_pca32_second_last_layer_labels_features.csv",
    }
    # Fail with a clear message instead of an UnboundLocalError further down.
    if network_type not in feature_files:
        raise ValueError(
            "Unsupported network_type %r; expected one of %s"
            % (network_type, sorted(feature_files))
        )

    df = pd.read_csv(features_dir + feature_files[network_type])

    if city_name is not None:
        # Copy the filtered rows so the column assignment below does not write
        # into a slice of the original frame (SettingWithCopyWarning).
        df = df[df["city"] == city_name].copy()

    # Vectorised concatenation; unlike a row-wise apply it also works when the
    # city filter leaves no rows.
    df["city_image"] = df["city"] + "_" + df["imageName"]

    return df.drop(columns=["imageName", "city", "index"])

In [83]:
def predict_label_i(label="label_hType_mix"):
    """Fit a logistic regression on a balanced sample and print its accuracy.

    Rows of the module-level ``data`` frame whose ``label`` value is 1 are
    discarded, leaving the classes the report calls "Low" (0) and "High" (2).
    The remaining rows are randomly under-sampled so both classes are equally
    represented, a logistic regression is fit on that balanced sample, and its
    accuracy is printed.

    The accuracy is measured on all filtered rows, which include the rows the
    model was fit on, so it is not a held-out estimate.

    Parameters
    ----------
    label : str
        Name of the target column in ``data``.

    Returns
    -------
    None
        Prints: accuracy, "Low" count, "High" count and, after "NOW", half the
        number of resampled rows.
    """
    # Drop the middle class; a boolean filter already yields a new frame.
    subset = data[data[label] != 1]

    X = subset[features_columns].values
    y = subset[label].values

    rus = RandomUnderSampler(random_state=0)
    # fit_resample is the supported API; fit_sample was removed from
    # imbalanced-learn in 0.8.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    n_high = int((y == 2).sum())
    n_low = int((y == 0).sum())

    clf = LogisticRegression(random_state=0,solver='lbfgs').fit(X_resampled, y_resampled)
    print(clf.score(X, y), "Low", n_low, "High", n_high, "NOW", len(X_resampled)/2)

In [84]:
def predict_label_i_KFold(label="label_hType_mix"):
    """Estimate logistic-regression accuracy with stratified 5-fold CV.

    Rows of the module-level ``data`` frame whose ``label`` value is 1 are
    discarded, and the remaining rows are randomly under-sampled to balance
    the classes. A fresh logistic regression is then fit on each training
    split of a 5-fold ``StratifiedKFold`` and scored on the matching test split.

    Parameters
    ----------
    label : str
        Name of the target column in ``data``.

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-fold accuracy.
    """
    kf = StratifiedKFold(n_splits=5)

    # Drop the middle class; a boolean filter already yields a new frame.
    subset = data[data[label] != 1]

    X = subset[features_columns].values
    y = subset[label].values

    rus = RandomUnderSampler(random_state=777)
    # fit_resample is the supported API; fit_sample was removed from
    # imbalanced-learn in 0.8.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    fold_scores = []
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        clf = LogisticRegression(random_state=0,solver='lbfgs').fit(X_train, y_train)
        fold_scores.append(clf.score(X_test, y_test))

    return np.mean(fold_scores), np.std(fold_scores)

## Read in Data. Choose standardized or not.

In [85]:
# Load the feature/label table without a city filter and show its row count.
data = read_labels_features()
len(data)

1073

In [86]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density,city_image
0,-0.381931,-0.10974,0.968183,-0.421974,-0.226458,0.770286,-1.064367,-0.202837,-0.51991,0.436571,...,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
1,-0.208971,3.18608,-1.060092,2.806268,-0.589476,2.205192,0.212444,-0.374578,1.173077,-0.866599,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
2,-0.81148,0.160779,-0.639456,-0.99987,-0.231792,-0.076675,-0.140094,1.003483,-0.495559,-0.355126,...,0.0,2.0,2.0,0.0,2.0,1.0,1.0,2.0,2.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
3,1.09312,-0.551,1.219836,-0.359877,-0.724334,-0.959216,0.094147,0.723556,1.278228,-0.761471,...,1.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
4,1.737391,-2.177757,1.52776,-0.936889,-1.160909,-0.359742,-0.102321,-1.317503,-1.770631,-0.618591,...,0.0,2.0,1.0,1.0,2.0,1.0,0.0,1.0,2.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...


## Predict on Train

In [87]:
# Run once with the default target column ("label_hType_mix").
predict_label_i()

0.7069199457259159 Low 247 High 490 NOW 247.0


In [88]:
# Print the column name, then the accuracy report, for every target.
for col in label_columns:
    label = f"label_{col}"
    print(label)
    predict_label_i(label)

label_hType_mix
0.7069199457259159 Low 247 High 490 NOW 247.0
label_num_intersect
0.8473282442748091 Low 592 High 194 NOW 194.0
label_bld_avg_age
0.7645390070921986 Low 466 High 239 NOW 239.0
label_emp_rat_num
0.8083109919571045 Low 529 High 217 NOW 217.0
label_LUM5_single
0.6602475928473177 Low 319 High 408 NOW 319.0
label_RNR_nres
0.6413612565445026 Low 362 High 402 NOW 362.0
label_mdist_smallparks
0.6950261780104712 Low 533 High 231 NOW 231.0
label_nig_rat_daily
0.6706989247311828 Low 357 High 387 NOW 357.0
label_nig_rat_daily3
0.7971830985915493 Low 496 High 214 NOW 214.0
label_mdist_nres_daily
0.8145859085290482 Low 611 High 198 NOW 198.0
label_num_community_places
0.820480404551201 Low 592 High 199 NOW 199.0
label_num_community_places_poi
0.6019417475728155 Low 410 High 311 NOW 311.0
label_avg_block_area
0.8128205128205128 Low 213 High 567 NOW 213.0
label_sphi
0.7182910547396528 Low 503 High 246 NOW 246.0
label_enterprises_empl_size
0.6830530401034929 Low 306 High 467 NOW 306.0
l

## Predict K-Fold

In [89]:
# Cross-validated accuracy (mean, std over the folds) for a single target column.
predict_label_i_KFold(label="label_hType_mix")

(0.6780045351473923, 0.04731942062430861)

In [90]:
# Collect the cross-validated (mean, std) accuracy of every target,
# keyed by its full column name.
kfold_SCORES = dict()
for col in label_columns:
    label = f"label_{col}"
    kfold_SCORES.update({label: predict_label_i_KFold(label)})

In [91]:
# Show the (mean, std) fold accuracy collected for each target column.
kfold_SCORES

{'label_hType_mix': (0.6780045351473923, 0.04731942062430861),
 'label_num_intersect': (0.8119214119214119, 0.03652523378055328),
 'label_bld_avg_age': (0.7176315789473684, 0.041270648040203535),
 'label_emp_rat_num': (0.7626035819299652, 0.04065162878274789),
 'label_LUM5_single': (0.5783956692913386, 0.07482352999211316),
 'label_RNR_nres': (0.5912931034482758, 0.06827259657812657),
 'label_mdist_smallparks': (0.6538569424964937, 0.05174227979880904),
 'label_nig_rat_daily': (0.6203880626415839, 0.040274406853132376),
 'label_nig_rat_daily3': (0.7263474692202463, 0.06350443899416124),
 'label_mdist_nres_daily': (0.7503164556962025, 0.06916131819612653),
 'label_num_community_places': (0.7837974683544304, 0.034731585733860856),
 'label_num_community_places_poi': (0.5531483870967742, 0.04165553005524169),
 'label_avg_block_area': (0.7818057455540355, 0.036793158588860786),
 'label_sphi': (0.6584827870542156, 0.04785262116841759),
 'label_enterprises_empl_size': (0.6242303078768493, 0.0

In [92]:
# NOTE(review): this binds a second name to the same dict (no copy is made).
# StandardScaler is imported but not applied in the visible cells, so confirm
# that these scores really come from standardized features.
STANDARDIZED_res = kfold_SCORES

In [93]:
# Display the scores stored under the "standardized" name.
STANDARDIZED_res

{'label_hType_mix': (0.6780045351473923, 0.04731942062430861),
 'label_num_intersect': (0.8119214119214119, 0.03652523378055328),
 'label_bld_avg_age': (0.7176315789473684, 0.041270648040203535),
 'label_emp_rat_num': (0.7626035819299652, 0.04065162878274789),
 'label_LUM5_single': (0.5783956692913386, 0.07482352999211316),
 'label_RNR_nres': (0.5912931034482758, 0.06827259657812657),
 'label_mdist_smallparks': (0.6538569424964937, 0.05174227979880904),
 'label_nig_rat_daily': (0.6203880626415839, 0.040274406853132376),
 'label_nig_rat_daily3': (0.7263474692202463, 0.06350443899416124),
 'label_mdist_nres_daily': (0.7503164556962025, 0.06916131819612653),
 'label_num_community_places': (0.7837974683544304, 0.034731585733860856),
 'label_num_community_places_poi': (0.5531483870967742, 0.04165553005524169),
 'label_avg_block_area': (0.7818057455540355, 0.036793158588860786),
 'label_sphi': (0.6584827870542156, 0.04785262116841759),
 'label_enterprises_empl_size': (0.6242303078768493, 0.0