# Logistic Regression

In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [23]:
CITY = "milano"
data_dir = "../../preprocessed/" 
features_dir = data_dir + "features/" + CITY + "/"
labels_dir = data_dir + "labels/" + CITY + "/"

PCA_components = 32

standardize_features = True

feature_size = 2048

network_type='VGG'

In [24]:
label_columns = ["hType_mix", "num_intersect", "bld_avg_age", "emp_rat_num",\
				"LUM5_single",	"RNR_nres", "mdist_smallparks", "nig_rat_daily",\
				"nig_rat_daily3", "mdist_nres_daily", "num_community_places", \
				"num_community_places_poi", "avg_block_area", "sphi", \
				"enterprises_empl_size", "pop_rat_num",  \
				"emp_rat_pop",  "den_nres_daily",\
				"mdist_parks", "den_nres_non-daily", "mdist_railways",\
				"mdist_highways", "mdist_water", "activity_density"]

## Functions

In [25]:
def get_labels_features():
    labels = pd.read_csv(labels_dir + "imagelet_labels_clean.csv")
    features = pd.read_csv(features_dir + "Resnet50/df_ResNet50_feat8192_pca"+str(PCA_components) +".csv")
    data = pd.merge(features,labels, on="name", how="inner")
    return data

In [26]:
def get_normalized_labels_features(network_type='VGG'):
    if network_type == 'ResnNet':
        return pd.read_csv(features_dir + "Resnet50/df_ResNet50_feat8192_pca"+\
            str(PCA_components) +"_labels_features.csv")
    else:
        return pd.read_csv(features_dir+"VGG16/df_VGG16_feat2048_pca"+\
                           str(PCA_components) + "_labels_features.csv")

In [27]:
def predict_label_i(label="label_hType_mix"):
    
    data2 = data.copy()
    data2 = data2[data2[label] != 1 ]
    
    target = data2[["imageName", label]]
    features = data2[[c for c in data.columns if "PCA" in c]]
    
    X = features.values
    y = target[label].values
    
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_sample(X, y)
    
    high = target[target[label] == 2]
    low = target[target[label] == 0]
    
    clf = LogisticRegression(random_state=0,solver='lbfgs').fit(X_resampled, y_resampled)
    print(clf.score(X, y), "Low", len(low), "High", len(high), "NOW", len(X_resampled)/2)

In [28]:
def predict_label_i_KFold(label="label_hType_mix"):
    
    kf = StratifiedKFold(n_splits=5)
    
    
    data2 = data.copy()
    data2 = data2[data2[label] != 1 ]
    
    target = data2[["imageName", label]]
    features = data2[[c for c in data.columns if "PCA" in c]]
    
    X = features.values
    y = target[label].values
    
    rus = RandomUnderSampler(random_state=777)
    X_resampled, y_resampled = rus.fit_sample(X, y)
    
    res = []
    
    kf.get_n_splits(X_resampled, y_resampled)
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
        clf = LogisticRegression(random_state=0,solver='lbfgs').fit(X_train, y_train)
#         print(clf.score(X_test, y_test), "Train", len(X_train), "Test", len(X_test), "ALL", len(X_resampled)/2)
        res.append(clf.score(X_test, y_test))
        
    return np.mean(res), np.std(res)

## Read in Data. Chose standardized or not.

In [29]:
if standardize_features:
    data = get_normalized_labels_features()
else:
    data = get_labels_features()

In [30]:
data.head()

Unnamed: 0,index,imageName,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,...,label_pop_rat_num,label_emp_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
0,0,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.513386,-0.352732,0.716721,0.75541,0.647604,-0.517665,0.458571,1.024984,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
1,1,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.773019,-0.161784,-0.752245,-0.568582,-0.140501,0.249631,-0.200838,-0.406981,...,1.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0
2,2,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.305169,0.001131,0.561944,2.024687,0.274702,0.433355,-0.114322,-0.317631,...,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0
3,3,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.011016,-0.778505,1.975514,-1.7861,-0.994235,1.492313,-0.930522,1.202427,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
4,4,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,0.414278,-0.994471,1.94381,-0.590098,-0.26674,0.700525,1.007925,-0.608827,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0


## Predict on Train

In [31]:
predict_label_i()

0.7713004484304933 Low 71 High 152 NOW 71.0


In [32]:
for col in label_columns:
    label = "label_" + col
    print (label)
    predict_label_i(label)

label_hType_mix
0.7713004484304933 Low 71 High 152 NOW 71.0
label_num_intersect
0.8821138211382114 Low 181 High 65 NOW 65.0
label_bld_avg_age
0.7835497835497836 Low 160 High 71 NOW 71.0
label_emp_rat_num
0.8268398268398268 Low 161 High 70 NOW 70.0
label_LUM5_single
0.8584070796460177 Low 61 High 165 NOW 61.0
label_RNR_nres
0.6923076923076923 Low 67 High 141 NOW 67.0
label_mdist_smallparks
0.6621004566210046 Low 132 High 87 NOW 87.0
label_nig_rat_daily
0.6607142857142857 Low 110 High 114 NOW 110.0
label_nig_rat_daily3
0.8396624472573839 Low 167 High 70 NOW 70.0
label_mdist_nres_daily
0.8376068376068376 Low 174 High 60 NOW 60.0
label_num_community_places
0.8487394957983193 Low 177 High 61 NOW 61.0
label_num_community_places_poi
0.6985645933014354 Low 142 High 67 NOW 67.0
label_avg_block_area
0.8571428571428571 Low 74 High 171 NOW 74.0
label_sphi
0.6842105263157895 Low 137 High 91 NOW 91.0
label_enterprises_empl_size
0.6707818930041153 Low 85 High 158 NOW 85.0
label_pop_rat_num
0.78761061

KeyError: 'label_activity_density'

## Predict K-Fold

In [33]:
predict_label_i_KFold(label="label_hType_mix")

(0.6470443349753695, 0.07796490579398885)

In [34]:
kfold_SCORES = {}
for col in label_columns:
    label = "label_" + col
    kfold_SCORES[label] = predict_label_i_KFold(label)

KeyError: 'label_activity_density'

In [35]:
kfold_SCORES

{'label_hType_mix': (0.6470443349753695, 0.07796490579398885),
 'label_num_intersect': (0.7923076923076923, 0.0521717691009636),
 'label_bld_avg_age': (0.6697044334975369, 0.07890004938040188),
 'label_emp_rat_num': (0.75, 0.06388765649999396),
 'label_LUM5_single': (0.8100000000000002, 0.07908504564356302),
 'label_RNR_nres': (0.5515669515669516, 0.10526053484848588),
 'label_mdist_smallparks': (0.5403361344537816, 0.04655189143465966),
 'label_nig_rat_daily': (0.5272727272727272, 0.03340213285613424),
 'label_nig_rat_daily3': (0.7428571428571429, 0.06546536707079768),
 'label_mdist_nres_daily': (0.725, 0.042491829279939865),
 'label_num_community_places': (0.8273333333333334, 0.0965781203655017),
 'label_num_community_places_poi': (0.5971509971509972, 0.04158373171628246),
 'label_avg_block_area': (0.7563218390804598, 0.07353807041415905),
 'label_sphi': (0.5493993993993994, 0.037309819796990655),
 'label_enterprises_empl_size': (0.5529411764705883, 0.07299808027053446),
 'label_pop_

In [36]:
STANDARDIZED_res = kfold_SCORES

In [37]:
STANDARDIZED_res

{'label_hType_mix': (0.6470443349753695, 0.07796490579398885),
 'label_num_intersect': (0.7923076923076923, 0.0521717691009636),
 'label_bld_avg_age': (0.6697044334975369, 0.07890004938040188),
 'label_emp_rat_num': (0.75, 0.06388765649999396),
 'label_LUM5_single': (0.8100000000000002, 0.07908504564356302),
 'label_RNR_nres': (0.5515669515669516, 0.10526053484848588),
 'label_mdist_smallparks': (0.5403361344537816, 0.04655189143465966),
 'label_nig_rat_daily': (0.5272727272727272, 0.03340213285613424),
 'label_nig_rat_daily3': (0.7428571428571429, 0.06546536707079768),
 'label_mdist_nres_daily': (0.725, 0.042491829279939865),
 'label_num_community_places': (0.8273333333333334, 0.0965781203655017),
 'label_num_community_places_poi': (0.5971509971509972, 0.04158373171628246),
 'label_avg_block_area': (0.7563218390804598, 0.07353807041415905),
 'label_sphi': (0.5493993993993994, 0.037309819796990655),
 'label_enterprises_empl_size': (0.5529411764705883, 0.07299808027053446),
 'label_pop_