In [65]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [66]:
# City whose preprocessed features and labels are analysed in this notebook.
CITY = "milano"

# Root of the preprocessed data, relative to the notebook location.
data_dir = "../../preprocessed/"

# Per-city directories; each ends with "/" so file names can be appended directly.
features_dir = "%sfeatures/%s/" % (data_dir, CITY)
labels_dir = "%slabels/%s/" % (data_dir, CITY)

In [67]:
# Label names without their "label_" prefix; the prefix is prepended when the
# corresponding column is looked up in the merged data table.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
]

In [68]:
def create_labels_features():
    """Load the label and feature tables and join them on ``name``.

    Reads ``imagelet_labels_clean.csv`` from ``labels_dir`` and
    ``Resnet50/df_ResNet50_feat8192_pca32.csv`` from ``features_dir``.

    Returns:
        pandas.DataFrame: inner join of the feature table (left) with the
        label table (right) on their shared ``name`` column. The feature
        values are returned as stored in the CSV (no scaling is applied).
    """
    label_table = pd.read_csv(labels_dir + "imagelet_labels_clean.csv")
    feature_table = pd.read_csv(
        features_dir + "Resnet50/df_ResNet50_feat8192_pca32.csv"
    )

    # Inner join: a row is kept only if its name occurs in both tables.
    return feature_table.merge(label_table, on="name", how="inner")

In [69]:
def create_normalized_labels_features():
    """Load labels and features, standardise the features, and join them.

    Reads ``imagelet_labels_clean.csv`` from ``labels_dir`` and
    ``Resnet50/df_ResNet50_feat8192_pca32.csv`` from ``features_dir``.
    The feature columns are standardised with ``StandardScaler`` (fitted on
    every row of the feature file, before the join) and merged with the
    labels on ``name``.

    Only columns whose name contains ``"PCA"`` are treated as features, and
    they keep their original names. Scaling every non-``name`` column and
    renaming them positionally would also pick up any auxiliary column in
    the CSV (the features file appears to carry a saved row index,
    ``Unnamed: 0``), turning it into ``PCA0`` and shifting every real
    component by one.

    If the file has no ``"PCA"``-named columns, every column except ``name``
    is standardised and named ``PCA0..PCAk`` by position.

    Returns:
        pandas.DataFrame: ``name``, the standardised feature columns, then
        the label columns, for names present in both files (inner join).
    """
    labels = pd.read_csv(labels_dir + "imagelet_labels_clean.csv")
    features = pd.read_csv(features_dir + "Resnet50/df_ResNet50_feat8192_pca32.csv")

    feature_columns = [c for c in features.columns if "PCA" in c]
    if feature_columns:
        output_names = feature_columns
    else:
        # No PCA-named columns: treat everything but the key as a feature
        # and name the outputs by position.
        feature_columns = [c for c in features.columns if c != "name"]
        output_names = ["PCA%i" % i for i in range(len(feature_columns))]

    scaled = StandardScaler().fit_transform(features[feature_columns])
    # Reuse the original index so the column-wise concat aligns row by row.
    scaled_df = pd.DataFrame(scaled, columns=output_names, index=features.index)

    standardized_features = pd.concat([features["name"], scaled_df], axis=1)
    return pd.merge(standardized_features, labels, on="name", how="inner")

In [70]:
# Merged (unscaled) feature/label table; the prediction functions below read
# this module-level `data` frame directly.
data = create_labels_features()

In [71]:
# Preview the first rows of the merged table.
data.head()

Unnamed: 0,name,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,...,label_pop_rat_num,label_emp_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
0,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-4.07112,-0.424936,-5.247612,-0.838083,1.208517,-4.789422,-0.859758,-1.78785,1.068934,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
1,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-3.412396,-2.291054,5.429623,3.404654,4.283522,-2.127759,-0.095113,0.226035,-1.662767,...,1.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0
2,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-2.962552,-2.288686,-0.662143,-1.82241,0.184249,-2.590495,1.456764,-1.762665,-1.093223,...,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0
3,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.820459,-2.467551,-1.155593,-2.351451,5.578507,1.944743,2.922524,-0.117414,-0.900532,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
4,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-1.636376,1.795233,-3.637547,1.518884,2.062779,-0.810377,-1.976423,1.42819,2.538521,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0


In [72]:
def predict_label_i(label="label_hType_mix"):
    """Fit an SVC on low-vs-high samples of one label and print its accuracy.

    Rows of the module-level ``data`` frame whose ``label`` value is 1 are
    dropped, the remaining samples are passed through
    ``RandomUnderSampler(random_state=0)``, and a degree-2 polynomial
    ``SVC`` is fitted on the resampled set. Features are all columns whose
    name contains ``"PCA"``.

    The printed score is computed on the same samples the classifier was
    fitted on, so it is a training accuracy and not an estimate of how well
    the label can be predicted on unseen data.

    Args:
        label: Name of the label column in ``data`` to predict.

    Returns:
        None. Prints the score, the number of rows with label 0 ("Low"),
        the number of rows with label 2 ("High"), and half the size of the
        resampled set ("NOW").
    """
    # Boolean filtering already yields a new frame, so no explicit copy is needed.
    subset = data[data[label] != 1]

    X_orig = subset[[c for c in subset.columns if "PCA" in c]].values
    y_orig = subset[label].values

    high = subset[subset[label] == 2]
    low = subset[subset[label] == 0]

    rus = RandomUnderSampler(random_state=0)
    # `fit_sample` is the pre-rename name of `fit_resample`; newer
    # imbalanced-learn releases only provide the latter. Prefer the new name
    # and fall back to the old one for old installations.
    resample = getattr(rus, "fit_resample", None) or rus.fit_sample
    X, y = resample(X_orig, y_orig)

    clf = SVC(gamma='auto', kernel='poly', degree=2)
    clf.fit(X, y)

    print(clf.score(X, y), "Low", len(low), "High", len(high), "NOW", len(X)/2)

In [73]:
# Quick check on the default label ("label_hType_mix").
predict_label_i()

1.0 Low 71 High 152 NOW 71.0


In [74]:
# Run predict_label_i for every label; each iteration prints the column name
# followed by the line printed by predict_label_i.
for col in label_columns:
    label = "label_" + col
    print (label)
    predict_label_i(label)

label_hType_mix
1.0 Low 71 High 152 NOW 71.0
label_num_intersect
1.0 Low 181 High 65 NOW 65.0
label_bld_avg_age
1.0 Low 160 High 71 NOW 71.0
label_emp_rat_num
1.0 Low 161 High 70 NOW 70.0
label_LUM5_single
1.0 Low 61 High 165 NOW 61.0
label_RNR_nres
1.0 Low 67 High 141 NOW 67.0
label_mdist_smallparks
1.0 Low 132 High 87 NOW 87.0
label_nig_rat_daily
0.9954545454545455 Low 110 High 114 NOW 110.0
label_nig_rat_daily3
1.0 Low 167 High 70 NOW 70.0
label_mdist_nres_daily
1.0 Low 174 High 60 NOW 60.0
label_num_community_places
1.0 Low 177 High 61 NOW 61.0
label_num_community_places_poi
1.0 Low 142 High 67 NOW 67.0
label_avg_block_area
1.0 Low 74 High 171 NOW 74.0
label_sphi
1.0 Low 137 High 91 NOW 91.0
label_enterprises_empl_size
0.9882352941176471 Low 85 High 158 NOW 85.0
label_pop_rat_num
1.0 Low 181 High 45 NOW 45.0
label_emp_rat_pop
0.9848484848484849 Low 128 High 99 NOW 99.0
label_bld_rat_area
1.0 Low 177 High 66 NOW 66.0
label_den_nres_daily
1.0 Low 177 High 60 NOW 60.0
label_mdist_park

In [75]:
def predict_label_i_KFold(label="label_hType_mix", n_splits=5):
    """Cross-validate a low-vs-high SVC for one label.

    Rows of the module-level ``data`` frame whose ``label`` value is 1 are
    dropped, the remaining samples are passed through
    ``RandomUnderSampler(random_state=0)``, and the resampled set is split
    with ``StratifiedKFold``. For each fold a degree-2 polynomial ``SVC`` is
    fitted on the training part and scored on the held-out part. Features
    are all columns whose name contains ``"PCA"``.

    Under-sampling is done once, before the split, so every fold is drawn
    from the same resampled set.

    Args:
        label: Name of the label column in ``data`` to predict.
        n_splits: Number of stratified folds (default 5).

    Returns:
        tuple: ``(mean, std)`` of the per-fold test accuracies.
    """
    subset = data[data[label] != 1]

    X = subset[[c for c in subset.columns if "PCA" in c]].values
    y = subset[label].values

    rus = RandomUnderSampler(random_state=0)
    # `fit_sample` is the pre-rename name of `fit_resample`; newer
    # imbalanced-learn releases only provide the latter. Prefer the new name
    # and fall back to the old one for old installations.
    resample = getattr(rus, "fit_resample", None) or rus.fit_sample
    X_resampled, y_resampled = resample(X, y)

    kf = StratifiedKFold(n_splits=n_splits)
    res = []
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        # A fresh classifier per fold so no state carries over between folds.
        clf = SVC(gamma='auto', kernel='poly', degree=2)
        clf.fit(X_train, y_train)
        res.append(clf.score(X_test, y_test))

    return np.mean(res), np.std(res)

In [76]:
# Cross-validated (mean, std) accuracy for a single label.
predict_label_i_KFold(label="label_hType_mix")

(0.5700000000000001, 0.058970764115857985)

In [77]:
# Maps each "label_<name>" column to the (mean, std) tuple returned by
# predict_label_i_KFold for that label.
kfold_SCORES = {}
for col in label_columns:
    label = "label_" + col
    kfold_SCORES[label] = predict_label_i_KFold(label)

In [78]:
# Display the per-label cross-validation results.
kfold_SCORES

{'label_hType_mix': (0.5700000000000001, 0.058970764115857985),
 'label_num_intersect': (0.5923076923076923, 0.1076923076923077),
 'label_bld_avg_age': (0.5495238095238095, 0.05815375663214579),
 'label_emp_rat_num': (0.6214285714285714, 0.07693092581620722),
 'label_LUM5_single': (0.5711538461538461, 0.13319457770216123),
 'label_RNR_nres': (0.5164835164835164, 0.06816287844440218),
 'label_mdist_smallparks': (0.44803921568627453, 0.050694691319386455),
 'label_nig_rat_daily': (0.5909090909090909, 0.04767312946227959),
 'label_nig_rat_daily3': (0.5928571428571429, 0.06624013211068362),
 'label_mdist_nres_daily': (0.5166666666666666, 0.02041241452319313),
 'label_num_community_places': (0.6397435897435897, 0.09396220231300906),
 'label_num_community_places_poi': (0.44010989010989016, 0.07602299145682627),
 'label_avg_block_area': (0.6276190476190475, 0.04784707309363988),
 'label_sphi': (0.5824561403508772, 0.03518019004139187),
 'label_enterprises_empl_size': (0.488235294117647, 0.082