# CRF

In [38]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

In [39]:
# City whose preprocessed features and labels are used.
CITY = "milano"

# Root of the preprocessed data and the per-city feature / label folders.
data_dir = "../../preprocessed/"
features_dir = "{}features/{}/".format(data_dir, CITY)
labels_dir = "{}labels/{}/".format(data_dir, CITY)

# Number of PCA components; part of the feature file names that get loaded.
PCA_components = 32

# True  -> load the combined "_labels_features" file;
# False -> merge the separate feature and label files.
standardize_features = True

In [40]:
# Label names without the "label_" prefix; the prefix is added when a column
# of the data frame is looked up.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
]

## Functions

In [41]:
def get_labels_features():
    """Load the label CSV and the feature CSV and join them on ``name``.

    The feature file is chosen by the module-level ``PCA_components`` value;
    both files are looked up under ``labels_dir`` / ``features_dir``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the feature rows and the label rows that share a ``name``.
    """
    feature_file = "Resnet50/df_ResNet50_feat8192_pca{}.csv".format(PCA_components)
    label_frame = pd.read_csv(labels_dir + "imagelet_labels_clean.csv")
    feature_frame = pd.read_csv(features_dir + feature_file)
    return feature_frame.merge(label_frame, on="name", how="inner")

In [42]:
def get_normalized_labels_features():
    """Read the combined "_labels_features" CSV for the configured PCA size.

    The file name is built from the module-level ``PCA_components`` value and
    looked up under ``features_dir``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as returned by ``pd.read_csv``.
    """
    file_name = "df_ResNet50_feat8192_pca{}_labels_features.csv".format(PCA_components)
    return pd.read_csv(features_dir + "Resnet50/" + file_name)

In [43]:
def predict_label_i(label="label_hType_mix"):
    """Fit a random forest for one label and print its accuracy on the training data.

    Reads the module-level ``data`` frame. Rows whose ``label`` value equals 1
    are dropped; the remaining rows are balanced by random under-sampling and
    a ``RandomForestClassifier`` is fit on the columns whose name contains
    "PCA".

    Parameters
    ----------
    label : str
        Name of the label column in ``data`` to predict.

    Returns
    -------
    None
        The result is printed: the accuracy on the resampled training data,
        the number of rows equal to 0 ("Low") and equal to 2 ("High") before
        resampling, and half the number of resampled rows ("NOW").
    """
    # Boolean indexing already returns a new frame, so no explicit copy is needed.
    subset = data[data[label] != 1]

    target = subset[["name", label]]
    pca_columns = [c for c in data.columns if "PCA" in c]

    X = subset[pca_columns].values
    y = target[label].values

    rus = RandomUnderSampler(random_state=0)
    # fit_resample is the supported imbalanced-learn API; the older fit_sample
    # alias was deprecated in 0.4 and is no longer available in current releases.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    high = target[target[label] == 2]
    low = target[target[label] == 0]

    clf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100).fit(X_resampled, y_resampled)

    # The score is computed on the same samples the model was fit on.
    print(clf.score(X_resampled, y_resampled), "Low", len(low), "High", len(high), "NOW", len(X_resampled)/2)

In [44]:
def predict_label_i_KFold(label="label_hType_mix"):
    """Estimate the accuracy for one label with 5-fold stratified cross-validation.

    Reads the module-level ``data`` frame. Rows whose ``label`` value equals 1
    are dropped, the remaining rows are balanced by random under-sampling, and
    a small ``RandomForestClassifier`` is fit and scored once per fold on the
    columns whose name contains "PCA".

    Parameters
    ----------
    label : str
        Name of the label column in ``data`` to predict.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold test accuracy.
    """
    kf = StratifiedKFold(n_splits=5)

    # Boolean indexing already returns a new frame, so no explicit copy is needed.
    subset = data[data[label] != 1]
    pca_columns = [c for c in data.columns if "PCA" in c]

    X = subset[pca_columns].values
    y = subset[label].values

    rus = RandomUnderSampler(random_state=1)
    # fit_resample is the supported imbalanced-learn API; the older fit_sample
    # alias was deprecated in 0.4 and is no longer available in current releases.
    X_resampled, y_resampled = rus.fit_resample(X, y)

    res = []
    for train_index, test_index in kf.split(X_resampled, y_resampled):
        X_train, X_test = X_resampled[train_index], X_resampled[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]

        clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10).fit(X_train, y_train)

        # Accuracy on the held-out fold.
        res.append(clf.score(X_test, y_test))

    return np.mean(res), np.std(res)

## Read in data. Choose standardized or not.

In [45]:
# Pick the loader according to the standardize_features switch.
data = (
    get_normalized_labels_features()
    if standardize_features
    else get_labels_features()
)

In [46]:
# Preview the first rows of the loaded frame to check its columns.
data.head()

Unnamed: 0,name,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,...,label_pop_rat_num,label_emp_rat_num,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water
0,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.408721,-0.104768,-1.554297,-0.299501,0.606426,-2.119831,-0.43689,-0.990207,0.602116,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
1,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.251129,-0.661239,1.873108,1.448112,1.914111,-0.911612,-0.070438,0.011526,-0.879663,...,1.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0
2,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,-0.143509,-0.660533,-0.082356,-0.704953,0.170844,-1.121664,0.67329,-0.97768,-0.57072,...,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0
3,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,0.368961,-0.71387,-0.240754,-0.922868,2.46482,0.937034,1.375748,-0.15931,-0.466197,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0
4,S2B_MSIL2A_20181024T102059_N0209_R065_T32TNR_2...,0.173763,0.557279,-1.037464,0.67135,0.969712,-0.313608,-0.972046,0.609494,1.399276,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0


## Predict on Train

In [47]:
# Train-set evaluation for the default label ("label_hType_mix").
predict_label_i()

1.0 Low 71 High 152 NOW 71.0


In [48]:
# Print the train-set score for every label column.
for col in label_columns:
    label = "label_{}".format(col)
    print(label)
    predict_label_i(label)

label_hType_mix
1.0 Low 71 High 152 NOW 71.0
label_num_intersect
1.0 Low 181 High 65 NOW 65.0
label_bld_avg_age
1.0 Low 160 High 71 NOW 71.0
label_emp_rat_num
1.0 Low 161 High 70 NOW 70.0
label_LUM5_single
1.0 Low 61 High 165 NOW 61.0
label_RNR_nres
1.0 Low 67 High 141 NOW 67.0
label_mdist_smallparks
1.0 Low 132 High 87 NOW 87.0
label_nig_rat_daily
1.0 Low 110 High 114 NOW 110.0
label_nig_rat_daily3
1.0 Low 167 High 70 NOW 70.0
label_mdist_nres_daily
1.0 Low 174 High 60 NOW 60.0
label_num_community_places
1.0 Low 177 High 61 NOW 61.0
label_num_community_places_poi
1.0 Low 142 High 67 NOW 67.0
label_avg_block_area
1.0 Low 74 High 171 NOW 74.0
label_sphi
1.0 Low 137 High 91 NOW 91.0
label_enterprises_empl_size
1.0 Low 85 High 158 NOW 85.0
label_pop_rat_num
1.0 Low 181 High 45 NOW 45.0
label_emp_rat_pop
1.0 Low 128 High 99 NOW 99.0
label_bld_rat_area
1.0 Low 177 High 66 NOW 66.0
label_den_nres_daily
1.0 Low 177 High 60 NOW 60.0
label_mdist_parks
1.0 Low 85 High 119 NOW 85.0
label_den_nres

## Predict K-Fold

In [49]:
# Cross-validated accuracy (mean, std) for a single label.
predict_label_i_KFold(label="label_hType_mix")

(0.5066666666666666, 0.0993562042793412)

In [50]:
# Collect the cross-validated (mean, std) accuracy for each label column.
kfold_SCORES = dict()
for col in label_columns:
    label = "label_{}".format(col)
    kfold_SCORES[label] = predict_label_i_KFold(label)

In [51]:
# Display the collected per-label (mean, std) scores.
kfold_SCORES

{'label_hType_mix': (0.5066666666666666, 0.0993562042793412),
 'label_num_intersect': (0.5538461538461539, 0.05217176910096362),
 'label_bld_avg_age': (0.5357142857142857, 0.063887656499994),
 'label_emp_rat_num': (0.6, 0.09689042833036098),
 'label_LUM5_single': (0.6224358974358974, 0.0732678567293823),
 'label_RNR_nres': (0.49340659340659343, 0.06944846289813171),
 'label_mdist_smallparks': (0.4771241830065359, 0.02097621769396238),
 'label_nig_rat_daily': (0.46818181818181814, 0.08331955809010619),
 'label_nig_rat_daily3': (0.5928571428571427, 0.06624013211068362),
 'label_mdist_nres_daily': (0.5583333333333333, 0.08975274678557504),
 'label_num_community_places': (0.575, 0.07168604389202192),
 'label_num_community_places_poi': (0.49120879120879124, 0.07390875813081015),
 'label_avg_block_area': (0.5866666666666667, 0.07774602526460399),
 'label_sphi': (0.5824561403508773, 0.0017543859649122862),
 'label_enterprises_empl_size': (0.5117647058823529, 0.08843115516689945),
 'label_pop_