In [116]:
import pandas as pd
import statsmodels.api as sm

In [117]:
# Root folder of the preprocessed data (relative path).
data_dir = "../../preprocessed/"
# Folder holding the "*_labels_features.csv" tables read by the loader below.
features_dir = f"{data_dir}district_features/regression/"
# labels_dir = data_dir + "regression_labels/"

In [136]:
# Label names without their "label_" prefix; the prefix is added where the
# names are turned into column names of the loaded table.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [153]:
# --- Experiment configuration ---------------------------------------------
# Number of PCA components: used in the input CSV file name and to build the
# "PCA<i>" feature column names below.
PCA_components = 16

# Both values are interpolated into the input file name of the "vgg16_4096"
# network; AVERAGING_METHOD also decides, in a later cell, whether the
# feature list is rebuilt from the loaded table's "f_" columns.
LABELING_METHOD = "threshold"
AVERAGING_METHOD = "kaist"

# Set to "GEO" to append the district centroid coordinates to the features;
# any other value keeps the PCA columns only.
USE_GEO = "GEO"

# Selects which labels/features CSV get_normalized_labels_features() reads.
network_type = "vgg16_4096"

# Build the PCA column names once, then optionally extend them.
features_columns = ["PCA" + str(i) for i in range(PCA_components)]
if USE_GEO == "GEO":
    features_columns += ["centroid_x", "centroid_y"]

In [154]:
def get_normalized_labels_features():
    """Load the labels/features table for the configured network.

    The CSV file is chosen from the module-level settings ``network_type``,
    ``features_dir``, ``PCA_components``, ``LABELING_METHOD`` and
    ``AVERAGING_METHOD``.

    Returns:
        pandas.DataFrame: the CSV contents with ``label_district`` cast to
        ``str``, an added ``city_district`` column of the form
        ``"<city>_<label_district>"``, and the ``index`` column removed when
        the file has one.

    Raises:
        ValueError: if ``network_type`` is not a supported value. Without
            this check the function failed later with an unrelated
            ``UnboundLocalError``.
    """
    pca = str(PCA_components)
    if network_type == "vgg19":
        filename = ("Italy_6_cities_vgg19_pca" + pca
                    + "_linear_fc_thirdlast_layer_labels_features.csv")
    elif network_type == "resnet50":
        filename = ("Italy_6_cities_resnet_pca" + pca
                    + "_second_last_layer_labels_features.csv")
    elif network_type == "vgg16_4096":
        # NOTE(review): the file name contains "resnet" although this is the
        # VGG16 branch; kept unchanged -- confirm against the files on disk.
        filename = ("Italy_6_cities_resnet_pca" + pca + "_vgg16_4096_"
                    + LABELING_METHOD + "_" + AVERAGING_METHOD
                    + "_labels_features.csv")
    else:
        raise ValueError(
            "Unsupported network_type %r; expected 'vgg19', 'resnet50' or "
            "'vgg16_4096'" % (network_type,)
        )

    df = pd.read_csv(features_dir + filename)

    # District ids are cast to text so they can be concatenated with the city.
    df["label_district"] = df["label_district"].astype(str)
    df["city_district"] = df["city"] + "_" + df["label_district"]

    # Drop a leftover "index" column if present; a missing column is not an
    # error, and no other exception is swallowed.
    return df.drop(columns=["index"], errors="ignore")

In [155]:
# Load the labels/features table once; the regression helpers further down
# read it through this module-level name.
data = get_normalized_labels_features()

In [156]:
# With "kaist" averaging the regressors are taken from the loaded table:
# every column whose name contains "f_" (substring match, not a prefix
# match). This replaces the features_columns list built in the config cell.
if AVERAGING_METHOD == "kaist":
    features_columns = [
        column_name
        for column_name in data.columns
        if "f_" in column_name
    ]

In [157]:
# Preview the first rows to check the loaded feature and label columns.
data.head()

Unnamed: 0,label_district,f_std_PCA0,f_std_PCA1,f_std_PCA2,f_std_PCA3,f_std_PCA4,f_std_PCA5,f_std_PCA6,f_std_PCA7,f_std_PCA8,...,label_emp_rat_pop,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density,city,city_district
0,1,0.064958,0.073484,0.076522,0.129595,0.115562,0.0887,0.082998,0.125374,0.054396,...,0.0,0.824691,0.388441,1.0,0.303275,0.062985,0.35261,0.985182,milano,milano_1
1,2,0.070983,0.174323,0.130305,0.091274,0.140456,0.119589,0.194677,0.087679,0.116776,...,0.197653,0.900559,0.723512,0.878995,0.356367,0.00115,0.203797,1.0,milano,milano_2
2,3,0.10808,0.149888,0.061392,0.112375,0.099343,0.100533,0.207426,0.075992,0.038479,...,0.326944,0.789503,0.322766,0.616451,0.537791,0.0,0.265443,0.86486,milano,milano_3
3,4,0.101268,0.184833,0.052885,0.050209,0.071602,0.035432,0.064828,0.221197,0.141103,...,0.424251,0.709455,0.154357,0.405842,0.626548,0.067999,0.251033,0.660278,milano,milano_4
4,5,0.144261,0.089766,0.062477,0.168698,0.199859,0.272118,0.176438,0.116521,0.086539,...,0.222045,0.74433,0.417627,0.63934,0.579227,0.027418,0.387328,0.839758,milano,milano_5


In [158]:
# Number of regressors passed to OLS, not counting the constant term that
# predict_label_i adds.
len(features_columns)

33

In [159]:
def predict_label_i(city='all', label="label_activity_density"):
    """Fit an OLS regression of one label column on the feature columns.

    Reads the module-level ``data`` frame and ``features_columns`` list.
    Prints the label name and the statsmodels summary of the fit.

    Args:
        city: value of the ``city`` column to restrict the fit to, or
            ``'all'`` to use every row of ``data``.
        label: name of the ``data`` column used as the dependent variable.

    Returns:
        float: adjusted R-squared of the fit. This can be NaN when the fit
        has no residual degrees of freedom (as many regressors as
        observations, or more); a ``RuntimeWarning`` is emitted in that case.
    """
    import warnings

    # No copy is needed: the rows are only read, never modified.
    rows = data if city == 'all' else data[data["city"] == city]

    # add_constant appends the intercept column to the feature matrix.
    X = sm.add_constant(rows[features_columns].values)
    y = rows[label].values

    # missing='drop' makes statsmodels discard rows that contain NaN.
    results = sm.OLS(y, X, missing='drop').fit()

    if results.df_resid <= 0:
        # Without residual degrees of freedom the adjusted R-squared is not
        # meaningful; say so instead of silently returning NaN.
        warnings.warn(
            "predict_label_i(city=%r, label=%r): %d observations for %d "
            "regressors leave no residual degrees of freedom; adjusted "
            "R-squared is undefined."
            % (city, label, int(results.nobs), X.shape[1]),
            RuntimeWarning,
        )

    print ("LABEL ", label)
    print (results.summary())

    return results.rsquared_adj

In [160]:
# Try the regression on a single city with the default label
# ("label_activity_density").
predict_label_i(city='milano')

LABEL  label_activity_density
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.747
Model:                            OLS   Adj. R-squared:                  0.580
Method:                 Least Squares   F-statistic:                     4.466
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           1.08e-06
Time:                        12:33:32   Log-Likelihood:                 68.726
No. Observations:                  84   AIC:                            -69.45
Df Residuals:                      50   BIC:                             13.20
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.6396 

0.5795036140530696

In [161]:
# Row count for Bologna. NOTE(review): the saved outputs show 23 rows here
# against 33 feature columns, and a NaN adjusted R-squared for this city
# further down -- presumably too few observations for the number of
# regressors; confirm.
len(data[data["city"] == "bologna"])

23

In [162]:
# Fit the feature-based model once per city and once on the pooled rows
# ("all"), keeping the adjusted R-squared of each fit keyed by city name.
cities = [
    "milano",
    "roma",
    "bologna",
    "firenze",
    "torino",
    "palermo",
    "all",
]
res_sat = {
    city: predict_label_i(label="label_activity_density", city=city)
    for city in cities
}

LABEL  label_activity_density
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.747
Model:                            OLS   Adj. R-squared:                  0.580
Method:                 Least Squares   F-statistic:                     4.466
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           1.08e-06
Time:                        12:33:35   Log-Likelihood:                 68.726
No. Observations:                  84   AIC:                            -69.45
Df Residuals:                      50   BIC:                             13.20
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.6396 

In [163]:
# Adjusted R-squared per city for the feature-based model.
res_sat

{'milano': 0.5795036140530696,
 'roma': 0.5653668757872581,
 'bologna': nan,
 'firenze': nan,
 'torino': 0.6467864634907918,
 'palermo': 0.6984093827446913,
 'all': 0.4866218548454234}

In [148]:
def baseline_activity_pop_density(city="all", label="label_activity_density",
                                  baseline_feature="label_pop_rat_num"):
    """Fit a single-regressor OLS baseline for one label column.

    Reads the module-level ``data`` frame. Prints the label name and the
    statsmodels summary of the fit.

    Args:
        city: value of the ``city`` column to restrict the fit to, or
            ``"all"`` to use every row of ``data``.
        label: name of the ``data`` column used as the dependent variable.
        baseline_feature: name of the ``data`` column used as the only
            regressor besides the constant. The default keeps the column
            this function was originally hard-wired to.

    Returns:
        float: adjusted R-squared of the fit.
    """
    # No copy is needed: the rows are only read, never modified.
    rows = data if city == 'all' else data[data["city"] == city]

    # add_constant turns the 1-D regressor into a two-column design matrix.
    X = sm.add_constant(rows[baseline_feature].values)
    y = rows[label].values

    # Same missing-value policy as predict_label_i, so both models treat
    # rows containing NaN the same way.
    results = sm.OLS(y, X, missing='drop').fit()

    print ("LABEL ", label)
    print (results.summary())

    return results.rsquared_adj

In [149]:
# Baseline fit on all rows pooled (city defaults to "all").
baseline_activity_pop_density(label="label_activity_density")

LABEL  label_activity_density
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.384
Model:                            OLS   Adj. R-squared:                  0.382
Method:                 Least Squares   F-statistic:                     229.0
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           1.43e-40
Time:                        12:28:13   Log-Likelihood:                 91.751
No. Observations:                 370   AIC:                            -179.5
Df Residuals:                     368   BIC:                            -171.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1498 

0.38194846785674175

In [150]:
# Fit the single-regressor baseline once per city and once on the pooled
# rows ("all"), keeping the adjusted R-squared of each fit keyed by city name.
cities = [
    "milano",
    "roma",
    "bologna",
    "firenze",
    "torino",
    "palermo",
    "all",
]
res = {
    city: baseline_activity_pop_density(label="label_activity_density", city=city)
    for city in cities
}

LABEL  label_activity_density
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.327
Model:                            OLS   Adj. R-squared:                  0.318
Method:                 Least Squares   F-statistic:                     39.77
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           1.37e-08
Time:                        12:28:13   Log-Likelihood:                 27.660
No. Observations:                  84   AIC:                            -51.32
Df Residuals:                      82   BIC:                            -46.46
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2375 

In [151]:
# Adjusted R-squared per city for the single-regressor baseline.
res

{'milano': 0.31836817997278966,
 'roma': 0.3577524019400592,
 'bologna': 0.6194773064285527,
 'firenze': 0.1193522875706634,
 'torino': 0.4092227693431335,
 'palermo': 0.4770672249276323,
 'all': 0.38194846785674175}

In [152]:
# Run the Milano regression for every label in label_columns.
# NOTE(review): the saved output of this cell reports "Df Model: 17" while
# the earlier cells report 33, so it was apparently produced with a different
# features_columns; re-run after a kernel restart to refresh it.
for col in label_columns:
    label = f"label_{col}"
    print(label)
    predict_label_i(label=label, city="milano")

label_hType_mix
LABEL  label_hType_mix
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.483
Model:                            OLS   Adj. R-squared:                  0.350
Method:                 Least Squares   F-statistic:                     3.624
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           8.35e-05
Time:                        12:28:14   Log-Likelihood:                 21.955
No. Observations:                  84   AIC:                            -7.911
Df Residuals:                      66   BIC:                             35.84
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        

LABEL  label_den_nres_non-daily
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.716
Model:                            OLS   Adj. R-squared:                  0.643
Method:                 Least Squares   F-statistic:                     9.783
Date:                Sat, 04 Apr 2020   Prob (F-statistic):           3.51e-12
Time:                        12:28:14   Log-Likelihood:                 50.287
No. Observations:                  84   AIC:                            -64.57
Df Residuals:                      66   BIC:                            -20.82
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.673