In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
# Root folder of the preprocessed data, given relative to this notebook.
data_dir = "../../preprocessed/"
# Folder that holds the combined feature/label CSV files.
features_dir = "{}regression_features/features_all/".format(data_dir)
# labels_dir = data_dir + "regression_labels/"

In [3]:
# Base names of the regression targets; the loop further down prefixes each
# with "label_" to obtain the actual column name in the data frame.
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [4]:
# Number of PCA components used as regressors; also part of the CSV file name.
PCA_components = 64

# Set to "GEO" to add the centroid coordinates to the regressors;
# any other value uses the PCA components alone.
USE_GEO = "GEO"

# PCA0 .. PCA<n-1>, optionally followed by the two centroid columns.
features_columns = ["PCA{}".format(i) for i in range(PCA_components)]
if USE_GEO == "GEO":
    features_columns += ["centroid_x", "centroid_y"]

# Selects which feature file get_normalized_labels_features() reads.
network_type = "vgg16_4096"

In [5]:
def get_normalized_labels_features(network=None, n_components=None, base_dir=None):
    """Load the feature/label CSV for one network and key each row by city and image.

    Parameters
    ----------
    network : str, optional
        One of "vgg19", "resnet50" or "vgg16_4096". Defaults to the
        module-level ``network_type``.
    n_components : int, optional
        PCA component count embedded in the file name. Defaults to the
        module-level ``PCA_components``.
    base_dir : str, optional
        Directory prefix (with trailing separator) that is concatenated with
        the file name. Defaults to the module-level ``features_dir``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with an added ``city_image`` column
        (``city + "_" + imageName``) and with the ``imageName``, ``city`` and
        ``index`` columns removed.

    Raises
    ------
    ValueError
        If ``network`` is not one of the supported names. Previously this
        surfaced as an UnboundLocalError on ``df``.
    KeyError
        If the CSV lacks one of the ``city``, ``imageName`` or ``index`` columns.
    """
    # Fall back to the notebook-level configuration only for arguments that
    # were not passed explicitly, so existing zero-argument calls keep working.
    if network is None:
        network = network_type
    if n_components is None:
        n_components = PCA_components
    if base_dir is None:
        base_dir = features_dir

    # File-name templates, one per supported network.
    # NOTE(review): the "vgg16_4096" file name starts with "Italy_6_cities_resnet"
    # exactly as in the original code -- confirm it matches the file on disk.
    file_templates = {
        "vgg19": "Italy_6_cities_vgg19_pca{}_linear_fc_thirdlast_layer_labels_features.csv",
        "resnet50": "Italy_6_cities_resnet_pca{}_second_last_layer_labels_features.csv",
        "vgg16_4096": "Italy_6_cities_resnet_pca{}_vgg16_4096_labels_features.csv",
    }
    if network not in file_templates:
        raise ValueError(
            "Unknown network type {!r}; expected one of {}".format(
                network, sorted(file_templates)
            )
        )

    df = pd.read_csv(base_dir + file_templates[network].format(n_components))

    # Vectorised concatenation: same values as the former row-wise apply,
    # but faster and well-defined for an empty frame.
    df["city_image"] = df["city"] + "_" + df["imageName"]

    return df.drop(columns=["imageName", "city", "index"])

In [6]:
# Load the feature/label table once; predict_label_i() reads this module-level `data`.
data = get_normalized_labels_features()

In [7]:
# Preview the first rows to check the PCA*, label_* and city_image columns loaded as expected.
data.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density,city_image
0,0.439129,0.613818,0.105353,0.671947,0.584087,0.333288,0.232843,0.007876,0.301698,0.621144,...,0.680503,0.543631,0.715952,0.160769,0.36838,0.907464,0.314818,0.041176,0.659161,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
1,0.039299,0.296719,0.725656,0.397496,0.225069,0.061125,0.57585,0.442814,0.378938,0.44851,...,0.550964,0.039828,0.138941,0.410288,0.010717,0.068859,0.286644,0.264237,0.31758,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
2,0.537519,0.292588,0.265063,0.141354,0.428936,0.845432,0.354047,0.428587,0.796385,0.574373,...,0.519877,0.866162,0.794297,0.137891,0.807486,0.364515,0.184614,0.620406,0.882345,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
3,0.327497,0.338799,0.61574,0.096609,0.275332,0.704205,0.621657,0.604031,0.601702,0.531715,...,0.629492,0.1688,0.120473,0.425658,0.097738,0.278395,0.977022,0.808904,0.325979,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
4,0.929802,0.278382,0.297208,0.564842,0.747615,1.0,0.339163,0.240381,0.396706,0.210041,...,0.222045,0.677125,0.74433,0.417627,0.63934,0.579227,0.027418,0.387328,0.839758,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...


In [8]:
def predict_label_i(label="label_activity_density"):
    """Fit an OLS regression of one label column on the feature columns and print its summary.

    Reads the module-level ``data`` frame and ``features_columns`` list. An
    intercept column is added to the regressors before fitting.

    Parameters
    ----------
    label : str
        Name of the target column in ``data`` (e.g. "label_activity_density").

    Raises
    ------
    KeyError
        If ``label`` or any of ``features_columns`` is not a column of ``data``.
    """
    # Pass pandas objects rather than ``.values`` so the summary reports the
    # real column names (const, PCA0, ..., and the label as Dep. Variable)
    # instead of the anonymous x1..xN / y.
    X = sm.add_constant(data[features_columns])
    y = data[label]

    results = sm.OLS(y, X).fit()

    print ("LABEL ", label)
    print (results.summary())
    

In [9]:
# Smoke-test the regression on the default label ("label_activity_density").
predict_label_i()

LABEL  label_activity_density
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.409
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     10.51
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           3.96e-76
Time:                        19:55:20   Log-Likelihood:                 233.71
No. Observations:                1069   AIC:                            -333.4
Df Residuals:                    1002   BIC:                           -0.1376
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4299 

In [10]:
# Run the OLS fit for every target; column names carry a "label_" prefix.
for base_name in label_columns:
    label = "label_{}".format(base_name)
    print (label)
    predict_label_i(label)

label_hType_mix
LABEL  label_hType_mix
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.244
Method:                 Least Squares   F-statistic:                     6.210
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           5.01e-41
Time:                        19:55:24   Log-Likelihood:                 188.03
No. Observations:                1069   AIC:                            -242.1
Df Residuals:                    1002   BIC:                             91.22
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        

LABEL  label_mdist_smallparks
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.306
Model:                            OLS   Adj. R-squared:                  0.261
Method:                 Least Squares   F-statistic:                     6.708
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           2.74e-45
Time:                        19:55:24   Log-Likelihood:                 228.86
No. Observations:                1069   AIC:                            -323.7
Df Residuals:                    1002   BIC:                             9.578
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7410 

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.438
Model:                            OLS   Adj. R-squared:                  0.401
Method:                 Least Squares   F-statistic:                     11.84
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           3.32e-86
Time:                        19:55:24   Log-Likelihood:                 253.40
No. Observations:                1069   AIC:                            -372.8
Df Residuals:                    1002   BIC:                            -39.51
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7439      0.202      3.674      0.0

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.472
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     13.58
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           9.94e-99
Time:                        19:55:25   Log-Likelihood:                 194.45
No. Observations:                1069   AIC:                            -254.9
Df Residuals:                    1002   BIC:                             78.39
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0640      0.214      0.299      0.7

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.409
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     10.51
Date:                Tue, 31 Mar 2020   Prob (F-statistic):           3.96e-76
Time:                        19:55:25   Log-Likelihood:                 233.71
No. Observations:                1069   AIC:                            -333.4
Df Residuals:                    1002   BIC:                           -0.1376
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4299      0.206      2.084      0.0