In [5]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [27]:
# Root of the preprocessed data, relative to this notebook. The trailing
# slash matters: the sub-directories below are built by plain string
# concatenation, not by a path-joining helper.
data_dir = "../../preprocessed/"
features_dir = f"{data_dir}features/features_all/"
labels_dir = f"{data_dir}labels/"

In [28]:
# Target label names, without the "label_" prefix that the corresponding
# DataFrame columns carry (the prefix is added where the labels are looped over).
label_columns = [
    "hType_mix",
    "num_intersect",
    "bld_avg_age",
    "emp_rat_num",
    "LUM5_single",
    "RNR_nres",
    "mdist_smallparks",
    "nig_rat_daily",
    "nig_rat_daily3",
    "mdist_nres_daily",
    "num_community_places",
    "num_community_places_poi",
    "avg_block_area",
    "sphi",
    "enterprises_empl_size",
    "pop_rat_num",
    "emp_rat_pop",
    "bld_rat_area",
    "den_nres_daily",
    "mdist_parks",
    "den_nres_non-daily",
    "mdist_railways",
    "mdist_highways",
    "mdist_water",
    "activity_density",
]

In [35]:
# Number of PCA feature columns to use as regressors; they are named
# "PCA0" ... "PCA<n-1>".
PCA_components = 32
features_columns = [f"PCA{i}" for i in range(PCA_components)]

In [44]:
def read_labels_features(city_name = None):
    """Load the combined PCA-feature / label table, optionally for one city.

    Reads ``Italy_6_cities_resnet_pca32_labels_features.csv`` from the
    module-level ``features_dir``.

    Parameters
    ----------
    city_name : str or None
        When given, only rows whose ``city`` column equals this value are
        kept. ``None`` (the default) keeps every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``city_image`` column
        (``"<city>_<imageName>"``) and without the ``imageName``, ``city``
        and ``index`` columns. The row index of the (filtered) CSV is kept.

    Raises
    ------
    KeyError
        If ``city``, ``imageName`` or ``index`` is missing from the CSV.
    """
    df = pd.read_csv(features_dir + "Italy_6_cities_resnet_pca32_labels_features.csv")

    if city_name is not None:
        df = df[df["city"] == city_name]

    # Vectorised concatenation instead of a row-wise apply: faster, and it
    # still yields a Series when the filter above matched no rows.
    city_image = df["city"] + "_" + df["imageName"]

    # assign/drop build a new frame, so nothing is written onto the filtered
    # slice (which would trigger SettingWithCopyWarning).
    return (
        df
        .assign(city_image=city_image)
        .drop(columns=["imageName", "city", "index"])
    )

In [47]:
# Load only the rows whose city is 'milano'. NOTE: predict_label_i() reads
# this module-level `data`, so this cell must run before any regression cell.
data = read_labels_features(city_name='milano')

In [48]:
# Preview the first rows: PCA feature columns, label_* columns and city_image.
data.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,label_emp_rat_pop,label_bld_rat_area,label_den_nres_daily,label_mdist_parks,label_den_nres_non-daily,label_mdist_railways,label_mdist_highways,label_mdist_water,label_activity_density,city_image
0,-1.119373,0.435411,-1.201175,0.483674,-0.031733,0.311695,-1.01403,-0.116001,-0.089904,-1.239161,...,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
1,-0.360178,0.131368,0.892438,-1.288809,2.380163,-1.839806,-0.538357,-0.932602,0.891328,1.117245,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
2,-0.446654,0.852468,0.423353,-0.102622,0.158605,1.469799,-0.871201,1.378634,0.422033,0.121623,...,0.0,2.0,2.0,0.0,2.0,1.0,1.0,2.0,2.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
3,-0.562409,-1.427899,2.248952,0.571678,0.026979,0.175677,1.053897,-0.336071,1.201596,-0.728358,...,1.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...
4,1.039437,-0.349472,-2.1205,0.571991,0.992104,0.871473,1.199311,-0.800286,0.344834,-0.628143,...,0.0,2.0,1.0,1.0,2.0,1.0,0.0,1.0,2.0,milano_S2B_MSIL2A_20181024T102059_N0209_R065_T...


In [49]:
def predict_label_i(label="label_hType_mix", df=None):
    """Fit an OLS regression of one label column on the PCA features.

    The regressors are the ``features_columns`` columns plus an intercept;
    the response is the ``label`` column. The statsmodels summary is printed,
    preceded by a ``LABEL <name>`` header line.

    Parameters
    ----------
    label : str
        Name of the response column, including its ``label_`` prefix.
    df : pandas.DataFrame or None
        Table holding the feature and label columns. ``None`` (the default)
        falls back to the module-level ``data``, which is what the function
        always used before this parameter existed.

    Returns
    -------
    The fitted statsmodels results object, so callers can read values such
    as ``rsquared`` directly instead of parsing the printed summary.

    Raises
    ------
    KeyError
        If ``label`` or any of ``features_columns`` is not a column of the
        table.
    """
    if df is None:
        df = data

    # add_constant supplies the intercept column (the `const` row of the
    # summary); sm.OLS does not add one by itself.
    X = sm.add_constant(df[features_columns].values)
    y = df[label].values

    results = sm.OLS(y, X).fit()

    print ("LABEL ", label)
    print (results.summary())

    return results
    

In [50]:
# Smoke-test the regression on the default label before looping over all of
# them. The trailing `;` limits the cell output to what the function prints.
predict_label_i();

LABEL  label_hType_mix
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.9059
Date:                Tue, 24 Mar 2020   Prob (F-statistic):              0.618
Time:                        11:47:13   Log-Likelihood:                -370.62
No. Observations:                 327   AIC:                             807.2
Df Residuals:                     294   BIC:                             932.3
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.2477      0.

In [51]:
# Fit and print one OLS summary per target label; the DataFrame columns
# carry a "label_" prefix that the names in label_columns do not.
for col in label_columns:
    label = f"label_{col}"
    print(label)
    predict_label_i(label)

label_hType_mix
LABEL  label_hType_mix
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.9059
Date:                Tue, 24 Mar 2020   Prob (F-statistic):              0.618
Time:                        11:47:15   Log-Likelihood:                -370.62
No. Observations:                 327   AIC:                             807.2
Df Residuals:                     294   BIC:                             932.3
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        

LABEL  label_sphi
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.8869
Date:                Tue, 24 Mar 2020   Prob (F-statistic):              0.647
Time:                        11:47:15   Log-Likelihood:                -385.35
No. Observations:                 327   AIC:                             836.7
Df Residuals:                     294   BIC:                             961.8
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8502      0.046  