In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# MAIN_GOAL: Predict green_roof_area

In [3]:
# Load the cleaned green-roof dataset; the path is relative to the current working directory.
df = pd.read_csv('../Data/cleaned_data/green_roofs2.csv')

# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,borough,xcoord,ycoord,area_type
0,971,14057,0.07,1900,59,90,1,BK,-73.93491,40.67389,public
1,696,4463,0.16,1900,13,21,0,MN,-73.99982,40.73481,commercial
2,293,13217,0.02,1900,93,7,0,MN,-74.00906,40.72480,commercial
3,759,4311,0.18,1900,99,21,0,MN,-74.00836,40.71595,commercial
4,7204,35891,0.20,1990,206,10,1,BX,-73.91227,40.81906,commercial
...,...,...,...,...,...,...,...,...,...,...,...
725,1525,6414,0.24,1800,104,6,0,MN,-74.00968,40.72554,residential
726,343,2941,0.12,1990,118,42,1,MN,-73.99311,40.72600,public
727,309,4350,0.07,1990,175,18,1,MN,-74.01044,40.71474,industrial
728,8139,20051,0.41,1960,23,12,0,MN,-73.93708,40.79703,commercial


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['green_roof_area', 'building_area', 'ratio_green_area',
       'construction_year', 'roof_height', 'ground_elev', 'digitized',
       'borough', 'xcoord', 'ycoord', 'area_type'],
      dtype='object')

In [5]:
# Reorder the columns for the correlation check: numerical features first,
# categorical-like features next, and the target variable in the last position.

df = df[[
    # numerical features
    'building_area', 'ratio_green_area', 'roof_height', 'ground_elev',
    'xcoord', 'ycoord',
    # categorical-like features
    'borough', 'construction_year', 'digitized', 'area_type',
    # target variable
    'green_roof_area',
]]

df

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,borough,construction_year,digitized,area_type,green_roof_area
0,14057,0.07,59,90,-73.93491,40.67389,BK,1900,1,public,971
1,4463,0.16,13,21,-73.99982,40.73481,MN,1900,0,commercial,696
2,13217,0.02,93,7,-74.00906,40.72480,MN,1900,0,commercial,293
3,4311,0.18,99,21,-74.00836,40.71595,MN,1900,0,commercial,759
4,35891,0.20,206,10,-73.91227,40.81906,BX,1990,1,commercial,7204
...,...,...,...,...,...,...,...,...,...,...,...
725,6414,0.24,104,6,-74.00968,40.72554,MN,1800,0,residential,1525
726,2941,0.12,118,42,-73.99311,40.72600,MN,1990,1,public,343
727,4350,0.07,175,18,-74.01044,40.71474,MN,1990,1,industrial,309
728,20051,0.41,23,12,-73.93708,40.79703,MN,1960,0,commercial,8139


In [6]:
# First look at the pairwise (Pearson) correlations between the numeric columns.
# 'borough' and 'area_type' still hold strings at this point; pandas >= 2.0 no
# longer drops such columns silently in corr() and raises instead, so restrict
# the frame to numeric dtypes explicitly (works on older pandas as well).

df.select_dtypes(include='number').corr()

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,construction_year,digitized,green_roof_area
building_area,1.0,-0.187432,0.322886,-0.091766,0.089432,-0.031258,0.229547,0.089102,0.564244
ratio_green_area,-0.187432,1.0,-0.267895,0.107822,0.130734,0.124983,-0.089072,0.153084,0.297438
roof_height,0.322886,-0.267895,1.0,-0.018603,-0.17909,0.015911,0.247344,-0.007023,0.091996
ground_elev,-0.091766,0.107822,-0.018603,1.0,0.186144,0.279177,-0.124424,0.016322,-0.014633
xcoord,0.089432,0.130734,-0.17909,0.186144,1.0,0.307375,0.115855,0.043414,0.051407
ycoord,-0.031258,0.124983,0.015911,0.279177,0.307375,1.0,0.122688,-0.020359,0.034407
construction_year,0.229547,-0.089072,0.247344,-0.124424,0.115855,0.122688,1.0,0.03907,0.178527
digitized,0.089102,0.153084,-0.007023,0.016322,0.043414,-0.020359,0.03907,1.0,0.221406
green_roof_area,0.564244,0.297438,0.091996,-0.014633,0.051407,0.034407,0.178527,0.221406,1.0


In [7]:
# Encode area_type and borough as integers to check whether they correlate with the target variable.

# Borough frequencies before encoding. This is not the cell's last expression,
# so its result is not rendered in the cell output.
df['borough'].value_counts()

def encode_borough(x):
    """Translate a borough abbreviation into its integer code.

    'MN' -> 1, 'BK' -> 2, 'BX' -> 3, 'QN' -> 4, 'SI' -> 5.
    Any other value is handed back unchanged.
    """
    borough_codes = {'MN': 1, 'BK': 2, 'BX': 3, 'QN': 4, 'SI': 5}
    # Compare with == in listing order (rather than a hash lookup) so that
    # values which are not plain strings are handled like an if/elif chain.
    for abbreviation, code in borough_codes.items():
        if x == abbreviation:
            return code
    return x

# Replace every borough abbreviation with its integer code (element-wise).
df['borough'] = df['borough'].map(encode_borough)

# Frequencies of the encoded values; shown as the cell output.
df['borough'].value_counts()


1    463
2    133
3     81
4     49
5      4
Name: borough, dtype: int64

In [8]:

# Frequency of each area_type label before it is encoded.
df['area_type'].value_counts()



residential    347
commercial     252
industrial     115
public          16
Name: area_type, dtype: int64

In [9]:
def encode_area(x):
    """Translate an area-type label into its integer code.

    'residential' -> 1, 'commercial' -> 2, 'industrial' -> 3, 'public' -> 4.
    Any other value is handed back unchanged.
    """
    area_codes = {'residential': 1, 'commercial': 2, 'industrial': 3, 'public': 4}
    # Compare with == in listing order (rather than a hash lookup) so that
    # values which are not plain strings are handled like an if/elif chain.
    for label, code in area_codes.items():
        if x == label:
            return code
    return x

# Replace every area_type label with its integer code (element-wise).
df['area_type'] = df['area_type'].map(encode_area)

# Frequencies of the encoded values; shown as the cell output.
df['area_type'].value_counts()

1    347
2    252
3    115
4     16
Name: area_type, dtype: int64

In [11]:
# Correlation matrix again, this time with the integer-coded 'borough' and 'area_type' included.
# NOTE(review): those codes are arbitrary labels, so their Pearson coefficients depend on the
# numbering chosen in the encoders — confirm this is the intended check.
df.corr()

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,borough,construction_year,digitized,area_type,green_roof_area
building_area,1.0,-0.187432,0.322886,-0.091766,0.089432,-0.031258,0.13496,0.229547,0.089102,-0.073419,0.564244
ratio_green_area,-0.187432,1.0,-0.267895,0.107822,0.130734,0.124983,0.154481,-0.089072,0.153084,-0.018858,0.297438
roof_height,0.322886,-0.267895,1.0,-0.018603,-0.17909,0.015911,-0.234035,0.247344,-0.007023,0.031163,0.091996
ground_elev,-0.091766,0.107822,-0.018603,1.0,0.186144,0.279177,0.085004,-0.124424,0.016322,0.023837,-0.014633
xcoord,0.089432,0.130734,-0.17909,0.186144,1.0,0.307375,0.663572,0.115855,0.043414,-0.009878,0.051407
ycoord,-0.031258,0.124983,0.015911,0.279177,0.307375,1.0,0.048892,0.122688,-0.020359,0.051321,0.034407
borough,0.13496,0.154481,-0.234035,0.085004,0.663572,0.048892,1.0,0.102585,0.141999,-0.04349,0.087736
construction_year,0.229547,-0.089072,0.247344,-0.124424,0.115855,0.122688,0.102585,1.0,0.03907,0.046502,0.178527
digitized,0.089102,0.153084,-0.007023,0.016322,0.043414,-0.020359,0.141999,0.03907,1.0,0.026957,0.221406
area_type,-0.073419,-0.018858,0.031163,0.023837,-0.009878,0.051321,-0.04349,0.046502,0.026957,1.0,0.005131


In [None]:
#now transform numericals and check the correlation again (target variable not scaled?)
#also try scaling numericals and check the correlation again (target variable not scaled?)
#also try transforming and scaling, then check the correlation
#also try one-hot encoding categoricals and check the correlation


#first linear model => with all the variables:
    #alternative1: transformed
    #alternative2: scaled
    #alternative3: transformed and scaled

#second linear model => with the 4 or 5 variables most correlated with the target
     #alternative1: transformed
     #alternative2: scaled
     #alternative3: transformed and scaled