In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
import math

In [2]:
# MAIN_GOAL: Predict green_roof_area with KNN

In [3]:
df = pd.read_csv('../Data/cleaned_data/green_roofs3.csv')

df

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,borough,xcoord,ycoord,area_type
0,971,14057,0.07,1900,59,90,1,BK,-73.93491,40.67389,public
1,696,4463,0.16,1900,13,21,0,MN,-73.99982,40.73481,commercial
2,293,13217,0.02,1900,93,7,0,MN,-74.00906,40.72480,commercial
3,759,4311,0.18,1900,99,21,0,MN,-74.00836,40.71595,commercial
4,7204,35891,0.20,1990,206,10,1,BX,-73.91227,40.81906,commercial
...,...,...,...,...,...,...,...,...,...,...,...
705,1525,6414,0.24,1800,104,6,0,MN,-74.00968,40.72554,residential
706,343,2941,0.12,1990,118,42,1,MN,-73.99311,40.72600,public
707,309,4350,0.07,1990,175,18,1,MN,-74.01044,40.71474,industrial
708,8139,20051,0.41,1960,23,12,0,MN,-73.93708,40.79703,commercial


In [4]:
df.describe()

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,xcoord,ycoord
count,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0
mean,2515.178873,21041.776056,0.173028,1942.802817,137.904225,40.660563,0.201408,-73.966258,40.745384
std,3665.914314,27257.680419,0.164256,43.075071,130.209084,31.682424,0.401335,0.045733,0.052617
min,10.0,522.0,0.0,1800.0,8.0,-4.0,0.0,-74.07732,40.57222
25%,422.5,4269.75,0.06,1900.0,51.0,15.0,0.0,-73.99605,40.717375
50%,1124.0,11731.0,0.12,1930.0,85.0,32.0,0.0,-73.978575,40.74133
75%,2840.75,26406.75,0.22,1990.0,183.0,61.0,0.0,-73.950807,40.773702
max,28669.0,178941.0,0.83,1990.0,755.0,157.0,1.0,-73.75132,40.89442


In [5]:
df.columns

Index(['green_roof_area', 'building_area', 'ratio_green_area',
       'construction_year', 'roof_height', 'ground_elev', 'digitized',
       'borough', 'xcoord', 'ycoord', 'area_type'],
      dtype='object')

In [6]:
# to check the correlation we may want to order the columns so the target variable is at the end, and
    # the numericals and categoricals are together displayed

df = df[['building_area', 'ratio_green_area','roof_height', 'ground_elev',\
    'xcoord', 'ycoord','borough','construction_year','digitized','area_type', 'green_roof_area' ]]

df

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,borough,construction_year,digitized,area_type,green_roof_area
0,14057,0.07,59,90,-73.93491,40.67389,BK,1900,1,public,971
1,4463,0.16,13,21,-73.99982,40.73481,MN,1900,0,commercial,696
2,13217,0.02,93,7,-74.00906,40.72480,MN,1900,0,commercial,293
3,4311,0.18,99,21,-74.00836,40.71595,MN,1900,0,commercial,759
4,35891,0.20,206,10,-73.91227,40.81906,BX,1990,1,commercial,7204
...,...,...,...,...,...,...,...,...,...,...,...
705,6414,0.24,104,6,-74.00968,40.72554,MN,1800,0,residential,1525
706,2941,0.12,118,42,-73.99311,40.72600,MN,1990,1,public,343
707,4350,0.07,175,18,-74.01044,40.71474,MN,1990,1,industrial,309
708,20051,0.41,23,12,-73.93708,40.79703,MN,1960,0,commercial,8139


In [7]:
# first we want to check the correlation

df.corr()

  df.corr()


Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,construction_year,digitized,green_roof_area
building_area,1.0,-0.253218,0.3544,-0.095677,0.092302,-0.042756,0.23129,0.009975,0.507824
ratio_green_area,-0.253218,1.0,-0.286326,0.090179,0.143472,0.125715,-0.100057,0.152914,0.300224
roof_height,0.3544,-0.286326,1.0,-0.014516,-0.183468,0.020556,0.26898,-0.045999,0.143411
ground_elev,-0.095677,0.090179,-0.014516,1.0,0.173366,0.247264,-0.129192,0.002722,-0.043563
xcoord,0.092302,0.143472,-0.183468,0.173366,1.0,0.287634,0.116766,0.040407,0.117315
ycoord,-0.042756,0.125715,0.020556,0.247264,0.287634,1.0,0.12372,-0.027729,0.018112
construction_year,0.23129,-0.100057,0.26898,-0.129192,0.116766,0.12372,1.0,0.034201,0.232321
digitized,0.009975,0.152914,-0.045999,0.002722,0.040407,-0.027729,0.034201,1.0,0.193291
green_roof_area,0.507824,0.300224,0.143411,-0.043563,0.117315,0.018112,0.232321,0.193291,1.0


In [8]:
# encoding area_type and borough to check if there is some correlation between them and the target variable

df['borough'].value_counts()

def encode_borough(x):
    if x =='MN':
        return 1
    elif x =='BK':
        return 2
    elif x =='BX':
        return 3
    elif x =='QN':
        return 4
    elif x =='SI':
        return 5
    else:
        return x

df['borough'] = df['borough'].apply(encode_borough)

df['borough'].value_counts()


1    452
2    129
3     77
4     48
5      4
Name: borough, dtype: int64

In [9]:

df['area_type'].value_counts()



residential    340
commercial     243
industrial     112
public          15
Name: area_type, dtype: int64

In [10]:
def encode_area(x):
    if x =='residential':
        return 1
    elif x =='commercial':
        return 2
    elif x =='industrial':
        return 3
    elif x =='public':
        return 4
    else:
        return x

df['area_type'] = df['area_type'].apply(encode_area)

df['area_type'].value_counts()

1    340
2    243
3    112
4     15
Name: area_type, dtype: int64

In [11]:
df.corr()

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,borough,construction_year,digitized,area_type,green_roof_area
building_area,1.0,-0.253218,0.3544,-0.095677,0.092302,-0.042756,0.144644,0.23129,0.009975,-0.067773,0.507824
ratio_green_area,-0.253218,1.0,-0.286326,0.090179,0.143472,0.125715,0.157655,-0.100057,0.152914,-0.036913,0.300224
roof_height,0.3544,-0.286326,1.0,-0.014516,-0.183468,0.020556,-0.239615,0.26898,-0.045999,0.020635,0.143411
ground_elev,-0.095677,0.090179,-0.014516,1.0,0.173366,0.247264,0.063348,-0.129192,0.002722,0.010637,-0.043563
xcoord,0.092302,0.143472,-0.183468,0.173366,1.0,0.287634,0.661047,0.116766,0.040407,-0.017041,0.117315
ycoord,-0.042756,0.125715,0.020556,0.247264,0.287634,1.0,0.034088,0.12372,-0.027729,0.034849,0.018112
borough,0.144644,0.157655,-0.239615,0.063348,0.661047,0.034088,1.0,0.105763,0.141607,-0.044662,0.168842
construction_year,0.23129,-0.100057,0.26898,-0.129192,0.116766,0.12372,0.105763,1.0,0.034201,0.046707,0.232321
digitized,0.009975,0.152914,-0.045999,0.002722,0.040407,-0.027729,0.141607,0.034201,1.0,0.021362,0.193291
area_type,-0.067773,-0.036913,0.020635,0.010637,-0.017041,0.034849,-0.044662,0.046707,0.021362,1.0,0.005466


In [12]:
#CONCLUSION: We have very low correlation between features and target variables.

# This time we'll try KNN model

In [13]:
#first KNN model => with all the variables:
    #alternative1: transformed
    #alternative2: scaled
    #alternative3: transformed and scaled

#second KNN model => with the most 4 or 5 correlated variables with target one
     #alternative1: transformed
     #alternative2: scaled
     #alternative3: transformed and scaled

#third KNN model => from first and second models, we choose the better one,
                        #then we change the parameters of the model
    #alternative1: changing the number of neighbors
    #alternative2: changing the used distance
    #alternative3: changing weights
    
#fourth KNN model => the same as third but this time we research the best K
    




In [14]:
#FIRST MODEL version 1 : with all the variables; transformed with Box-cox

#X y split
X = df.drop(columns = ['green_roof_area','xcoord', 'ycoord'], axis = 1) # here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y = df['green_roof_area']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 19)

#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = PowerTransformer()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer = PowerTransformer()
transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf

 #We get negativae values like in building_area and roof_height. This might not make sense, maybe this transformation don't work here


Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,borough,construction_year,digitized,area_type
0,1.461397,-1.758586,1.724104,0.453573,-0.708570,-0.897217,-0.428174,-1.021177
1,-1.142329,1.216653,-0.540593,0.762655,-0.708570,-1.816605,-0.428174,-1.021177
2,-0.979401,1.319272,-0.714269,-0.588024,-0.708570,-0.897217,-0.428174,-1.021177
3,-1.427746,0.661282,-0.624422,-0.430611,1.532743,1.191149,-0.428174,1.365140
4,0.293870,0.357254,1.012387,1.141792,-0.708570,1.191149,-0.428174,-1.021177
...,...,...,...,...,...,...,...,...
137,-0.817672,-1.344788,-0.101207,-0.243742,-0.708570,-0.897217,-0.428174,-1.021177
138,-1.674327,1.586839,-1.371252,0.209979,1.653465,-0.897217,-0.428174,0.593257
139,1.933489,-0.259393,-1.295353,-1.202839,1.532743,0.426665,2.335497,-1.021177
140,0.364032,-0.818924,0.571275,-0.533653,1.150252,1.191149,-0.428174,-1.021177


In [15]:
#KNN model 

knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train_transf, y_train)

y_pred_train = knn.predict(X_train_transf)
y_pred_test = knn.predict(X_test_transf)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

knn_score = knn.score(X_test_transf, y_test)
print('knn_test score=', knn_score)

r2_score_train = 0.8408415786497311
r2_score_test = 0.38943505535119793
knn_test score= 0.38943505535119793


In [16]:
#FIRST MODEL version 2: with all the variables; scaled with standard scaler

transformer = StandardScaler()
transformer.fit(X_train)
x_train_scaled = transformer.transform(X_train)
X_train_scaled = pd.DataFrame(x_train_scaled, columns = X_train.columns)
X_train_scaled

transformer = StandardScaler()
transformer.fit(X_test)
x_test_scaled = transformer.transform(X_test)
X_test_scaled = pd.DataFrame(x_test_scaled, columns = X_test.columns)
X_test_scaled



Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,borough,construction_year,digitized,area_type
0,1.516453,-0.960217,2.280644,0.165824,-0.623647,-0.869500,-0.428174,-0.905318
1,-0.676631,0.786378,-0.620820,0.548701,-0.623647,-1.991946,-0.428174,-0.905318
2,-0.655677,0.961037,-0.692022,-0.695650,-0.623647,-0.869500,-0.428174,-0.905318
3,-0.703542,0.145960,-0.656421,-0.599931,1.435839,1.150902,-0.428174,1.497582
4,-0.193345,-0.086920,0.749810,1.123016,-0.623647,1.150902,-0.428174,-0.905318
...,...,...,...,...,...,...,...,...
137,-0.629714,-0.843777,-0.389415,-0.472305,-0.623647,-0.869500,-0.428174,-0.905318
138,-0.719316,1.601455,-0.887826,-0.089428,2.465581,-0.869500,-0.428174,0.296132
139,3.063293,-0.436239,-0.870025,-0.982808,1.435839,0.477435,2.335497,-0.905318
140,-0.141113,-0.669118,0.171297,-0.663743,0.406096,1.150902,-0.428174,-0.905318


In [17]:
#KNN Model with n_neighbors = 2

knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train_scaled, y_train)

y_pred_train = knn.predict(X_train_scaled)
y_pred_test = knn.predict(X_test_scaled)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)

print('rmse_train =', rmse_train)
print('rmse_test =', rmse_test)

r2_score_train = 0.834840098994776
r2_score_test = 0.21633912770430808
rmse_train = 1489.635924331801
rmse_test = 3234.1917266915393


In [18]:
#KNN Model n_neighbors = 1

knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(X_train_scaled, y_train)

y_pred_train = knn.predict(X_train_scaled)
y_pred_test = knn.predict(X_test_scaled)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)


print('rmse_train =', rmse_train)
print('rmse_test =', rmse_test)

r2_score_train = 1.0
r2_score_test = 0.14631649235212996
rmse_train = 0.0
rmse_test = 3375.593375067977


In [19]:
#FIRST MODEL version 3: with all the variables; transformed with log

#transformer

def log_transform(x):
    x = np.log10(x) 
    if np.isfinite(x):
        return x
    else:
        return 0

cols = list(X_train.columns)
X_train_log = pd.DataFrame()
X_test_log = pd.DataFrame()
for i in range(len(cols)):
    X_train_log[cols[i]] = X_train[cols[i]].apply(log_transform)
    
for i in range(len(cols)):
    X_test_log[cols[i]] = X_test[cols[i]].apply(log_transform)
    

X_train_log
X_test_log

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,borough,construction_year,digitized,area_type
564,4.826003,0.000000,2.589950,1.681241,0.000000,3.278754,0.0,0.000000
378,3.342817,-0.522879,1.799341,1.778151,0.000000,3.267172,0.0,0.000000
302,3.450403,-0.481486,1.740363,1.322219,0.000000,3.278754,0.0,0.000000
100,3.148294,-0.721246,1.770852,1.380211,0.477121,3.298853,0.0,0.477121
343,4.216931,-0.823909,2.336460,1.892095,0.000000,3.298853,0.0,0.000000
...,...,...,...,...,...,...,...,...
445,3.554852,-1.698970,1.949390,1.447158,0.000000,3.278754,0.0,0.000000
563,2.973590,-0.356547,1.518514,1.602060,0.602060,3.278754,0.0,0.301030
395,5.051866,-1.045757,1.544068,1.079181,0.477121,3.292256,0.0,0.000000
592,4.255803,-1.301030,2.181844,1.342423,0.301030,3.298853,0.0,0.000000


In [20]:
#KNN Model n_neighbors = 11

knn = KNeighborsRegressor(n_neighbors=11)
knn.fit(X_train_log, y_train)

y_pred_train = knn.predict(X_train_log)
y_pred_test = knn.predict(X_test_log)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)


print('rmse_train =', rmse_train)
print('rmse_test =', rmse_test)

r2_score_train = 0.7186932886398405
r2_score_test = 0.6016385814937261
rmse_train = 1944.0967629712393
rmse_test = 2305.899182136871


In [21]:
#FIRST MODEL version 4: with all the variables; scaled with MinMax scaler

transformer = MinMaxScaler()
transformer.fit(X_train)
x_train_scaled2 = transformer.transform(X_train)
X_train_scaled2 = pd.DataFrame(x_train_scaled2, columns = X_train.columns)
X_train_scaled2

transformer = MinMaxScaler()
transformer.fit(X_test)
x_test_scaled2 = transformer.transform(X_test)
X_test_scaled2 = pd.DataFrame(x_test_scaled2, columns = X_test.columns)
X_test_scaled2


Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,borough,construction_year,digitized,area_type
0,0.407565,0.0000,0.651724,0.299320,0.000000,0.357143,0.0,0.000000
1,0.010302,0.3750,0.089655,0.380952,0.000000,0.000000,0.0,0.000000
2,0.014097,0.4125,0.075862,0.115646,0.000000,0.357143,0.0,0.000000
3,0.005427,0.2375,0.082759,0.136054,0.666667,1.000000,0.0,0.666667
4,0.097846,0.1875,0.355172,0.503401,0.000000,1.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...
137,0.018800,0.0250,0.134483,0.163265,0.000000,0.357143,0.0,0.000000
138,0.002569,0.5500,0.037931,0.244898,1.000000,0.357143,0.0,0.333333
139,0.687766,0.1125,0.041379,0.054422,0.666667,0.785714,1.0,0.000000
140,0.107307,0.0625,0.243103,0.122449,0.333333,1.000000,0.0,0.000000


In [22]:
#KNN Model n_neighbors = 3

knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train_scaled2, y_train)

y_pred_train = knn.predict(X_train_scaled2)
y_pred_test = knn.predict(X_test_scaled2)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)


print('rmse_train =', rmse_train)
print('rmse_test =', rmse_test)

r2_score_train = 0.5497079842377404
r2_score_test = 0.24542453206428883
rmse_train = 2459.6592923529342
rmse_test = 3173.6060914100626


In [23]:
#SECOND MODEL version 1 : with choosen variables with more correlation to target variable; scalled with Standard Scaler

#X y split
X2 = df.drop(columns = ['green_roof_area','xcoord', 'ycoord','roof_height','ground_elev',\
                       'borough','area_type'], axis = 1) 
# here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y2 = df['green_roof_area']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 19)

#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer = StandardScaler()
transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf


Unnamed: 0,building_area,ratio_green_area,construction_year,digitized
0,1.516453,-0.960217,-0.869500,-0.428174
1,-0.676631,0.786378,-1.991946,-0.428174
2,-0.655677,0.961037,-0.869500,-0.428174
3,-0.703542,0.145960,1.150902,-0.428174
4,-0.193345,-0.086920,1.150902,-0.428174
...,...,...,...,...
137,-0.629714,-0.843777,-0.869500,-0.428174
138,-0.719316,1.601455,-0.869500,-0.428174
139,3.063293,-0.436239,0.477435,2.335497
140,-0.141113,-0.669118,1.150902,-0.428174


In [24]:
knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(X_train_transf, y_train)

y_pred_train = knn.predict(X_train_transf)
y_pred_test = knn.predict(X_test_transf)

print('r2_score_train =', r2_score(y_train, y_pred_train))
print('r2_score_test =', r2_score(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)


print('rmse_train =', rmse_train)
print('rmse_test =', rmse_test)

r2_score_train = 1.0
r2_score_test = 0.7403400251072894
rmse_train = 0.0
rmse_test = 1861.6763667099062


In [25]:
df.describe()

Unnamed: 0,building_area,ratio_green_area,roof_height,ground_elev,xcoord,ycoord,borough,construction_year,digitized,area_type,green_roof_area
count,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0,710.0
mean,21041.776056,0.173028,137.904225,40.660563,-73.966258,40.745384,1.623944,1942.802817,0.201408,1.721127,2515.178873
std,27257.680419,0.164256,130.209084,31.682424,0.045733,0.052617,0.962332,43.075071,0.401335,0.802661,3665.914314
min,522.0,0.0,8.0,-4.0,-74.07732,40.57222,1.0,1800.0,0.0,1.0,10.0
25%,4269.75,0.06,51.0,15.0,-73.99605,40.717375,1.0,1900.0,0.0,1.0,422.5
50%,11731.0,0.12,85.0,32.0,-73.978575,40.74133,1.0,1930.0,0.0,2.0,1124.0
75%,26406.75,0.22,183.0,61.0,-73.950807,40.773702,2.0,1990.0,0.0,2.0,2840.75
max,178941.0,0.83,755.0,157.0,-73.75132,40.89442,5.0,1990.0,1.0,4.0,28669.0


In [26]:
#CONCLUSIONS:
    # KNN perfoms a lot better than the linear model
    # By removing outliers, this turned the knn results worse
    # it seems that the data was very sensitive by removing rows
    
# NEXT_STEPS:
    # instead of remove outliers try to replace them by the closest whisker
    # changing some parameter on knn function??
    #use cross validation?
    #it's very likely that the data set is very little and therefore we cannot prevent overfitting unless we get more data