In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 

import math

In [2]:
# MAIN_GOAL: Predict if a roof was digitized or not

In [3]:
df = pd.read_csv('../Data/cleaned_data/green_roofs3.csv')

df

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,borough,xcoord,ycoord,area_type
0,971,14057,0.07,1900,59,90,1,BK,-73.93491,40.67389,public
1,696,4463,0.16,1900,13,21,0,MN,-73.99982,40.73481,commercial
2,293,13217,0.02,1900,93,7,0,MN,-74.00906,40.72480,commercial
3,759,4311,0.18,1900,99,21,0,MN,-74.00836,40.71595,commercial
4,7204,35891,0.20,1990,206,10,1,BX,-73.91227,40.81906,commercial
...,...,...,...,...,...,...,...,...,...,...,...
705,1525,6414,0.24,1800,104,6,0,MN,-74.00968,40.72554,residential
706,343,2941,0.12,1990,118,42,1,MN,-73.99311,40.72600,public
707,309,4350,0.07,1990,175,18,1,MN,-74.01044,40.71474,industrial
708,8139,20051,0.41,1960,23,12,0,MN,-73.93708,40.79703,commercial


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   green_roof_area    710 non-null    int64  
 1   building_area      710 non-null    int64  
 2   ratio_green_area   710 non-null    float64
 3   construction_year  710 non-null    int64  
 4   roof_height        710 non-null    int64  
 5   ground_elev        710 non-null    int64  
 6   digitized          710 non-null    int64  
 7   borough            710 non-null    object 
 8   xcoord             710 non-null    float64
 9   ycoord             710 non-null    float64
 10  area_type          710 non-null    object 
dtypes: float64(3), int64(6), object(2)
memory usage: 61.1+ KB


In [5]:
df['construction_year'] = df['construction_year'].apply(str) 

In [6]:
#first Logistic binary model => with all the variables
    #alternative1: transformed
    #alternative2: scaled
    #alternative3: transformed and scaled

#second Logistic binary model => with less features
     #alternative1: transformed
     #alternative2: scaled
     #alternative3: transformed and scaled


In [7]:
#FIRST MODEL version 1 : with all the variables; transformed with Box-cox

#X y split
X = df.drop(columns = ['digitized','xcoord', 'ycoord'], axis = 1) # here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 19)



In [8]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[3]] = X_test.iloc[[100],[3]].replace('1930','1800')

X_test.iloc[[100],[6]] = X_test.iloc[[100],[6]].replace('BK','SI')


In [9]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat = X_test.select_dtypes(['object'])

#transforming numericals

transformer = PowerTransformer()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf

transformer = PowerTransformer()
transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev
0,-1.520596,1.461397,-1.758586,1.724104,0.453573
1,-0.302921,-1.142329,1.216653,-0.540593,0.762655
2,-0.044016,-0.979401,1.319272,-0.714269,-0.588024
3,-1.001866,-1.427746,0.661282,-0.624422,-0.430611
4,0.694923,0.293870,0.357254,1.012387,1.141792
...,...,...,...,...,...
137,-2.013464,-0.817672,-1.344788,-0.101207,-0.243742
138,-0.660334,-1.674327,1.586839,-1.371252,0.209979
139,1.645096,1.933489,-0.259393,-1.295353,-1.202839
140,-0.099701,0.364032,-0.818924,0.571275,-0.533653


In [10]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc = encoder.fit_transform(X_train_cat).toarray()
X_train_cat_enc = pd.DataFrame(x_train_cat_enc, columns = encoder.get_feature_names_out())
X_train_cat_enc

x_test_cat_enc = encoder.fit_transform(X_test_cat).toarray()
X_test_cat_enc = pd.DataFrame(x_test_cat_enc, columns = encoder.get_feature_names_out())
X_test_cat_enc



Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
#Concat transformed variables a encoded variables

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc.reset_index(drop = True, inplace = True)
X_test_cat_enc.reset_index(drop = True, inplace = True)

X_train_treated = pd.concat([X_train_num_transf, X_train_cat_enc], axis = 1)
X_train_treated
X_test_treated = pd.concat([X_test_num_transf, X_test_cat_enc], axis = 1)
X_test_treated



Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,-1.520596,1.461397,-1.758586,1.724104,0.453573,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.302921,-1.142329,1.216653,-0.540593,0.762655,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.044016,-0.979401,1.319272,-0.714269,-0.588024,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.001866,-1.427746,0.661282,-0.624422,-0.430611,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.694923,0.293870,0.357254,1.012387,1.141792,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,-2.013464,-0.817672,-1.344788,-0.101207,-0.243742,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,-0.660334,-1.674327,1.586839,-1.371252,0.209979,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,1.645096,1.933489,-0.259393,-1.295353,-1.202839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,-0.099701,0.364032,-0.818924,0.571275,-0.533653,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
X_train_treated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   green_roof_area         568 non-null    float64
 1   building_area           568 non-null    float64
 2   ratio_green_area        568 non-null    float64
 3   roof_height             568 non-null    float64
 4   ground_elev             568 non-null    float64
 5   construction_year_1800  568 non-null    float64
 6   construction_year_1850  568 non-null    float64
 7   construction_year_1900  568 non-null    float64
 8   construction_year_1930  568 non-null    float64
 9   construction_year_1960  568 non-null    float64
 10  construction_year_1990  568 non-null    float64
 11  borough_BK              568 non-null    float64
 12  borough_BX              568 non-null    float64
 13  borough_MN              568 non-null    float64
 14  borough_QN              568 non-null    fl

In [13]:
#Logistic model 

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_treated, y_train)

y_train_pred = classification.predict(X_train_treated)
y_test_pred = classification.predict(X_test_treated)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[442,   5],
       [114,   7]], dtype=int64)

array([[116,   4],
       [ 21,   1]], dtype=int64)

In [14]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.790
The accuracy in the TEST  set is: 0.824


In [15]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.583
The precission in the TEST  set is: 0.200


In [16]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.058
The recall in the TEST  set is: 0.045


In [17]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.11
The F1-score for the TEST set is 0.07


In [18]:
#Very bad results on f1, precision and recall.

In [19]:
#FIRST MODEL version 2: with all the variables; scaled with standard scaler


transformer = StandardScaler()
transformer.fit(X_train_num)
x_train_num_scaled = transformer.transform(X_train_num)
X_train_num_scaled = pd.DataFrame(x_train_num_scaled, columns = X_train_num.columns)
X_train_num_scaled

transformer.fit(X_test_num)
x_test_num_scaled = transformer.transform(X_test_num)
X_test_num_scaled = pd.DataFrame(x_test_num_scaled, columns = X_test_num.columns)
X_test_num_scaled


Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev
0,-0.624342,1.516453,-0.960217,2.280644,0.165824
1,-0.482557,-0.676631,0.786378,-0.620820,0.548701
2,-0.409202,-0.655677,0.961037,-0.692022,-0.695650
3,-0.589306,-0.703542,0.145960,-0.656421,-0.599931
4,0.031205,-0.193345,-0.086920,0.749810,1.123016
...,...,...,...,...,...
137,-0.641859,-0.629714,-0.843777,-0.389415,-0.472305
138,-0.549344,-0.719316,1.601455,-0.887826,-0.089428
139,2.082151,3.063293,-0.436239,-0.870025,-0.982808
140,-0.427267,-0.141113,-0.669118,0.171297,-0.663743


In [20]:
#concat cat_encoded with num_scaled

X_train_num_scaled.reset_index(drop = True, inplace = True)
X_test_num_scaled.reset_index(drop = True, inplace = True)


X_train_treated2 = pd.concat([X_train_num_scaled, X_train_cat_enc], axis = 1)
X_train_treated2
X_test_treated2 = pd.concat([X_test_num_scaled, X_test_cat_enc], axis = 1)
X_test_treated2


Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,-0.624342,1.516453,-0.960217,2.280644,0.165824,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.482557,-0.676631,0.786378,-0.620820,0.548701,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.409202,-0.655677,0.961037,-0.692022,-0.695650,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.589306,-0.703542,0.145960,-0.656421,-0.599931,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.031205,-0.193345,-0.086920,0.749810,1.123016,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,-0.641859,-0.629714,-0.843777,-0.389415,-0.472305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,-0.549344,-0.719316,1.601455,-0.887826,-0.089428,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,2.082151,3.063293,-0.436239,-0.870025,-0.982808,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,-0.427267,-0.141113,-0.669118,0.171297,-0.663743,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
#Logistic model 

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_treated2, y_train)

y_train_pred = classification.predict(X_train_treated2)
y_test_pred = classification.predict(X_test_treated2)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[437,  10],
       [113,   8]], dtype=int64)

array([[116,   4],
       [ 20,   2]], dtype=int64)

In [22]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.783
The accuracy in the TEST  set is: 0.831


In [23]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.444
The precission in the TEST  set is: 0.333


In [24]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.066
The recall in the TEST  set is: 0.091


In [25]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.12
The F1-score for the TEST set is 0.14


In [26]:
#FIRST MODEL version 3: with all the variables; transformed with log

#transformer

def log_transform(x):
    x = np.log10(x) 
    if np.isfinite(x):
        return x
    else:
        return 0

cols = list(X_train_num.columns)
X_train_num_log = pd.DataFrame()
X_test_num_log = pd.DataFrame()
for i in range(len(cols)):
    X_train_num_log[cols[i]] = X_train_num[cols[i]].apply(log_transform)
    
for i in range(len(cols)):
    X_test_num_log[cols[i]] = X_test_num[cols[i]].apply(log_transform)
    

X_train_num_log
X_test_num_log

Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev
564,2.158362,4.826003,0.000000,2.589950,1.681241
378,2.820858,3.342817,-0.522879,1.799341,1.778151
302,2.968483,3.450403,-0.481486,1.740363,1.322219
100,2.434569,3.148294,-0.721246,1.770852,1.380211
343,3.404663,4.216931,-0.823909,2.336460,1.892095
...,...,...,...,...,...
445,1.903090,3.554852,-1.698970,1.949390,1.447158
563,2.621176,2.973590,-0.356547,1.518514,1.602060
395,4.001388,5.051866,-1.045757,1.544068,1.079181
592,2.936514,4.255803,-1.301030,2.181844,1.342423


In [27]:
X_train_num_log

Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev
161,3.324694,4.256862,-0.920819,1.968483,1.491362
104,3.781253,3.984707,-0.200659,1.838849,1.146128
205,2.330414,4.393961,-2.000000,2.521138,1.568202
456,2.755875,3.891203,-1.154902,2.184691,1.146128
129,3.947189,4.645874,-0.698970,2.472756,1.908485
...,...,...,...,...,...
308,3.644636,4.552838,-0.920819,1.176091,1.176091
19,3.448242,4.121954,-0.677781,1.591065,1.113943
354,3.487280,4.459302,-0.958607,2.531479,1.397940
622,2.592177,3.901622,-1.301030,2.152288,0.778151


In [28]:
X_train_cat_enc

Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
564,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
565,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
566,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [29]:
X_test_cat_enc

Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [30]:
681-142

539

In [31]:
#concat cat_encoded with num_scaled

X_train_num_log.reset_index(drop = True, inplace = True)
X_test_num_log.reset_index(drop = True, inplace = True)

X_train_treated3 = pd.concat([X_train_num_log, X_train_cat_enc], axis = 1)
X_train_treated3
X_test_treated3 = pd.concat([X_test_num_log, X_test_cat_enc], axis = 1)
X_test_treated3


Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,2.158362,4.826003,0.000000,2.589950,1.681241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.820858,3.342817,-0.522879,1.799341,1.778151,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.968483,3.450403,-0.481486,1.740363,1.322219,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.434569,3.148294,-0.721246,1.770852,1.380211,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3.404663,4.216931,-0.823909,2.336460,1.892095,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,1.903090,3.554852,-1.698970,1.949390,1.447158,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,2.621176,2.973590,-0.356547,1.518514,1.602060,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,4.001388,5.051866,-1.045757,1.544068,1.079181,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,2.936514,4.255803,-1.301030,2.181844,1.342423,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [32]:
classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_treated3, y_train)

y_train_pred = classification.predict(X_train_treated3)
y_test_pred = classification.predict(X_test_treated3)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[441,   6],
       [115,   6]], dtype=int64)

array([[117,   3],
       [ 21,   1]], dtype=int64)

In [33]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.787
The accuracy in the TEST  set is: 0.831


In [34]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.500
The precission in the TEST  set is: 0.250


In [35]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.050
The recall in the TEST  set is: 0.045


In [36]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.09
The F1-score for the TEST set is 0.08


In [37]:
#FIRST MODEL version 4: with all the variables; scaled with MinMax scaler

transformer = MinMaxScaler()
transformer.fit(X_train_num)
x_train_num_scaled2 = transformer.transform(X_train_num)
X_train_num_scaled2 = pd.DataFrame(x_train_num_scaled2, columns = X_train_num.columns)
X_train_num_scaled2

transformer.fit(X_test_num)
x_test_num_scaled2 = transformer.transform(X_test_num)
X_test_num_scaled2 = pd.DataFrame(x_test_num_scaled2, columns = X_test_num.columns)
X_test_num_scaled2

Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev
0,0.002686,0.407565,0.0000,0.651724,0.299320
1,0.021747,0.010302,0.3750,0.089655,0.380952
2,0.031609,0.014097,0.4125,0.075862,0.115646
3,0.007396,0.005427,0.2375,0.082759,0.136054
4,0.090815,0.097846,0.1875,0.355172,0.503401
...,...,...,...,...,...
137,0.000331,0.018800,0.0250,0.134483,0.163265
138,0.012769,0.002569,0.5500,0.037931,0.244898
139,0.366537,0.687766,0.1125,0.041379,0.054422
140,0.029180,0.107307,0.0625,0.243103,0.122449


In [38]:
#concat cat_encoded with num_scaled2

X_train_num_scaled2.reset_index(drop = True, inplace = True)
X_test_num_scaled2.reset_index(drop = True, inplace = True)

X_train_treated4 = pd.concat([X_train_num_scaled2, X_train_cat_enc], axis = 1)
X_train_treated4
X_test_treated4 = pd.concat([X_test_num_scaled2, X_test_cat_enc], axis = 1)
X_test_treated4

Unnamed: 0,green_roof_area,building_area,ratio_green_area,roof_height,ground_elev,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.002686,0.407565,0.0000,0.651724,0.299320,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.021747,0.010302,0.3750,0.089655,0.380952,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.031609,0.014097,0.4125,0.075862,0.115646,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.007396,0.005427,0.2375,0.082759,0.136054,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.090815,0.097846,0.1875,0.355172,0.503401,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.000331,0.018800,0.0250,0.134483,0.163265,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.012769,0.002569,0.5500,0.037931,0.244898,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.366537,0.687766,0.1125,0.041379,0.054422,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.029180,0.107307,0.0625,0.243103,0.122449,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_treated4, y_train)

y_train_pred = classification.predict(X_train_treated4)
y_test_pred = classification.predict(X_test_treated4)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[440,   7],
       [115,   6]], dtype=int64)

array([[117,   3],
       [ 20,   2]], dtype=int64)

In [40]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.785
The accuracy in the TEST  set is: 0.838


In [41]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.462
The precission in the TEST  set is: 0.400


In [42]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.050
The recall in the TEST  set is: 0.091


In [43]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.09
The F1-score for the TEST set is 0.15


In [44]:
#SECOND MODEL version 1 : with choosen variables with more correlation to target variable; scalled with Standard Scaler

#X y split
X2 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'borough','area_type','digitized'], axis = 1) 
# here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y2 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 19)

#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer = StandardScaler()
transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf


Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year
0,-0.624342,1.516453,-0.960217,-0.869500
1,-0.482557,-0.676631,0.786378,-1.991946
2,-0.409202,-0.655677,0.961037,-0.869500
3,-0.589306,-0.703542,0.145960,1.150902
4,0.031205,-0.193345,-0.086920,1.150902
...,...,...,...,...
137,-0.641859,-0.629714,-0.843777,-0.869500
138,-0.549344,-0.719316,1.601455,-0.869500
139,2.082151,3.063293,-0.436239,0.477435
140,-0.427267,-0.141113,-0.669118,1.150902


In [45]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_transf, y_train)

y_train_pred = classification.predict(X_train_transf)
y_test_pred = classification.predict(X_test_transf)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[442,   5],
       [116,   5]], dtype=int64)

array([[119,   1],
       [ 20,   2]], dtype=int64)

In [46]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.787
The accuracy in the TEST  set is: 0.852


In [47]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.500
The precission in the TEST  set is: 0.667


In [48]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.041
The recall in the TEST  set is: 0.091


In [49]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.08
The F1-score for the TEST set is 0.16


In [50]:
#SECOND MODEL version 2 : with other choosen variables; scalled with MinMax Scaler

#X y split
X3 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'digitized','ratio_green_area'], axis = 1) 
#here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y3 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size = 0.2, random_state = 19)


In [51]:
X_train

Unnamed: 0,green_roof_area,building_area,construction_year,borough,area_type
161,2112,18066,1990,BX,industrial
104,6043,9654,1990,MN,residential
205,214,24772,1960,MN,commercial
456,570,7784,1960,MN,commercial
129,8855,44246,1960,BK,residential
...,...,...,...,...,...
308,4412,35714,1990,BK,residential
19,2807,13242,1930,BK,industrial
354,3071,28794,1990,MN,industrial
622,391,7973,1960,MN,commercial


In [54]:
X_test.iloc[[100],[2]]

Unnamed: 0,construction_year
276,1930


In [53]:
X_test.iloc[[100],[3]]

Unnamed: 0,borough
276,BK


In [55]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[2]] = X_test.iloc[[100],[2]].replace('1930','1800')

X_test.iloc[[100],[3]] = X_test.iloc[[100],[3]].replace('BK','SI')

In [56]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat2 = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat2 = X_test.select_dtypes(['object'])

In [57]:
#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = MinMaxScaler()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf

transformer = MinMaxScaler()
transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area
0,0.002686,0.407565
1,0.021747,0.010302
2,0.031609,0.014097
3,0.007396,0.005427
4,0.090815,0.097846
...,...,...
137,0.000331,0.018800
138,0.012769,0.002569
139,0.366537,0.687766
140,0.029180,0.107307


In [58]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc2 = encoder.fit_transform(X_train_cat2).toarray()
X_train_cat_enc2 = pd.DataFrame(x_train_cat_enc2, columns = encoder.get_feature_names_out())
X_train_cat_enc2

x_test_cat_enc2 = encoder.fit_transform(X_test_cat2).toarray()
X_test_cat_enc2 = pd.DataFrame(x_test_cat_enc2, columns = encoder.get_feature_names_out())
X_test_cat_enc2


Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
X_train_cat_enc2.shape

In [59]:
#concat

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc2.reset_index(drop = True, inplace = True)
X_test_cat_enc2.reset_index(drop = True, inplace = True)

X_train_treated5 = pd.concat([X_train_num_transf, X_train_cat_enc2], axis = 1)
X_train_treated5
X_test_treated5 = pd.concat([X_test_num_transf, X_test_cat_enc2], axis = 1)
X_test_treated5

Unnamed: 0,green_roof_area,building_area,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.002686,0.407565,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.021747,0.010302,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.031609,0.014097,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.007396,0.005427,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.090815,0.097846,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.000331,0.018800,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.012769,0.002569,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.366537,0.687766,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.029180,0.107307,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_treated5, y_train)

y_train_pred = classification.predict(X_train_treated5)
y_test_pred = classification.predict(X_test_treated5)

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[442,   5],
       [115,   6]], dtype=int64)

array([[117,   3],
       [ 20,   2]], dtype=int64)

In [None]:
#CONCLUSIONS: After several modifications on the features selecting, used transformers, the results still are very bad.
                # probably what is causing this is class imbalance or the size of data set is not enough
                    # to the model performs good
    
  
    
# NEXT_STEPS:
            #Deal with class imbalance
            #Use another mother like KNN classifier
    