In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 
from imblearn.over_sampling import SMOTE
import math

In [2]:
# MAIN_GOAL: Predict if a roof was digitized or not

In [3]:
df = pd.read_csv('../Data/cleaned_data/green_roofs2.csv')

df

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,borough,xcoord,ycoord,area_type
0,971,14057,0.07,1900,59,90,1,BK,-73.93491,40.67389,public
1,696,4463,0.16,1900,13,21,0,MN,-73.99982,40.73481,commercial
2,293,13217,0.02,1900,93,7,0,MN,-74.00906,40.72480,commercial
3,759,4311,0.18,1900,99,21,0,MN,-74.00836,40.71595,commercial
4,7204,35891,0.20,1990,206,10,1,BX,-73.91227,40.81906,commercial
...,...,...,...,...,...,...,...,...,...,...,...
725,1525,6414,0.24,1800,104,6,0,MN,-74.00968,40.72554,residential
726,343,2941,0.12,1990,118,42,1,MN,-73.99311,40.72600,public
727,309,4350,0.07,1990,175,18,1,MN,-74.01044,40.71474,industrial
728,8139,20051,0.41,1960,23,12,0,MN,-73.93708,40.79703,commercial


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   green_roof_area    730 non-null    int64  
 1   building_area      730 non-null    int64  
 2   ratio_green_area   730 non-null    float64
 3   construction_year  730 non-null    int64  
 4   roof_height        730 non-null    int64  
 5   ground_elev        730 non-null    int64  
 6   digitized          730 non-null    int64  
 7   borough            730 non-null    object 
 8   xcoord             730 non-null    float64
 9   ycoord             730 non-null    float64
 10  area_type          730 non-null    object 
dtypes: float64(3), int64(6), object(2)
memory usage: 62.9+ KB


In [5]:
df['construction_year'] = df['construction_year'].apply(str) 

In [6]:
#SECOND MODEL version 1 : with choosen variables with more correlation to target variable; scalled with Standard Scaler

#X y split
X2 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'borough','area_type','digitized'], axis = 1) 
# here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y2 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 19)

#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer = StandardScaler()
transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf


Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year
0,0.623025,0.270266,0.392065,-0.925846
1,-0.263997,0.147379,-0.836001,1.223556
2,-0.390678,-0.531385,-0.631324,-0.925846
3,0.211978,1.102779,-0.631324,0.507089
4,3.664260,0.936836,2.302392,1.223556
...,...,...,...,...
141,-0.220161,-0.387250,-0.017290,1.223556
142,-0.328799,-0.520384,-0.017290,-0.209379
143,0.182881,0.223487,-0.153742,-0.209379
144,-0.365393,-0.603915,1.142551,-0.925846


In [7]:
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_transf,y_train)

In [8]:
display(X_train_SMOTE)
display(y_train_SMOTE)

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year
0,-0.316003,-0.716081,3.150691,1.063548
1,-0.505105,-0.750036,0.494748,-2.198106
2,-0.495841,-0.382260,-0.917988,-1.033229
3,-0.433170,-0.210634,-0.804969,-1.033229
4,-0.517821,-0.750001,-0.070347,-0.334304
...,...,...,...,...
919,-0.105345,-0.570424,1.319355,0.778655
920,-0.081443,-0.352885,0.181483,1.063548
921,-0.502227,-0.661561,-0.813288,-1.033229
922,-0.172755,0.068800,-0.537948,1.063548


0      0
1      0
2      1
3      0
4      1
      ..
919    1
920    1
921    1
922    1
923    1
Name: digitized, Length: 924, dtype: int64

In [9]:
#KNN CLASSIFIER MODEL

knn = KNeighborsClassifier(n_neighbors=4, p=2)
knn.fit(X_train_SMOTE, y_train_SMOTE)

y_train_pred = knn.predict(X_train_SMOTE)
y_test_pred = knn.predict(X_test_transf)

print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print("\n")
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))
print("\n")
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))
print("\n")
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))


The accuracy in the TRAIN set is: 0.874
The accuracy in the TEST  set is: 0.699


The precission in the TRAIN set is: 0.884
The precission in the TEST  set is: 0.388


The recall in the TRAIN set is: 0.861
The recall in the TEST  set is: 0.576


The F1-score for the TRAIN set is 0.87
The F1-score for the TEST set is 0.46


In [11]:
#SECOND MODEL version 2 : with other choosen variables; scalled with MinMax Scaler

#X y split
X3 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'digitized','ratio_green_area'], axis = 1) 
#here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y3 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size = 0.2, random_state = 19)


In [12]:
X_train

Unnamed: 0,green_roof_area,building_area,construction_year,borough,area_type
98,1232,1670,1990,BX,residential
502,191,716,1850,MN,residential
398,242,11049,1900,MN,commercial
594,587,15871,1900,MN,residential
150,121,717,1930,BK,residential
...,...,...,...,...,...
308,80,4862,1990,BK,commercial
19,931,22367,1990,BK,residential
354,235,1370,1900,BK,residential
622,737,13346,1960,MN,residential


In [13]:
X_test.iloc[[100],[2]]

Unnamed: 0,construction_year
211,1990


In [14]:
X_test.iloc[[100],[3]]

Unnamed: 0,borough
211,MN


In [None]:
X_test

In [15]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[2]] = X_test.iloc[[100],[2]].replace('1990','1800')

X_test.iloc[[100],[3]] = X_test.iloc[[100],[3]].replace('MN','SI')

In [16]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat2 = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat2 = X_test.select_dtypes(['object'])

In [17]:
#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf


transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area
0,0.623025,0.270266
1,-0.263997,0.147379
2,-0.390678,-0.531385
3,0.211978,1.102779
4,3.664260,0.936836
...,...,...
141,-0.220161,-0.387250
142,-0.328799,-0.520384
143,0.182881,0.223487
144,-0.365393,-0.603915


In [19]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc2 = encoder.fit_transform(X_train_cat2).toarray()
X_train_cat_enc2 = pd.DataFrame(x_train_cat_enc2, columns = encoder.get_feature_names_out())
X_train_cat_enc2

x_test_cat_enc2 = encoder.fit_transform(X_test_cat2).toarray()
X_test_cat_enc2 = pd.DataFrame(x_test_cat_enc2, columns = encoder.get_feature_names_out())
X_test_cat_enc2


Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
142,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
143,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
144,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
X_train_cat_enc2.shape

(584, 15)

In [21]:
#concat

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc2.reset_index(drop = True, inplace = True)
X_test_cat_enc2.reset_index(drop = True, inplace = True)

X_train_treated5 = pd.concat([X_train_num_transf, X_train_cat_enc2], axis = 1)
X_train_treated5
X_test_treated5 = pd.concat([X_test_num_transf, X_test_cat_enc2], axis = 1)
X_test_treated5

Unnamed: 0,green_roof_area,building_area,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.623025,0.270266,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.263997,0.147379,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.390678,-0.531385,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.211978,1.102779,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.664260,0.936836,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,-0.220161,-0.387250,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
142,-0.328799,-0.520384,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
143,0.182881,0.223487,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
144,-0.365393,-0.603915,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
X_train_cat_enc2.shape

(584, 15)

In [23]:
X_test_cat_enc2

Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
142,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
143,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
144,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
sm = SMOTE(random_state=100,k_neighbors=4)
X_train_SMOTE3,y_train_SMOTE3 = sm.fit_resample(X_train_treated5,y_train)

In [25]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train_SMOTE3, y_train_SMOTE3)

y_train_pred = knn.predict(X_train_SMOTE3)
y_test_pred = knn.predict(X_test_treated5)

print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE3, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print("\n")
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE3, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))
print("\n")
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE3, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))
print("\n")
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE3,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The accuracy in the TRAIN set is: 0.868
The accuracy in the TEST  set is: 0.658


The precission in the TRAIN set is: 0.866
The precission in the TEST  set is: 0.319


The recall in the TRAIN set is: 0.870
The recall in the TEST  set is: 0.455


The F1-score for the TRAIN set is 0.87
The F1-score for the TEST set is 0.38


In [26]:
#SECOND MODEL version 2 : with other choosen variables;

#X y split
X4 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'digitized','ratio_green_area','construction_year'], axis = 1) 
#here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y4 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size = 0.2, random_state = 19)


In [27]:
X_test.iloc[[100],[2]]

Unnamed: 0,borough
211,MN


In [28]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[2]] = X_test.iloc[[100],[2]].replace('MN','SI')


In [29]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat2 = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat2 = X_test.select_dtypes(['object'])

In [30]:
#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf


transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area
0,0.623025,0.270266
1,-0.263997,0.147379
2,-0.390678,-0.531385
3,0.211978,1.102779
4,3.664260,0.936836
...,...,...
141,-0.220161,-0.387250
142,-0.328799,-0.520384
143,0.182881,0.223487
144,-0.365393,-0.603915


In [31]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc2 = encoder.fit_transform(X_train_cat2).toarray()
X_train_cat_enc2 = pd.DataFrame(x_train_cat_enc2, columns = encoder.get_feature_names_out())
X_train_cat_enc2

x_test_cat_enc2 = encoder.fit_transform(X_test_cat2).toarray()
X_test_cat_enc2 = pd.DataFrame(x_test_cat_enc2, columns = encoder.get_feature_names_out())
X_test_cat_enc2


Unnamed: 0,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
141,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
142,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
143,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
144,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [32]:
#concat

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc2.reset_index(drop = True, inplace = True)
X_test_cat_enc2.reset_index(drop = True, inplace = True)

X_train_treated5 = pd.concat([X_train_num_transf, X_train_cat_enc2], axis = 1)
X_train_treated5
X_test_treated5 = pd.concat([X_test_num_transf, X_test_cat_enc2], axis = 1)
X_test_treated5

Unnamed: 0,green_roof_area,building_area,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.623025,0.270266,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.263997,0.147379,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.390678,-0.531385,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.211978,1.102779,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.664260,0.936836,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
141,-0.220161,-0.387250,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
142,-0.328799,-0.520384,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
143,0.182881,0.223487,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
144,-0.365393,-0.603915,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE4,y_train_SMOTE4 = sm.fit_resample(X_train_treated5,y_train)

In [34]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train_SMOTE4, y_train_SMOTE4)

y_train_pred = knn.predict(X_train_SMOTE4)
y_test_pred = knn.predict(X_test_treated5)

print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE4, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print("\n")
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE4, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))
print("\n")
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE4, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))
print("\n")
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE4,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The accuracy in the TRAIN set is: 0.863
The accuracy in the TEST  set is: 0.671


The precission in the TRAIN set is: 0.882
The precission in the TEST  set is: 0.326


The recall in the TRAIN set is: 0.838
The recall in the TEST  set is: 0.424


The F1-score for the TRAIN set is 0.86
The F1-score for the TEST set is 0.37


In [None]:
#CONCLUSIONS: With smote to treat the imbalance, we got very significant improves in the recall and f1.
            # Better resulst with KNN but no good enough.
            # We could apply another method to treat the class imbalance but maybe with this data frame
            # we can get no better results because of its size
    
  
    