In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 
from imblearn.over_sampling import SMOTE
import math

In [2]:
# MAIN_GOAL: Predict if a roof was digitized or not

In [3]:
df = pd.read_csv('../Data/cleaned_data/green_roofs3.csv')

df

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year,roof_height,ground_elev,digitized,borough,xcoord,ycoord,area_type
0,971,14057,0.07,1900,59,90,1,BK,-73.93491,40.67389,public
1,696,4463,0.16,1900,13,21,0,MN,-73.99982,40.73481,commercial
2,293,13217,0.02,1900,93,7,0,MN,-74.00906,40.72480,commercial
3,759,4311,0.18,1900,99,21,0,MN,-74.00836,40.71595,commercial
4,7204,35891,0.20,1990,206,10,1,BX,-73.91227,40.81906,commercial
...,...,...,...,...,...,...,...,...,...,...,...
705,1525,6414,0.24,1800,104,6,0,MN,-74.00968,40.72554,residential
706,343,2941,0.12,1990,118,42,1,MN,-73.99311,40.72600,public
707,309,4350,0.07,1990,175,18,1,MN,-74.01044,40.71474,industrial
708,8139,20051,0.41,1960,23,12,0,MN,-73.93708,40.79703,commercial


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   green_roof_area    710 non-null    int64  
 1   building_area      710 non-null    int64  
 2   ratio_green_area   710 non-null    float64
 3   construction_year  710 non-null    int64  
 4   roof_height        710 non-null    int64  
 5   ground_elev        710 non-null    int64  
 6   digitized          710 non-null    int64  
 7   borough            710 non-null    object 
 8   xcoord             710 non-null    float64
 9   ycoord             710 non-null    float64
 10  area_type          710 non-null    object 
dtypes: float64(3), int64(6), object(2)
memory usage: 61.1+ KB


In [5]:
df['construction_year'] = df['construction_year'].apply(str) 

In [6]:
#SECOND MODEL version 1 : with choosen variables with more correlation to target variable; scalled with Standard Scaler

#X y split
X2 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'borough','area_type','digitized'], axis = 1) 
# here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y2 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 19)

#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer = StandardScaler()
transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf


Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year
0,-0.624342,1.516453,-0.960217,-0.869500
1,-0.482557,-0.676631,0.786378,-1.991946
2,-0.409202,-0.655677,0.961037,-0.869500
3,-0.589306,-0.703542,0.145960,1.150902
4,0.031205,-0.193345,-0.086920,1.150902
...,...,...,...,...
137,-0.641859,-0.629714,-0.843777,-0.869500
138,-0.549344,-0.719316,1.601455,-0.869500
139,2.082151,3.063293,-0.436239,0.477435
140,-0.427267,-0.141113,-0.669118,1.150902


In [7]:
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_transf,y_train)

In [8]:
display(X_train_SMOTE)
display(y_train_SMOTE)

Unnamed: 0,green_roof_area,building_area,ratio_green_area,construction_year
0,-0.116145,-0.100983,-0.339591,1.084014
1,0.956300,-0.416940,2.806320,1.084014
2,-0.633953,0.150897,-1.018121,0.379797
3,-0.536830,-0.487177,-0.648013,0.379797
4,1.723463,0.882345,0.153886,0.379797
...,...,...,...,...
889,-0.635223,-0.734606,-0.011424,-0.324419
890,-0.515759,-0.737555,2.482164,-1.028635
891,-0.686238,-0.732374,-0.973298,-1.028635
892,1.163832,0.134446,0.661571,1.084014


0      0
1      0
2      0
3      0
4      0
      ..
889    1
890    1
891    1
892    1
893    1
Name: digitized, Length: 894, dtype: int64

In [9]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_SMOTE, y_train_SMOTE)

y_train_pred = classification.predict(X_train_SMOTE)
y_test_pred = classification.predict(X_test_transf)

cm_train = confusion_matrix(y_train_SMOTE, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[330, 117],
       [194, 253]], dtype=int64)

array([[89, 31],
       [11, 11]], dtype=int64)

In [10]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.652
The accuracy in the TEST  set is: 0.704


In [11]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.684
The precission in the TEST  set is: 0.262


In [12]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.566
The recall in the TEST  set is: 0.500


In [13]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.62
The F1-score for the TEST set is 0.34


In [14]:
#SECOND MODEL version 2 : with other choosen variables; scalled with MinMax Scaler

#X y split
X3 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'digitized','ratio_green_area'], axis = 1) 
#here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y3 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size = 0.2, random_state = 19)


In [15]:
X_train

Unnamed: 0,green_roof_area,building_area,construction_year,borough,area_type
161,2112,18066,1990,BX,industrial
104,6043,9654,1990,MN,residential
205,214,24772,1960,MN,commercial
456,570,7784,1960,MN,commercial
129,8855,44246,1960,BK,residential
...,...,...,...,...,...
308,4412,35714,1990,BK,residential
19,2807,13242,1930,BK,industrial
354,3071,28794,1990,MN,industrial
622,391,7973,1960,MN,commercial


In [16]:
X_test.iloc[[100],[2]]

Unnamed: 0,construction_year
276,1930


In [17]:
X_test.iloc[[100],[3]]

Unnamed: 0,borough
276,BK


In [18]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[2]] = X_test.iloc[[100],[2]].replace('1930','1800')

X_test.iloc[[100],[3]] = X_test.iloc[[100],[3]].replace('BK','SI')

In [19]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat2 = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat2 = X_test.select_dtypes(['object'])

In [20]:
#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf


transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area
0,-0.624342,1.516453
1,-0.482557,-0.676631
2,-0.409202,-0.655677
3,-0.589306,-0.703542
4,0.031205,-0.193345
...,...,...
137,-0.641859,-0.629714
138,-0.549344,-0.719316
139,2.082151,3.063293
140,-0.427267,-0.141113


In [21]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc2 = encoder.fit_transform(X_train_cat2).toarray()
X_train_cat_enc2 = pd.DataFrame(x_train_cat_enc2, columns = encoder.get_feature_names_out())
X_train_cat_enc2

x_test_cat_enc2 = encoder.fit_transform(X_test_cat2).toarray()
X_test_cat_enc2 = pd.DataFrame(x_test_cat_enc2, columns = encoder.get_feature_names_out())
X_test_cat_enc2


Unnamed: 0,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
X_train_cat_enc2.shape

(568, 15)

In [23]:
#concat

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc2.reset_index(drop = True, inplace = True)
X_test_cat_enc2.reset_index(drop = True, inplace = True)

X_train_treated5 = pd.concat([X_train_num_transf, X_train_cat_enc2], axis = 1)
X_train_treated5
X_test_treated5 = pd.concat([X_test_num_transf, X_test_cat_enc2], axis = 1)
X_test_treated5

Unnamed: 0,green_roof_area,building_area,construction_year_1800,construction_year_1850,construction_year_1900,construction_year_1930,construction_year_1960,construction_year_1990,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,-0.624342,1.516453,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.482557,-0.676631,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.409202,-0.655677,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.589306,-0.703542,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.031205,-0.193345,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,-0.641859,-0.629714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,-0.549344,-0.719316,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,2.082151,3.063293,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,-0.427267,-0.141113,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE2,y_train_SMOTE2 = sm.fit_resample(X_train_treated5,y_train)

In [25]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_SMOTE2, y_train_SMOTE2)

y_train_pred = classification.predict(X_train_SMOTE2)
y_test_pred = classification.predict(X_test_treated5)

cm_train = confusion_matrix(y_train_SMOTE2, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[315, 132],
       [142, 305]], dtype=int64)

array([[87, 33],
       [10, 12]], dtype=int64)

In [26]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE2, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.694
The accuracy in the TEST  set is: 0.697


In [27]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE2, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.698
The precission in the TEST  set is: 0.267


In [28]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE2, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.682
The recall in the TEST  set is: 0.545


In [29]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE2,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.69
The F1-score for the TEST set is 0.36


In [31]:
X3

Unnamed: 0,green_roof_area,building_area,construction_year,borough,area_type
0,971,14057,1900,BK,public
1,696,4463,1900,MN,commercial
2,293,13217,1900,MN,commercial
3,759,4311,1900,MN,commercial
4,7204,35891,1990,BX,commercial
...,...,...,...,...,...
705,1525,6414,1800,MN,residential
706,343,2941,1990,MN,public
707,309,4350,1990,MN,industrial
708,8139,20051,1960,MN,commercial


In [32]:
#SECOND MODEL version 2 : with other choosen variables;

#X y split
X4 = df.drop(columns = ['xcoord', 'ycoord','roof_height','ground_elev',\
                       'digitized','ratio_green_area','construction_year'], axis = 1) 
#here we skip the xcoord and ycoord because powert transform won't work on that variables(too large range)
y4 = df['digitized']

#train test split

X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size = 0.2, random_state = 19)


In [33]:
X_test.iloc[[100],[2]]

Unnamed: 0,borough
276,BK


In [35]:
#problem with one hot encoding
 #since the data set is little, when the one hot encoding was done, our train set and test set had different shapes.
    #this means that in the test set there isn't all the possible values that exists in the train set.
#It was found that the values that cause this situation are: borough_SI and construction_year_1800

#to fix this,  we will replace the existing values for this ones, in a random choosen row of the X_test

X_test.iloc[[100],[2]] = X_test.iloc[[100],[2]].replace('BK','SI')


In [36]:
#split numericals and categoricals

X_train_num = X_train.select_dtypes([np.number])
X_train_cat2 = X_train.select_dtypes(['object'])

X_test_num = X_test.select_dtypes([np.number])
X_test_cat2 = X_test.select_dtypes(['object'])

In [37]:
#transformer operations
 #since we have encoded the cateogrical columns we will transform all the variables. so we won't split X_train into jnumerical and cateogrical

transformer = StandardScaler()
transformer.fit(X_train_num)
x_train_num_transf = transformer.transform(X_train_num)
X_train_num_transf = pd.DataFrame(x_train_num_transf, columns = X_train_num.columns)
X_train_num_transf


transformer.fit(X_test_num)
x_test_num_transf = transformer.transform(X_test_num)
X_test_num_transf = pd.DataFrame(x_test_num_transf, columns = X_test_num.columns)
X_test_num_transf

Unnamed: 0,green_roof_area,building_area
0,-0.624342,1.516453
1,-0.482557,-0.676631
2,-0.409202,-0.655677
3,-0.589306,-0.703542
4,0.031205,-0.193345
...,...,...
137,-0.641859,-0.629714
138,-0.549344,-0.719316
139,2.082151,3.063293
140,-0.427267,-0.141113


In [38]:
#Encoding Categoricals

encoder = OneHotEncoder()
x_train_cat_enc2 = encoder.fit_transform(X_train_cat2).toarray()
X_train_cat_enc2 = pd.DataFrame(x_train_cat_enc2, columns = encoder.get_feature_names_out())
X_train_cat_enc2

x_test_cat_enc2 = encoder.fit_transform(X_test_cat2).toarray()
X_test_cat_enc2 = pd.DataFrame(x_test_cat_enc2, columns = encoder.get_feature_names_out())
X_test_cat_enc2


Unnamed: 0,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
#concat

X_train_num_transf.reset_index(drop = True, inplace = True)
X_test_num_transf.reset_index(drop = True, inplace = True)

X_train_cat_enc2.reset_index(drop = True, inplace = True)
X_test_cat_enc2.reset_index(drop = True, inplace = True)

X_train_treated5 = pd.concat([X_train_num_transf, X_train_cat_enc2], axis = 1)
X_train_treated5
X_test_treated5 = pd.concat([X_test_num_transf, X_test_cat_enc2], axis = 1)
X_test_treated5

Unnamed: 0,green_roof_area,building_area,borough_BK,borough_BX,borough_MN,borough_QN,borough_SI,area_type_commercial,area_type_industrial,area_type_public,area_type_residential
0,-0.624342,1.516453,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.482557,-0.676631,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.409202,-0.655677,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.589306,-0.703542,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.031205,-0.193345,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
137,-0.641859,-0.629714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
138,-0.549344,-0.719316,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
139,2.082151,3.063293,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
140,-0.427267,-0.141113,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [40]:
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE2,y_train_SMOTE2 = sm.fit_resample(X_train_treated5,y_train)

In [41]:
#Logistic model

classification = LogisticRegression(random_state=19, solver='lbfgs')
classification.fit(X_train_SMOTE2, y_train_SMOTE2)

y_train_pred = classification.predict(X_train_SMOTE2)
y_test_pred = classification.predict(X_test_treated5)

cm_train = confusion_matrix(y_train_SMOTE2, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

display(cm_train)
display(cm_test)

array([[324, 123],
       [174, 273]], dtype=int64)

array([[88, 32],
       [12, 10]], dtype=int64)

In [42]:
print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train_SMOTE2, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))

The accuracy in the TRAIN set is: 0.668
The accuracy in the TEST  set is: 0.690


In [43]:
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train_SMOTE2, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))

The precission in the TRAIN set is: 0.689
The precission in the TEST  set is: 0.238


In [44]:
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train_SMOTE2, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))

The recall in the TRAIN set is: 0.611
The recall in the TEST  set is: 0.455


In [45]:
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train_SMOTE2,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The F1-score for the TRAIN set is 0.65
The F1-score for the TEST set is 0.31


In [30]:
#CONCLUSIONS: With smote to treat the imbalance, we got very significant improves in the recall and f1.
            # however in precision we didn't have any improve.
            # So with this results we do not have a acceptable model.
            # We could apply another method to treat the class imbalance but maybe with this data frame
            # we can get no better results because of its size
    
  
    
# NEXT_STEPS:
            #Use another mother like KNN classifier
    