# Application of Big Data Project

## Import

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd



# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

#machine learning librairies
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

## Read Data

In [2]:
app_train = pd.read_csv("application_train.csv")
print(app_train.shape)
app_test = pd.read_csv("application_test.csv")
print(app_test.shape)

(307511, 122)
(48744, 121)


## Data exploration

In [3]:
app_train.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [4]:
app_test.describe()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,48744.0,48744.0,48744.0,48744.0,48720.0,48744.0,48744.0,48744.0,48744.0,48744.0,...,48744.0,48744.0,48744.0,48744.0,42695.0,42695.0,42695.0,42695.0,42695.0,42695.0
mean,277796.67635,0.397054,178431.8,516740.4,29426.240209,462618.8,0.021226,-16068.084605,67485.366322,-4967.652716,...,0.001559,0.0,0.0,0.0,0.002108,0.001803,0.002787,0.009299,0.546902,1.983769
std,103169.547296,0.709047,101522.6,365397.0,16016.368315,336710.2,0.014428,4325.900393,144348.507136,3552.612035,...,0.039456,0.0,0.0,0.0,0.046373,0.046132,0.054037,0.110924,0.693305,1.838873
min,100001.0,0.0,26941.5,45000.0,2295.0,45000.0,0.000253,-25195.0,-17463.0,-23722.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,188557.75,0.0,112500.0,260640.0,17973.0,225000.0,0.010006,-19637.0,-2910.0,-7459.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,277549.0,0.0,157500.0,450000.0,26199.0,396000.0,0.01885,-15785.0,-1293.0,-4490.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,367555.5,1.0,225000.0,675000.0,37390.5,630000.0,0.028663,-12496.0,-296.0,-1901.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
max,456250.0,20.0,4410000.0,2245500.0,180576.0,2245500.0,0.072508,-7338.0,365243.0,0.0,...,1.0,0.0,0.0,0.0,2.0,2.0,2.0,6.0,7.0,17.0


In [5]:
app_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

We can see that there are 10 times more 0 than 1. The dataset is not balanced

Let's see the missing values

In [6]:
def calculate_missing_values(df):
    
    # Find the ratio of 
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
    

In the train dataset

In [7]:
missing_values = calculate_missing_values(app_train)
missing_values

Your selected dataframe has 122 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MEDI,213514,69.4
NONLIVINGAPARTMENTS_MODE,213514,69.4
...,...,...
EXT_SOURCE_2,660,0.2
AMT_GOODS_PRICE,278,0.1
AMT_ANNUITY,12,0.0
CNT_FAM_MEMBERS,2,0.0


In the test dataset

In [8]:
missing_values = calculate_missing_values(app_test)
missing_values

Your selected dataframe has 121 columns.
There are 64 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MODE,33495,68.7
COMMONAREA_MEDI,33495,68.7
COMMONAREA_AVG,33495,68.7
NONLIVINGAPARTMENTS_MEDI,33347,68.4
NONLIVINGAPARTMENTS_AVG,33347,68.4
...,...,...
OBS_60_CNT_SOCIAL_CIRCLE,29,0.1
DEF_30_CNT_SOCIAL_CIRCLE,29,0.1
OBS_30_CNT_SOCIAL_CIRCLE,29,0.1
AMT_ANNUITY,24,0.0


Let's see the number of classes in each categorical columns

In [9]:
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique)

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [10]:
# Number of unique classes in each object column
app_test.select_dtypes('object').apply(pd.Series.nunique)

NAME_CONTRACT_TYPE             2
CODE_GENDER                    2
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               7
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             5
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

Now we'll transform the categorical values with One-Hot-Encoding

In [11]:
train = pd.get_dummies(app_train,drop_first=True)
train.shape

(307511, 230)

In [12]:
test = pd.get_dummies(app_test,drop_first=True)
test.shape

(48744, 226)

Let's see if there are some strange values

In [13]:
app_train.select_dtypes('int64').columns

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
       'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
       'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
       'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
       'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21'],
      dtype='object')

In [14]:
app_test.select_dtypes('int64').columns

Index(['SK_ID_CURR', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
       'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
       'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
       'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
       'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21'],
      dtype='object')

In [15]:
train['CNT_CHILDREN'].describe()

count    307511.000000
mean          0.417052
std           0.722121
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          19.000000
Name: CNT_CHILDREN, dtype: float64

In [16]:
test['CNT_CHILDREN'].describe()

count    48744.000000
mean         0.397054
std          0.709047
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         20.000000
Name: CNT_CHILDREN, dtype: float64

In [17]:
(-1*train['DAYS_BIRTH']/365).describe()

count    307511.000000
mean         43.936973
std          11.956133
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: DAYS_BIRTH, dtype: float64

In [18]:
(-1*test['DAYS_BIRTH']/365).describe()

count    48744.000000
mean        44.022150
std         11.851782
min         20.104110
25%         34.235616
50%         43.246575
75%         53.800000
max         69.027397
Name: DAYS_BIRTH, dtype: float64

In [19]:
(train['DAYS_EMPLOYED']/365).describe()

count    307511.000000
mean        174.835742
std         387.056895
min         -49.073973
25%          -7.561644
50%          -3.323288
75%          -0.791781
max        1000.665753
Name: DAYS_EMPLOYED, dtype: float64

In [20]:
(test['DAYS_EMPLOYED']/365).describe()

count    48744.000000
mean       184.891415
std        395.475362
min        -47.843836
25%         -7.972603
50%         -3.542466
75%         -0.810959
max       1000.665753
Name: DAYS_EMPLOYED, dtype: float64

Wow ! The maximum values is 1000 years. How many in the train dataset ?

In [21]:
outliers = train[(train['DAYS_EMPLOYED']/365>=100) & (train['DAYS_EMPLOYED']/365<=1100)]
len(outliers)

55374

How many in the test dataset ?

In [22]:
outliers_test = test[(test['DAYS_EMPLOYED']/365>=100) & (test['DAYS_EMPLOYED']/365<=1100)]
len(outliers_test)

9274

We will set a missing value for the anomalies and replace it with the median or the mean before ML.

In [23]:
train[train['DAYS_EMPLOYED'] > 300000].DAYS_EMPLOYED

8         365243
11        365243
23        365243
38        365243
43        365243
           ...  
307469    365243
307483    365243
307487    365243
307505    365243
307507    365243
Name: DAYS_EMPLOYED, Length: 55374, dtype: int64

In [24]:
test[test['DAYS_EMPLOYED'] > 300000].DAYS_EMPLOYED

10       365243
17       365243
22       365243
26       365243
36       365243
          ...  
48717    365243
48728    365243
48731    365243
48735    365243
48736    365243
Name: DAYS_EMPLOYED, Length: 9274, dtype: int64

In [25]:
train['DAYS_EMPLOYED_ANOM'] = train["DAYS_EMPLOYED"] == 365243
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({365243: np.nan})

test['DAYS_EMPLOYED_ANOM'] = test["DAYS_EMPLOYED"] == 365243
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace({365243: np.nan})

In [26]:
(train['DAYS_EMPLOYED']/365).describe()

count    252137.000000
mean         -6.531971
std           6.406466
min         -49.073973
25%          -8.698630
50%          -4.515068
75%          -2.101370
max           0.000000
Name: DAYS_EMPLOYED, dtype: float64

In [27]:
(test['DAYS_EMPLOYED']/365).describe()

count    39470.000000
mean        -6.785586
std          6.323189
min        -47.843836
25%         -9.119863
50%         -4.835616
75%         -2.358904
max         -0.002740
Name: DAYS_EMPLOYED, dtype: float64

Let's see the correlation between variables

In [28]:
corr = train.corr()['TARGET']
corr

SK_ID_CURR                        -0.002108
TARGET                             1.000000
CNT_CHILDREN                       0.019187
AMT_INCOME_TOTAL                  -0.003982
AMT_CREDIT                        -0.030369
                                     ...   
WALLSMATERIAL_MODE_Panel          -0.033119
WALLSMATERIAL_MODE_Stone, brick   -0.012657
WALLSMATERIAL_MODE_Wooden          0.007946
EMERGENCYSTATE_MODE_Yes            0.004829
DAYS_EMPLOYED_ANOM                -0.045987
Name: TARGET, Length: 231, dtype: float64

We will replace the missing values by the median for the 10 most positively correlated and negatively correlated features and we will drop the others

In [29]:
print(corr.sort_values().tail(20))
print(corr.sort_values().head(20))

DEF_60_CNT_SOCIAL_CIRCLE                             0.031276
DEF_30_CNT_SOCIAL_CIRCLE                             0.032248
LIVE_CITY_NOT_WORK_CITY                              0.032518
OWN_CAR_AGE                                          0.037612
DAYS_REGISTRATION                                    0.041975
OCCUPATION_TYPE_Laborers                             0.043019
FLAG_DOCUMENT_3                                      0.044346
REG_CITY_NOT_LIVE_CITY                               0.044395
FLAG_EMP_PHONE                                       0.045982
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049824
REG_CITY_NOT_WORK_CITY                               0.050994
DAYS_ID_PUBLISH                                      0.051457
CODE_GENDER_M                                        0.054713
DAYS_LAST_PHONE_CHANGE                               0.055218
NAME_INCOME_TYPE_Working                             0.057481
REGION_RATING_CLIENT                                 0.058899
REGION_R

In [30]:

most_corr_features = ["DEF_60_CNT_SOCIAL_CIRCLE"  ,                           
"DEF_30_CNT_SOCIAL_CIRCLE"                        ,     
"LIVE_CITY_NOT_WORK_CITY"                          ,    
"OWN_CAR_AGE"                                       ,   
"DAYS_REGISTRATION"                                  ,  
"OCCUPATION_TYPE_Laborers"                            , 
"FLAG_DOCUMENT_3"                                      ,
"REG_CITY_NOT_LIVE_CITY"                               ,
"FLAG_EMP_PHONE"                                       ,

"REG_CITY_NOT_WORK_CITY"                               ,
"DAYS_ID_PUBLISH"                                      ,
"CODE_GENDER_M"                                        ,
"DAYS_LAST_PHONE_CHANGE"                               ,
"NAME_INCOME_TYPE_Working"                             ,
"REGION_RATING_CLIENT"                                 ,
"REGION_RATING_CLIENT_W_CITY"                          ,
"DAYS_EMPLOYED"                                        ,
"DAYS_BIRTH"                                         ,
                      
                      "EXT_SOURCE_3"                     ,      
"EXT_SOURCE_2"                           ,
"EXT_SOURCE_1"                           ,

"NAME_INCOME_TYPE_Pensioner"             ,
"DAYS_EMPLOYED_ANOM"                     ,
"ORGANIZATION_TYPE_XNA"                  ,
"FLOORSMAX_AVG"                          ,
"FLOORSMAX_MEDI"                         ,
"FLOORSMAX_MODE"                         ,
"AMT_GOODS_PRICE"                        ,
"REGION_POPULATION_RELATIVE"             ,
"ELEVATORS_AVG"                          ,
"ELEVATORS_MEDI"                         ,
"FLOORSMIN_AVG"                          ,
"FLOORSMIN_MEDI"                         ,
"WALLSMATERIAL_MODE_Panel"               ,
"LIVINGAREA_AVG"                         ,
"LIVINGAREA_MEDI"                        ,
"FLOORSMIN_MODE"                         
                      
                                              ]

imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [31]:
train[most_corr_features] = imputer.fit_transform(train[most_corr_features])

In [32]:
train[most_corr_features]

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE
0,2.0,2.0,0.0,9.0,-3648.0,1.0,1.0,0.0,1.0,0.0,...,351000.0,0.018801,0.00,0.00,0.1250,0.1250,0.0,0.0190,0.0193,0.1250
1,0.0,0.0,0.0,9.0,-1186.0,0.0,1.0,0.0,1.0,0.0,...,1129500.0,0.003541,0.08,0.08,0.3333,0.3333,0.0,0.0549,0.0558,0.3333
2,0.0,0.0,0.0,26.0,-4260.0,1.0,0.0,0.0,1.0,0.0,...,135000.0,0.010032,0.00,0.00,0.2083,0.2083,0.0,0.0745,0.0749,0.2083
3,0.0,0.0,0.0,9.0,-9833.0,1.0,1.0,0.0,1.0,0.0,...,297000.0,0.008019,0.00,0.00,0.2083,0.2083,0.0,0.0745,0.0749,0.2083
4,0.0,0.0,1.0,9.0,-4311.0,0.0,0.0,0.0,1.0,1.0,...,513000.0,0.028663,0.00,0.00,0.2083,0.2083,0.0,0.0745,0.0749,0.2083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0.0,0.0,0.0,9.0,-8456.0,0.0,0.0,0.0,1.0,0.0,...,225000.0,0.032561,0.22,0.22,0.2708,0.2708,0.0,0.1965,0.2001,0.0417
307507,0.0,0.0,0.0,9.0,-4388.0,0.0,1.0,0.0,0.0,0.0,...,225000.0,0.025164,0.00,0.00,0.1250,0.1250,0.0,0.0257,0.0261,0.1250
307508,0.0,0.0,1.0,9.0,-6737.0,0.0,1.0,0.0,1.0,1.0,...,585000.0,0.005002,0.00,0.00,0.2083,0.2083,1.0,0.9279,0.9445,0.2083
307509,0.0,0.0,0.0,9.0,-2562.0,1.0,1.0,1.0,1.0,1.0,...,319500.0,0.005313,0.00,0.00,0.2083,0.2083,0.0,0.0061,0.0062,0.2083


In [33]:
df_train = train[most_corr_features]

In [34]:
df_test = test[most_corr_features]

In [35]:
df_train['TARGET'] = app_train['TARGET']

## Feature Scaling/Normalization

In [36]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [37]:
scaler = MinMaxScaler(feature_range = (0, 1))
imputer.fit(df_train.drop(['TARGET'],axis=1))
X = imputer.transform(df_train.drop(['TARGET'],axis=1))
X

array([[2.    , 2.    , 0.    , ..., 0.019 , 0.0193, 0.125 ],
       [0.    , 0.    , 0.    , ..., 0.0549, 0.0558, 0.3333],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083],
       ...,
       [0.    , 0.    , 1.    , ..., 0.9279, 0.9445, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0061, 0.0062, 0.2083],
       [0.    , 0.    , 1.    , ..., 0.0791, 0.0805, 0.2083]])

In [38]:
test_X = imputer.transform(df_test)
test_X

array([[0.    , 0.    , 0.    , ..., 0.0505, 0.0514, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083],
       ...,
       [0.    , 0.    , 0.    , ..., 0.1383, 0.1408, 0.2083],
       [0.    , 0.    , 1.    , ..., 0.1563, 0.1591, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083]])

In [39]:
X = scaler.fit_transform(X)

array([[0.08333333, 0.05882353, 0.        , ..., 0.019     , 0.0193    ,
        0.125     ],
       [0.        , 0.        , 0.        , ..., 0.0549    , 0.0558    ,
        0.3333    ],
       [0.        , 0.        , 0.        , ..., 0.0745    , 0.0749    ,
        0.2083    ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.9279    , 0.9445    ,
        0.2083    ],
       [0.        , 0.        , 0.        , ..., 0.0061    , 0.0062    ,
        0.2083    ],
       [0.        , 0.        , 1.        , ..., 0.0791    , 0.0805    ,
        0.2083    ]])

In [40]:
test_X = scaler.fit_transform(test_X)

array([[0.    , 0.    , 0.    , ..., 0.0505, 0.0514, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083],
       ...,
       [0.    , 0.    , 0.    , ..., 0.1383, 0.1408, 0.2083],
       [0.    , 0.    , 1.    , ..., 0.1563, 0.1591, 0.2083],
       [0.    , 0.    , 0.    , ..., 0.0745, 0.0749, 0.2083]])

## Machine Learning

Splitting the train dataset into train and test variables to fit the model

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, df_train.TARGET, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(206032, 37)
(101479, 37)
(206032,)
(101479,)


## XGBOOST

Initialize the model

In [46]:
model = XGBClassifier()

Fit the model

In [47]:
model.fit(X_train,y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


Predict

In [48]:
preds = model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.283516


In [50]:
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1 score: ",f1_score(y_test, preds))

Accuracy: 91.96%
F1 score:  0.051401325735550646


predict with the test dataset

In [51]:
preds = model.predict(test_X)
preds

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [52]:
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,0
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,0
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0


# Random Forest 

In [53]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [54]:
preds = clf.predict(X_test)

In [56]:
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1 score: ",f1_score(y_test, preds))

Accuracy: 92.00%
F1 score:  0.0


In [57]:
preds = clf.predict(test_X)
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,0
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,0
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0


## Gradient Boosting

In [58]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("F1 score: ",f1_score(y_test, gb_clf.predict(X_test)))    

Learning rate:  0.05
Accuracy score (training): 0.919
F1 score:  0.0
Learning rate:  0.075
Accuracy score (training): 0.919
F1 score:  0.0
Learning rate:  0.1
Accuracy score (training): 0.919
F1 score:  0.0
Learning rate:  0.25
Accuracy score (training): 0.919
F1 score:  0.0
Learning rate:  0.5
Accuracy score (training): 0.919
F1 score:  0.0004926715112698608
Learning rate:  0.75
Accuracy score (training): 0.919
F1 score:  0.02535581868197584
Learning rate:  1
Accuracy score (training): 0.918
F1 score:  0.03852439878589774


In [59]:
preds = gb_clf.predict(test_X)
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,0
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,0
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0


Now, we will do the same steps after balancing the dataset

## Split dataset 50%-1 and 50%-0


In [60]:
tmp_0 = df_train[df_train['TARGET'] == 0]
tmp_1 = df_train[df_train['TARGET'] == 1]

(24825, 38)

In [61]:
tmp_0 = shuffle(tmp_0)
tmp_0 = tmp_0.head(24825)

(24825, 38)

In [62]:
frames = [tmp_0,tmp_1]
df = pd.concat(frames)
df.shape

(49650, 38)

In [63]:
scaler = MinMaxScaler(feature_range = (0, 1))
imputer.fit(df.drop(['TARGET'],axis=1))
X = imputer.transform(df.drop(['TARGET'],axis=1))
X = scaler.fit_transform(X)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, df.TARGET, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(33265, 37)
(16385, 37)
(33265,)
(16385,)


## XGBOOST

In [65]:
model = XGBClassifier()
model.fit(X_train,y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [66]:
preds = model.predict(X_test)
preds

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

In [67]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.572591


In [69]:
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1 score: ",f1_score(y_test, preds))

Accuracy: 67.21%
F1 score:  0.6725987323256947


In [70]:
preds = clf.predict(test_X)
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,0
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,0
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0


## RANDOM FOREST

In [71]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [72]:
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1 score: ",f1_score(y_test, preds))

Accuracy: 65.65%
F1 score:  0.6623882924488695


In [73]:
preds = clf.predict(test_X)
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,0
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,1
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0


## GRADIENT BOOSTING

In [74]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("F1 score: ",f1_score(y_test, gb_clf.predict(X_test)))


Learning rate:  0.05
Accuracy score (training): 0.637
F1 score:  0.649967584133907
Learning rate:  0.075
Accuracy score (training): 0.642
F1 score:  0.6522154544368804
Learning rate:  0.1
Accuracy score (training): 0.646
F1 score:  0.6530931871574001
Learning rate:  0.25
Accuracy score (training): 0.653
F1 score:  0.6570720203180747
Learning rate:  0.5
Accuracy score (training): 0.658
F1 score:  0.6563659589999392
Learning rate:  0.75
Accuracy score (training): 0.659
F1 score:  0.658348043094528
Learning rate:  1
Accuracy score (training): 0.660
F1 score:  0.6637957178382968


In [75]:
preds = gb_clf.predict(test_X)
df_res = pd.DataFrame(df_test)
df_res['TARGET'] = preds
df_res

Unnamed: 0,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,OWN_CAR_AGE,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_WORK_CITY,...,REGION_POPULATION_RELATIVE,ELEVATORS_AVG,ELEVATORS_MEDI,FLOORSMIN_AVG,FLOORSMIN_MEDI,WALLSMATERIAL_MODE_Panel,LIVINGAREA_AVG,LIVINGAREA_MEDI,FLOORSMIN_MODE,TARGET
0,0.0,0.0,0,,-5170.0,0,1,0,1,0,...,0.018850,,,,,0,0.0505,0.0514,,1
1,0.0,0.0,0,,-9118.0,0,1,0,1,0,...,0.035792,,,,,0,,,,0
2,0.0,0.0,0,5.0,-2175.0,0,0,0,1,0,...,0.019101,,,,,0,,,,0
3,0.0,0.0,0,,-2000.0,0,1,0,1,0,...,0.026392,0.32,0.32,0.0417,0.0417,1,0.3673,0.3739,0.0417,0
4,0.0,0.0,1,16.0,-4000.0,0,1,0,1,1,...,0.010032,,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.0,0.0,0,,-9094.0,0,0,0,1,0,...,0.002042,,,,,0,,,,0
48740,0.0,0.0,1,,-3015.0,0,1,0,1,1,...,0.035792,,,,,0,,,,0
48741,0.0,0.0,0,4.0,-2681.0,0,1,0,1,0,...,0.026392,0.16,0.16,,,0,0.1383,0.1408,,0
48742,0.0,0.0,1,,-1461.0,0,1,0,1,1,...,0.018850,0.16,0.16,,,1,0.1563,0.1591,,0
