In [74]:
# importing dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

import pymongo
import os


In [75]:
# connecting to MongoDB
conn = 'mongodb://localhost:27017'

client = pymongo.MongoClient(conn)

In [76]:
# creating training and testing dataframes
df = pd.read_csv(Path('Resources/application_train.csv'))

In [77]:
df

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,_id
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,6288ff0e5824e1f88b4e59b1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.7960,0.7987,0.8040,6288ff0e5824e1f88b4e59b2
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,6288ff0e5824e1f88b4e59b3
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,6288ff0e5824e1f88b4e59b4
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,6288ff0e5824e1f88b4e59b5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238485,27558.0,254700.0,225000.0,157500.0,,,,,,,...,0.2898,"Stone, brick",THURSDAY,0.9876,0.9876,0.9782,0.8300,0.8323,0.7125,628902df5824e1f88b530ae3
238486,12001.5,269550.0,225000.0,72000.0,,,,,,,...,0.0214,"Stone, brick",MONDAY,0.9727,0.9727,0.9727,0.6260,0.6310,0.6406,628902df5824e1f88b530ae4
238487,29979.0,677664.0,585000.0,153000.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.7970,Panel,THURSDAY,0.9816,0.9816,0.9816,0.7484,0.7518,0.7583,628902df5824e1f88b530ae5
238488,20205.0,370107.0,319500.0,171000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0086,"Stone, brick",WEDNESDAY,0.9771,0.9771,0.9772,,,,628902df5824e1f88b530ae6


In [78]:
# removing columns that are less relevant to analysis
df2 = df.drop(['FLAG_MOBIL', "FLAG_EMP_PHONE","FLAG_WORK_PHONE", "FLAG_CONT_MOBILE", "FLAG_EMAIL", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY", "LIVE_CITY_NOT_WORK_CITY", "FLAG_DOCUMENT_2", "FLAG_DOCUMENT_3", 
               "FLAG_DOCUMENT_4", "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_6", "FLAG_DOCUMENT_7", "FLAG_DOCUMENT_8", "FLAG_DOCUMENT_9", "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11", "FLAG_DOCUMENT_12", "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", "FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16", "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18", "FLAG_DOCUMENT_19",
               "FLAG_DOCUMENT_21"], axis=1)

df2

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,_id
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,6288ff0e5824e1f88b4e59b1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.7960,0.7987,0.8040,6288ff0e5824e1f88b4e59b2
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,6288ff0e5824e1f88b4e59b3
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,6288ff0e5824e1f88b4e59b4
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,6288ff0e5824e1f88b4e59b5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238485,27558.0,254700.0,225000.0,157500.0,,,,,,,...,0.2898,"Stone, brick",THURSDAY,0.9876,0.9876,0.9782,0.8300,0.8323,0.7125,628902df5824e1f88b530ae3
238486,12001.5,269550.0,225000.0,72000.0,,,,,,,...,0.0214,"Stone, brick",MONDAY,0.9727,0.9727,0.9727,0.6260,0.6310,0.6406,628902df5824e1f88b530ae4
238487,29979.0,677664.0,585000.0,153000.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.7970,Panel,THURSDAY,0.9816,0.9816,0.9816,0.7484,0.7518,0.7583,628902df5824e1f88b530ae5
238488,20205.0,370107.0,319500.0,171000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0086,"Stone, brick",WEDNESDAY,0.9771,0.9771,0.9772,,,,628902df5824e1f88b530ae6


In [79]:
df3 = df2.dropna(axis=0)

df3

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,_id
71,24435.0,573628.5,463500.0,103500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.1324,"Stone, brick",MONDAY,0.9861,0.9861,0.9861,0.8096,0.8121,0.8171,6288ff0e5824e1f88b4e59f8
124,16789.5,260725.5,198000.0,202500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0710,Panel,TUESDAY,0.9801,0.9801,0.9801,0.7280,0.7316,0.7387,6288ff0e5824e1f88b4e5a2d
152,53329.5,675000.0,675000.0,202500.0,0.0,0.0,1.0,0.0,0.0,4.0,...,0.7334,Monolithic,MONDAY,0.9945,0.9945,0.9945,0.9252,0.9262,0.9281,6288ff0e5824e1f88b4e5a49
161,24781.5,263686.5,238500.0,162000.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.4064,Panel,THURSDAY,0.9891,0.9891,0.9891,0.8504,0.8524,0.8563,6288ff0e5824e1f88b4e5a52
255,31032.0,1019205.0,774000.0,225000.0,0.0,0.0,6.0,0.0,0.0,1.0,...,0.1874,Panel,MONDAY,0.9916,0.9916,0.9916,0.8844,0.8859,0.8889,6288ff0e5824e1f88b4e5ab0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238337,16051.5,361462.5,274500.0,112500.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0594,"Stone, brick",WEDNESDAY,0.9970,0.9970,0.9965,0.9524,0.9530,0.9543,628902df5824e1f88b530a4f
238338,21906.0,675000.0,675000.0,99000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1444,Panel,FRIDAY,0.9851,0.9851,0.9851,0.7960,0.7987,0.8040,628902df5824e1f88b530a50
238386,47673.0,711454.5,643500.0,261000.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0454,Panel,MONDAY,0.9871,0.9871,0.9871,0.8232,0.8256,0.8301,628902df5824e1f88b530a80
238435,15075.0,270000.0,270000.0,94500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0061,"Stone, brick",TUESDAY,0.9767,0.9767,0.9767,0.6804,0.6847,0.6929,628902df5824e1f88b530ab1


In [80]:
# counting the number of nulls in each column
null_data= print(df2.isnull().sum())

AMT_ANNUITY                          9
AMT_CREDIT                           0
AMT_GOODS_PRICE                    193
AMT_INCOME_TOTAL                     0
AMT_REQ_CREDIT_BUREAU_DAY        32087
                                 ...  
YEARS_BEGINEXPLUATATION_MODE    116319
YEARS_BUILD_AVG                 158526
YEARS_BUILD_MEDI                158526
YEARS_BUILD_MODE                158526
_id                                  0
Length: 93, dtype: int64


## Exploring Impact of Dropping Additional Columns

In [81]:
investigate_df = df[["TARGET", "COMMONAREA_AVG", "COMMONAREA_MODE", "COMMONAREA_MEDI", "APARTMENTS_AVG", 
                     "APARTMENTS_MODE", "BASEMENTAREA_AVG", "BASEMENTAREA_MEDI", "BASEMENTAREA_MODE", 
                     "LIVINGAPARTMENTS_AVG", "LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MEDI",
                     "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_AVG",
                     "NONLIVINGAREA_MODE", "TOTALAREA_MODE", "WALLSMATERIAL_MODE", "YEARS_BEGINEXPLUATATION_AVG",
                     "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_AVG", "YEARS_BUILD_MEDI",
                     "YEARS_BUILD_MODE"]]

investigate_df

Unnamed: 0,TARGET,COMMONAREA_AVG,COMMONAREA_MODE,COMMONAREA_MEDI,APARTMENTS_AVG,APARTMENTS_MODE,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,BASEMENTAREA_MODE,LIVINGAPARTMENTS_AVG,...,NONLIVINGAREA_AVG,NONLIVINGAREA_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE
0,1,0.0143,0.0144,0.0144,0.0247,0.0252,0.0369,0.0369,0.0383,0.0202,...,0.0000,0.0000,0.0149,"Stone, brick",0.9722,0.9722,0.9722,0.6192,0.6243,0.6341
1,0,0.0605,0.0497,0.0608,0.0959,0.0924,0.0529,0.0529,0.0538,0.0773,...,0.0098,0.0000,0.0714,Block,0.9851,0.9851,0.9851,0.7960,0.7987,0.8040
2,0,,,,,,,,,,...,,,,,,,,,,
3,0,,,,,,,,,,...,,,,,,,,,,
4,0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238485,0,0.0202,0.0172,0.0203,0.2021,0.1008,0.0887,0.0887,0.0172,0.1484,...,0.1095,0.0125,0.2898,"Stone, brick",0.9876,0.9876,0.9782,0.8300,0.8323,0.7125
238486,0,0.0022,0.0022,0.0022,0.0247,0.0252,0.0435,0.0435,0.0451,0.0202,...,0.0000,0.0000,0.0214,"Stone, brick",0.9727,0.9727,0.9727,0.6260,0.6310,0.6406
238487,0,0.0123,0.0124,0.0124,0.1031,0.1050,0.0862,0.0862,0.0894,0.0841,...,0.0000,0.0000,0.7970,Panel,0.9816,0.9816,0.9816,0.7484,0.7518,0.7583
238488,1,,,,0.0124,0.0126,,,,,...,,,0.0086,"Stone, brick",0.9771,0.9771,0.9772,,,


In [82]:
# counting the number of nulls in each column
new_null= print(investigate_df.isnull().sum())

TARGET                               0
COMMONAREA_AVG                  166602
COMMONAREA_MODE                 166602
COMMONAREA_MEDI                 166602
APARTMENTS_AVG                  121032
APARTMENTS_MODE                 121032
BASEMENTAREA_AVG                139598
BASEMENTAREA_MEDI               139598
BASEMENTAREA_MODE               139598
LIVINGAPARTMENTS_AVG            162980
LIVINGAPARTMENTS_MEDI           162980
LIVINGAPARTMENTS_MODE           162980
LIVINGAREA_MEDI                 119699
LIVINGAREA_MODE                 119699
NONLIVINGAPARTMENTS_MEDI        165529
NONLIVINGAPARTMENTS_MODE        165529
NONLIVINGAREA_AVG               131524
NONLIVINGAREA_MODE              131524
TOTALAREA_MODE                  115092
WALLSMATERIAL_MODE              121195
YEARS_BEGINEXPLUATATION_AVG     116319
YEARS_BEGINEXPLUATATION_MEDI    116319
YEARS_BEGINEXPLUATATION_MODE    116319
YEARS_BUILD_AVG                 158526
YEARS_BUILD_MEDI                158526
YEARS_BUILD_MODE         

In [83]:
# creating correlation matrices to determine if columns with multiple null values have any correlation with target variable
investigate_df.corr()

Unnamed: 0,TARGET,COMMONAREA_AVG,COMMONAREA_MODE,COMMONAREA_MEDI,APARTMENTS_AVG,APARTMENTS_MODE,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,BASEMENTAREA_MODE,LIVINGAPARTMENTS_AVG,...,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_AVG,NONLIVINGAREA_MODE,TOTALAREA_MODE,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE
TARGET,1.0,-0.018096,-0.016189,-0.018285,-0.030042,-0.027867,-0.023284,-0.022639,-0.020627,-0.025044,...,-2.2e-05,-0.012694,-0.011923,-0.033588,-0.011833,-0.012174,-0.010894,-0.02208,-0.022219,-0.022158
COMMONAREA_AVG,-0.018096,1.0,0.977732,0.995388,0.536131,0.51244,0.399525,0.398491,0.382132,0.529098,...,0.103665,0.224694,0.214593,0.551336,0.084092,0.066038,0.042478,0.232299,0.231884,0.229168
COMMONAREA_MODE,-0.016189,0.977732,1.0,0.981322,0.528461,0.525522,0.395579,0.399015,0.400542,0.520743,...,0.109151,0.219014,0.226181,0.542254,0.079164,0.06123,0.04059,0.224045,0.22361,0.227319
COMMONAREA_MEDI,-0.018285,0.995388,0.981322,1.0,0.537304,0.515839,0.40109,0.401003,0.38525,0.530137,...,0.104967,0.224918,0.216897,0.551361,0.084311,0.066244,0.042535,0.233035,0.233014,0.230472
APARTMENTS_AVG,-0.030042,0.536131,0.528461,0.537304,1.0,0.973197,0.680625,0.680263,0.662479,0.944137,...,0.187692,0.297423,0.282988,0.892628,0.10018,0.099653,0.094905,0.341443,0.340297,0.33981
APARTMENTS_MODE,-0.027867,0.51244,0.525522,0.515839,0.973197,1.0,0.666895,0.669884,0.679389,0.908986,...,0.186637,0.285522,0.292363,0.864329,0.10102,0.099923,0.101325,0.323679,0.3228,0.330349
BASEMENTAREA_AVG,-0.023284,0.399525,0.395579,0.40109,0.680625,0.666895,1.0,0.993796,0.972864,0.646116,...,0.094141,0.261924,0.25459,0.674147,0.083492,0.074039,0.055809,0.249945,0.249173,0.250573
BASEMENTAREA_MEDI,-0.022639,0.398491,0.399015,0.401003,0.680263,0.669884,0.993796,1.0,0.977886,0.646242,...,0.094933,0.261721,0.257422,0.671398,0.082833,0.073327,0.055252,0.247807,0.247142,0.249309
BASEMENTAREA_MODE,-0.020627,0.382132,0.400542,0.38525,0.662479,0.679389,0.972864,0.977886,1.0,0.624566,...,0.097377,0.253029,0.267113,0.651382,0.078318,0.068746,0.054255,0.235018,0.234446,0.245126
LIVINGAPARTMENTS_AVG,-0.025044,0.529098,0.520743,0.530137,0.944137,0.908986,0.646116,0.646242,0.624566,1.0,...,0.141716,0.289581,0.273138,0.847833,0.138047,0.11341,0.07737,0.334596,0.33338,0.332566


In [84]:
# removing additional columns from dataframe
updated_df= df2.drop(["COMMONAREA_AVG", "COMMONAREA_MODE", "COMMONAREA_MEDI", "APARTMENTS_AVG", 
                     "APARTMENTS_MODE", "BASEMENTAREA_AVG", "BASEMENTAREA_MEDI", "BASEMENTAREA_MODE", 
                     "LIVINGAPARTMENTS_AVG", "LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MEDI",
                     "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_AVG",
                     "NONLIVINGAREA_MODE", "TOTALAREA_MODE", "WALLSMATERIAL_MODE", "YEARS_BEGINEXPLUATATION_AVG",
                     "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_AVG", "YEARS_BUILD_MEDI",
                     "YEARS_BUILD_MODE"], axis=1)

updated_df

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,OCCUPATION_TYPE,ORGANIZATION_TYPE,OWN_CAR_AGE,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,SK_ID_CURR,TARGET,WEEKDAY_APPR_PROCESS_START,_id
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,Laborers,Business Entity Type 3,,0.018801,2,2,100002,1,WEDNESDAY,6288ff0e5824e1f88b4e59b1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Core staff,School,,0.003541,1,1,100003,0,MONDAY,6288ff0e5824e1f88b4e59b2
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Laborers,Government,26.0,0.010032,2,2,100004,0,MONDAY,6288ff0e5824e1f88b4e59b3
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,Laborers,Business Entity Type 3,,0.008019,2,2,100006,0,WEDNESDAY,6288ff0e5824e1f88b4e59b4
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Core staff,Religion,,0.028663,2,2,100007,0,THURSDAY,6288ff0e5824e1f88b4e59b5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238485,27558.0,254700.0,225000.0,157500.0,,,,,,,...,Sales staff,Services,,0.032561,1,1,456251,0,THURSDAY,628902df5824e1f88b530ae3
238486,12001.5,269550.0,225000.0,72000.0,,,,,,,...,,XNA,,0.025164,2,2,456252,0,MONDAY,628902df5824e1f88b530ae4
238487,29979.0,677664.0,585000.0,153000.0,0.0,1.0,1.0,0.0,0.0,1.0,...,Managers,School,,0.005002,3,3,456253,0,THURSDAY,628902df5824e1f88b530ae5
238488,20205.0,370107.0,319500.0,171000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Laborers,Business Entity Type 1,,0.005313,2,2,456254,1,WEDNESDAY,628902df5824e1f88b530ae6


In [85]:
updated_df2 = updated_df.dropna(axis=0)

updated_df2

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,OCCUPATION_TYPE,ORGANIZATION_TYPE,OWN_CAR_AGE,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,SK_ID_CURR,TARGET,WEEKDAY_APPR_PROCESS_START,_id
71,24435.0,573628.5,463500.0,103500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,Laborers,Business Entity Type 3,22.0,0.009657,2,2,100083,0,MONDAY,6288ff0e5824e1f88b4e59f8
124,16789.5,260725.5,198000.0,202500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,Laborers,Self-employed,8.0,0.018850,2,2,100145,0,TUESDAY,6288ff0e5824e1f88b4e5a2d
152,53329.5,675000.0,675000.0,202500.0,0.0,0.0,1.0,0.0,0.0,4.0,...,Managers,Trade: type 7,4.0,0.031329,2,2,100179,0,MONDAY,6288ff0e5824e1f88b4e5a49
161,24781.5,263686.5,238500.0,162000.0,0.0,0.0,0.0,0.0,0.0,3.0,...,Laborers,Government,3.0,0.022625,2,2,100190,0,THURSDAY,6288ff0e5824e1f88b4e5a52
255,31032.0,1019205.0,774000.0,225000.0,0.0,0.0,6.0,0.0,0.0,1.0,...,Laborers,Business Entity Type 3,9.0,0.072508,1,1,100295,1,MONDAY,6288ff0e5824e1f88b4e5ab0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238337,16051.5,361462.5,274500.0,112500.0,0.0,0.0,2.0,0.0,0.0,0.0,...,Security staff,Government,9.0,0.025164,2,2,456083,0,WEDNESDAY,628902df5824e1f88b530a4f
238338,21906.0,675000.0,675000.0,99000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Security staff,Security,14.0,0.020246,3,3,456084,0,FRIDAY,628902df5824e1f88b530a50
238386,47673.0,711454.5,643500.0,261000.0,0.0,0.0,1.0,0.0,0.0,1.0,...,Sales staff,Self-employed,4.0,0.018850,2,2,456140,1,MONDAY,628902df5824e1f88b530a80
238435,15075.0,270000.0,270000.0,94500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,Cleaning staff,Business Entity Type 2,5.0,0.028663,2,2,456195,0,TUESDAY,628902df5824e1f88b530ab1


## Data Preprocessing

In [92]:
X = df2.drop('TARGET',1)
y = df['TARGET']

In [95]:
X = pd.get_dummies(X)
X.head()

MemoryError: Unable to allocate 53.0 GiB for an array with shape (238490, 238490) and data type uint8

## Exploring Different Methods of Imputation

In [96]:
# using MICE (multiple imputation by chained equation)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=10, verbose=2, imputation_order='roman',random_state=0)
X=imp.fit_transform(X)

ValueError: could not convert string to float: 'M'

In [89]:
# using KNNimputer
from sklearn.impute import KNNImputer
knn = KNNImputer(n_neighbors=2, add_indicator=True)
knn.fit(X)
knn.transform(X)

ValueError: could not convert string to float: 'M'