In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))



In [2]:
directory = "/kaggle/input/home-credit-default-risk"
sample_submission = pd.read_csv(directory + "/sample_submission.csv")
sample_submission

application_train = pd.read_csv(directory + "/application_train.csv")
credit_card_balance = pd.read_csv(directory + "/credit_card_balance.csv")
credit_card_balance.head(5)

application_train.columns
#Credit_Bureau ->credit ratings of individuals

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [3]:
print(f"Shape of training data:{application_train.shape}")

Shape of training data:(307511, 122)


Since we have so many columns, lets try to filter some out.

We can first take a look at which columns have the most NaN values.

In [4]:
def check_nan_columns(df):
    for col in df.columns:
        print(f"Number of NaN values in {col} : {df[col].isnull().sum()}")

In [5]:
df_train = application_train
check_nan_columns(df_train)

Number of NaN values in SK_ID_CURR : 0
Number of NaN values in TARGET : 0
Number of NaN values in NAME_CONTRACT_TYPE : 0
Number of NaN values in CODE_GENDER : 0
Number of NaN values in FLAG_OWN_CAR : 0
Number of NaN values in FLAG_OWN_REALTY : 0
Number of NaN values in CNT_CHILDREN : 0
Number of NaN values in AMT_INCOME_TOTAL : 0
Number of NaN values in AMT_CREDIT : 0
Number of NaN values in AMT_ANNUITY : 12
Number of NaN values in AMT_GOODS_PRICE : 278
Number of NaN values in NAME_TYPE_SUITE : 1292
Number of NaN values in NAME_INCOME_TYPE : 0
Number of NaN values in NAME_EDUCATION_TYPE : 0
Number of NaN values in NAME_FAMILY_STATUS : 0
Number of NaN values in NAME_HOUSING_TYPE : 0
Number of NaN values in REGION_POPULATION_RELATIVE : 0
Number of NaN values in DAYS_BIRTH : 0
Number of NaN values in DAYS_EMPLOYED : 0
Number of NaN values in DAYS_REGISTRATION : 0
Number of NaN values in DAYS_ID_PUBLISH : 0
Number of NaN values in OWN_CAR_AGE : 202929
Number of NaN values in FLAG_MOBIL : 0

In [6]:
columns_to_drop = []
for col in df_train.columns:
    if df_train[col].isnull().sum() > 1000:
        columns_to_drop.append(col)

Lets drop some of the rows where there exist alot of NaN values.

In [7]:
columns_to_drop

['NAME_TYPE_SUITE',
 'OWN_CAR_AGE',
 'OCCUPATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',


In [8]:
df_train.drop(columns_to_drop, axis = 1, inplace=True)

In [9]:
df_train.columns 

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_2',
       'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3',
       'FLAG_DOCUMENT_4', 'FLAG_DO

So now we have successfully dropped the columns with a lot of NaN values. Lets take a look into the columns a little deeper to see which columns may not be useful for our model.

In [10]:
check_nan_columns(df_train)

Number of NaN values in SK_ID_CURR : 0
Number of NaN values in TARGET : 0
Number of NaN values in NAME_CONTRACT_TYPE : 0
Number of NaN values in CODE_GENDER : 0
Number of NaN values in FLAG_OWN_CAR : 0
Number of NaN values in FLAG_OWN_REALTY : 0
Number of NaN values in CNT_CHILDREN : 0
Number of NaN values in AMT_INCOME_TOTAL : 0
Number of NaN values in AMT_CREDIT : 0
Number of NaN values in AMT_ANNUITY : 12
Number of NaN values in AMT_GOODS_PRICE : 278
Number of NaN values in NAME_INCOME_TYPE : 0
Number of NaN values in NAME_EDUCATION_TYPE : 0
Number of NaN values in NAME_FAMILY_STATUS : 0
Number of NaN values in NAME_HOUSING_TYPE : 0
Number of NaN values in REGION_POPULATION_RELATIVE : 0
Number of NaN values in DAYS_BIRTH : 0
Number of NaN values in DAYS_EMPLOYED : 0
Number of NaN values in DAYS_REGISTRATION : 0
Number of NaN values in DAYS_ID_PUBLISH : 0
Number of NaN values in FLAG_MOBIL : 0
Number of NaN values in FLAG_EMP_PHONE : 0
Number of NaN values in FLAG_WORK_PHONE : 0
Numb

EXT_SOURCE_2 contains normalized score from external data source. Looking at the number of NaN values, we can conclude that the non-normalized dataset may also contain similar number of NaN  values. As we do not know what the external data source is, it makes us hard to work with this column. Therefore we should drop it.

We do not need the ID of the client as they do not determine whether the client is able to make the repayment of loans. So lets drop that column as well.

We will also drop columns 'ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE' , 'NAME_FAMILIY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'CODE_GENDER' as it does not represent the ability of client to repay his/her loan. Income values would be more applicable.

In [11]:
df_train.drop(["EXT_SOURCE_2", "SK_ID_CURR", 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'CODE_GENDER'], axis = 1, inplace = True)
df_train.drop(list(df_train.filter(regex='FLAG_DOCUMENT')), axis=1, inplace=True)
df_train

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,DAYS_LAST_PHONE_CHANGE
0,1,Cash loans,N,Y,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,2,2,10,0,0,0,0,0,0,-1134.0
1,0,Cash loans,N,N,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1,1,11,0,0,0,0,0,0,-828.0
2,0,Revolving loans,Y,Y,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,2,2,9,0,0,0,0,0,0,-815.0
3,0,Cash loans,N,Y,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,2,2,17,0,0,0,0,0,0,-617.0
4,0,Cash loans,N,Y,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,2,2,11,0,0,0,0,1,1,-1106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,N,N,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,1,1,15,0,0,0,0,0,0,-273.0
307507,0,Cash loans,N,Y,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,2,2,8,0,0,0,0,0,0,0.0
307508,0,Cash loans,N,Y,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,3,3,9,0,0,0,0,1,1,-1909.0
307509,1,Cash loans,N,Y,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,2,2,9,0,0,0,1,1,0,-322.0


Since we have columns with NaN values, let's try to address them.

For AMT_GOODS_PRICE, it represents price of goods for which the loan is given. With a higher AMT_GOODS_PRICE, it means the consumer has taken a larger loan before to make bigger purchases.  If the value is NaN, it is most likely that the consumer has not made such a loan before, hence we will fill up that column with 0s.

For AMT_ANNUITY, it represents a loan with monthly repayment. If the value is NaN, it is most likely the consumer has no loan with monthly repayment. Therefore, we will fill it up that column with 0s as well.

For CNT_FAM_MEMBERS, it represents the number of family members the client has. Since the client did not declare, we shall assume that he has no family members and we shal also fill it up with 0.

For DAYS_LAST_PHONE_CHANGE, it is rather impossible for someone to not change their phone, especially since phones dont last a lifetime. We shall fill up that column with the average number of days since the last phone change of all the clients.

In [12]:
df_train[['AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS']] = df_train[['AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS']].fillna(value = 0)
days_last_phone_change_mean = df_train['DAYS_LAST_PHONE_CHANGE'].mean(skipna= True)
df_train['DAYS_LAST_PHONE_CHANGE'] = df_train['DAYS_LAST_PHONE_CHANGE'].fillna(value = days_last_phone_change_mean)

In [13]:
check_nan_columns(df_train)

Number of NaN values in TARGET : 0
Number of NaN values in NAME_CONTRACT_TYPE : 0
Number of NaN values in FLAG_OWN_CAR : 0
Number of NaN values in FLAG_OWN_REALTY : 0
Number of NaN values in CNT_CHILDREN : 0
Number of NaN values in AMT_INCOME_TOTAL : 0
Number of NaN values in AMT_CREDIT : 0
Number of NaN values in AMT_ANNUITY : 0
Number of NaN values in AMT_GOODS_PRICE : 0
Number of NaN values in REGION_POPULATION_RELATIVE : 0
Number of NaN values in DAYS_BIRTH : 0
Number of NaN values in DAYS_EMPLOYED : 0
Number of NaN values in DAYS_REGISTRATION : 0
Number of NaN values in DAYS_ID_PUBLISH : 0
Number of NaN values in FLAG_MOBIL : 0
Number of NaN values in FLAG_EMP_PHONE : 0
Number of NaN values in FLAG_WORK_PHONE : 0
Number of NaN values in FLAG_CONT_MOBILE : 0
Number of NaN values in FLAG_PHONE : 0
Number of NaN values in FLAG_EMAIL : 0
Number of NaN values in CNT_FAM_MEMBERS : 0
Number of NaN values in REGION_RATING_CLIENT : 0
Number of NaN values in REGION_RATING_CLIENT_W_CITY : 0


Great! Now we have addressed all the missing values in our dataframe. Before we create our model for deep learning, let us take a look at some of the features we can improve on, as well as the range of values in each column to see if we need to do any normalization. Normalizing our data helps our model to converge faster and provide numerical stability.

In [14]:
# We look at credit of the client with respect to his/her income.
df_train['LOAN_INCOME_RATIO'] = df_train['AMT_CREDIT'] /  df_train['AMT_INCOME_TOTAL']

# It doesnt make much sense to look at the absolute value of loan, so we look at the amount of loan relative to the clients income to get a bigger picture.
df_train['ANNUITY_INCOME_RATIO'] = df_train['AMT_ANNUITY'] / df_train['AMT_INCOME_TOTAL']

df_train.drop(['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'], inplace=True, axis =1)

In [15]:
for col in df_train.columns:
    print(f"{col}: Max value: {max(df_train[col])}, Min Value {min(df_train[col])}")

TARGET: Max value: 1, Min Value 0
NAME_CONTRACT_TYPE: Max value: Revolving loans, Min Value Cash loans
FLAG_OWN_CAR: Max value: Y, Min Value N
FLAG_OWN_REALTY: Max value: Y, Min Value N
CNT_CHILDREN: Max value: 19, Min Value 0
AMT_GOODS_PRICE: Max value: 4050000.0, Min Value 0.0
REGION_POPULATION_RELATIVE: Max value: 0.072508, Min Value 0.00029
DAYS_BIRTH: Max value: -7489, Min Value -25229
DAYS_EMPLOYED: Max value: 365243, Min Value -17912
DAYS_REGISTRATION: Max value: 0.0, Min Value -24672.0
DAYS_ID_PUBLISH: Max value: 0, Min Value -7197
FLAG_MOBIL: Max value: 1, Min Value 0
FLAG_EMP_PHONE: Max value: 1, Min Value 0
FLAG_WORK_PHONE: Max value: 1, Min Value 0
FLAG_CONT_MOBILE: Max value: 1, Min Value 0
FLAG_PHONE: Max value: 1, Min Value 0
FLAG_EMAIL: Max value: 1, Min Value 0
CNT_FAM_MEMBERS: Max value: 20.0, Min Value 0.0
REGION_RATING_CLIENT: Max value: 3, Min Value 1
REGION_RATING_CLIENT_W_CITY: Max value: 3, Min Value 1
HOUR_APPR_PROCESS_START: Max value: 23, Min Value 0
REG_REGI

In [16]:
df_train_z_scaled = df_train.copy()
columns = ['AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE']
for col in columns:
    df_train_z_scaled[col] = (df_train_z_scaled[col] - df_train_z_scaled[col].mean()) / df_train_z_scaled[col].std()
df_train[columns] = df_train_z_scaled[columns]

for col in df_train.columns:
    print(f"{col}: Max value: {max(df_train[col])}, Min Value {min(df_train[col])}")

TARGET: Max value: 1, Min Value 0
NAME_CONTRACT_TYPE: Max value: Revolving loans, Min Value Cash loans
FLAG_OWN_CAR: Max value: Y, Min Value N
FLAG_OWN_REALTY: Max value: Y, Min Value N
CNT_CHILDREN: Max value: 19, Min Value 0
AMT_GOODS_PRICE: Max value: 9.501541715786653, Min Value -1.4552498947876376
REGION_POPULATION_RELATIVE: Max value: 0.072508, Min Value 0.00029
DAYS_BIRTH: Max value: 1.9587574093532087, Min Value -2.106331090371306
DAYS_EMPLOYED: Max value: 2.1336140055980115, Min Value -0.5784930276291609
DAYS_REGISTRATION: Max value: 1.4153509007282798, Min Value -5.587997419990533
DAYS_ID_PUBLISH: Max value: 1.9836374454922983, Min Value -2.7843230713914195
FLAG_MOBIL: Max value: 1, Min Value 0
FLAG_EMP_PHONE: Max value: 1, Min Value 0
FLAG_WORK_PHONE: Max value: 1, Min Value 0
FLAG_CONT_MOBILE: Max value: 1, Min Value 0
FLAG_PHONE: Max value: 1, Min Value 0
FLAG_EMAIL: Max value: 1, Min Value 0
CNT_FAM_MEMBERS: Max value: 20.0, Min Value 0.0
REGION_RATING_CLIENT: Max value: 

In [17]:
for col in df_train.columns:
    print(f"{col}: {type(df_train[col][0])}")

TARGET: <class 'numpy.int64'>
NAME_CONTRACT_TYPE: <class 'str'>
FLAG_OWN_CAR: <class 'str'>
FLAG_OWN_REALTY: <class 'str'>
CNT_CHILDREN: <class 'numpy.int64'>
AMT_GOODS_PRICE: <class 'numpy.float64'>
REGION_POPULATION_RELATIVE: <class 'numpy.float64'>
DAYS_BIRTH: <class 'numpy.float64'>
DAYS_EMPLOYED: <class 'numpy.float64'>
DAYS_REGISTRATION: <class 'numpy.float64'>
DAYS_ID_PUBLISH: <class 'numpy.float64'>
FLAG_MOBIL: <class 'numpy.int64'>
FLAG_EMP_PHONE: <class 'numpy.int64'>
FLAG_WORK_PHONE: <class 'numpy.int64'>
FLAG_CONT_MOBILE: <class 'numpy.int64'>
FLAG_PHONE: <class 'numpy.int64'>
FLAG_EMAIL: <class 'numpy.int64'>
CNT_FAM_MEMBERS: <class 'numpy.float64'>
REGION_RATING_CLIENT: <class 'numpy.int64'>
REGION_RATING_CLIENT_W_CITY: <class 'numpy.int64'>
HOUR_APPR_PROCESS_START: <class 'numpy.int64'>
REG_REGION_NOT_LIVE_REGION: <class 'numpy.int64'>
REG_REGION_NOT_WORK_REGION: <class 'numpy.int64'>
LIVE_REGION_NOT_WORK_REGION: <class 'numpy.int64'>
REG_CITY_NOT_LIVE_CITY: <class 'nump

Since neural network only accepts numerical values as input. We have to convert the columns that are strings to integers.

For NAME_CONTRACT_TYPE and CODE_GENDER, FLAG_OWN_CAR, FLAG_OWN_REALTY, we shall do one hot encoding.

In [18]:
def one_hot_encoder(col, df, col_name):
    encoder = OneHotEncoder()
    col = col.values.reshape(-1,1)
    col = encoder.fit_transform(col).toarray()
    temp_df = pd.DataFrame(col, columns = encoder.get_feature_names_out([col_name]))
    df = pd.concat([df, temp_df], axis = 1)
    return df

columns_to_encode = ['NAME_CONTRACT_TYPE','FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
for col in columns_to_encode:
    df_train = one_hot_encoder(df_train[col], df_train, col)
    df_train = df_train.drop([col], axis = 1)
    
for col in df_train.columns:
    print(f"{col}: {type(df_train[col][0])}")

TARGET: <class 'numpy.int64'>
CNT_CHILDREN: <class 'numpy.int64'>
AMT_GOODS_PRICE: <class 'numpy.float64'>
REGION_POPULATION_RELATIVE: <class 'numpy.float64'>
DAYS_BIRTH: <class 'numpy.float64'>
DAYS_EMPLOYED: <class 'numpy.float64'>
DAYS_REGISTRATION: <class 'numpy.float64'>
DAYS_ID_PUBLISH: <class 'numpy.float64'>
FLAG_MOBIL: <class 'numpy.int64'>
FLAG_EMP_PHONE: <class 'numpy.int64'>
FLAG_WORK_PHONE: <class 'numpy.int64'>
FLAG_CONT_MOBILE: <class 'numpy.int64'>
FLAG_PHONE: <class 'numpy.int64'>
FLAG_EMAIL: <class 'numpy.int64'>
CNT_FAM_MEMBERS: <class 'numpy.float64'>
REGION_RATING_CLIENT: <class 'numpy.int64'>
REGION_RATING_CLIENT_W_CITY: <class 'numpy.int64'>
HOUR_APPR_PROCESS_START: <class 'numpy.int64'>
REG_REGION_NOT_LIVE_REGION: <class 'numpy.int64'>
REG_REGION_NOT_WORK_REGION: <class 'numpy.int64'>
LIVE_REGION_NOT_WORK_REGION: <class 'numpy.int64'>
REG_CITY_NOT_LIVE_CITY: <class 'numpy.int64'>
REG_CITY_NOT_WORK_CITY: <class 'numpy.int64'>
LIVE_CITY_NOT_WORK_CITY: <class 'nump

Great! We can now begin creating our model for deep learning!

In [19]:
X = df_train.loc[:, df_train.columns!='TARGET']
y = df_train['TARGET']
model = Sequential([
    Dense(units = 25, activation = 'relu'),
    Dense(units = 15, activation = 'relu'),
    Dense(units = 5, activation = 'relu'),
    Dense(units = 1, activation = 'linear') # we only have 1 output unit since this is a classification problem.
])

X = df_train.loc[:, df_train.columns!='TARGET']
y = df_train['TARGET']
# model = Sequential([
#     Dense(units = 25, activation = 'relu'),
#     Dense(units = 15, activation = 'relu'),
#     Dense(units = 5, activation = 'relu'),
#     Dense(units = 1, activation = 'sigmoid') # we only have 1 output unit since this is a classification problem.
# ])

# model.compile(loss = tf.keras.losses.BinaryCrossentropy(),
#               optimizer = tf.keras.optimizers.Adam(learning_rate = 0.002))

# model.fit(X,y, epochs = 20)
# logits = model.predict(X_test)
model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
              optimizer = tf.keras.optimizers.Adam(learning_rate = 0.002))

model.fit(X,y, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7933925432e0>

In [20]:
application_test =  pd.read_csv(directory + "/application_test.csv")
X_test = application_test.copy()
X_test.drop(["EXT_SOURCE_2", "SK_ID_CURR", 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'CODE_GENDER'] + columns_to_drop, axis = 1, inplace = True) # drop the columns that we dropped for the training set
columns_to_encode = ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

X_test['LOAN_INCOME_RATIO'] = X_test['AMT_CREDIT'] /  X_test['AMT_INCOME_TOTAL']
X_test['ANNUITY_INCOME_RATIO'] = X_test['AMT_ANNUITY'] / X_test['AMT_INCOME_TOTAL']
X_test.drop(['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'], inplace=True, axis=1)

for col in columns_to_encode:
    X_test = one_hot_encoder(X_test[col], X_test, col)
    X_test = X_test.drop([col], axis = 1)

X_test.drop(list(X_test.filter(regex='FLAG_DOCUMENT')), axis=1, inplace=True)

In [21]:
logits = model.predict(X_test)
probabilities = tf.nn.sigmoid(logits)
probabilities
# probabilities = model.predict(X_test)
# print(probabilities)



<tf.Tensor: shape=(48744, 1), dtype=float32, numpy=
array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)>

In [22]:
output = pd.DataFrame({"SK_ID_CURR" : application_test["SK_ID_CURR"], 'TARGET': probabilities.numpy().reshape(-1)})
output.to_csv('submission.csv', index = False)