<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Home_Credit_Default_Risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ADD KEYS

In [None]:
!rm -rf *

import os
os.environ['KAGGLE_USERNAME'] = "robimalco"
os.environ['KAGGLE_KEY'] = ""
!pip install -q kaggle
!kaggle competitions download -c home-credit-default-risk

# START SETUP

In [None]:
# Extract only the three tables that the LOAD DATA cell actually reads; the
# other archives stay zipped because their read_csv calls are commented out too.
!unzip application_test.csv.zip
!unzip application_train.csv.zip
!unzip previous_application.csv.zip
# !unzip POS_CASH_balance.csv.zip
# !unzip bureau.csv.zip
# !unzip bureau_balance.csv.zip
# !unzip credit_card_balance.csv.zip
# !unzip installments_payments.csv.zip

In [None]:
!pip install torch==1.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

import numpy as np
import pandas as pd
import gc

import datetime
import random
import string

import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from matplotlib import pyplot as plt

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
  print(torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")

In [5]:
# Notebook-wide pandas settings, applied in one call as (option, value) pairs:
# never truncate columns, rows or cell contents, print floats with five
# decimals, and silence chained-assignment warnings.
pd.set_option(
  'display.max_columns', None,
  'display.max_rows', None,
  'display.max_colwidth', None,
  'display.float_format', lambda x: '%.5f' % x,
  'mode.chained_assignment', None,
)

In [None]:
!pip install --upgrade gspread
from google.colab import auth, drive, files
import gspread
from oauth2client.client import GoogleCredentials
drive.mount('/drive')
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())
wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1M-CqsTbBu7ScY4mZBcpI8kPbI7F-BE8aPTC4UknumYk/edit#gid=0')
sheet = wb.worksheet('Data')
def get_next_row(worksheet):
    """Return, as a string, the count of non-empty cells in column 1 plus one.

    `worksheet` must expose `col_values(1)`; falsy entries ('' or None) in the
    returned list are not counted.
    """
    filled_cells = [value for value in worksheet.col_values(1) if value]
    return str(len(filled_cells) + 1)

# UTILITIES

In [7]:
def generate_timestamp():
  """Return datetime.datetime.now() formatted as 'YYYY-MM-DD HH:MM'.

  Every component is zero-padded to at least two digits.
  """
  now = datetime.datetime.now()
  date_part = '{:02d}-{:02d}-{:02d}'.format(now.year, now.month, now.day)
  time_part = '{:02d}:{:02d}'.format(now.hour, now.minute)
  return date_part + ' ' + time_part

In [8]:
def order_columns_alphabetically(input_df):
  """Return input_df with its columns reordered by case-insensitive name."""
  return input_df[sorted(input_df.columns, key=str.casefold)]

In [9]:
def smart_overview(input_df):
  """Build a per-column summary table for input_df.

  For every column the table reports its dtype, the number of unique values,
  and the number and percentage of missing values. For int64/float64 columns
  it also reports the correlation with 'TARGET' ('/' when input_df has no
  'TARGET' column), min, max, mean, median and the 0.5 quantile; every other
  dtype gets empty strings in those rows.

  Returns the summary transposed (one column per input column, one row per
  statistic) with the columns ordered by order_columns_alphabetically.
  """
  base_stat_names = ('Types', 'Unique', 'Missing', 'Missing%')
  numeric_stat_names = ('CorrTarget', 'Min', 'Max', 'Mean', 'Median', 'Quantile')
  # One list per statistic, filled in input column order.
  summary = {name: [] for name in base_stat_names + numeric_stat_names}
  has_target = 'TARGET' in input_df.columns
  row_count = input_df.shape[0]

  for column in input_df.columns:
    values = input_df[column]
    column_dtype = input_df.dtypes[column]
    missing_count = values.isnull().sum()

    summary['Types'].append(column_dtype)
    summary['Unique'].append(len(values.unique()))
    summary['Missing'].append(missing_count)
    summary['Missing%'].append(round((missing_count / row_count), 3) * 100)

    if column_dtype == np.int64 or column_dtype == np.float64:
      corr_with_target = round(values.corr(input_df['TARGET']), 3) if has_target else '/'
      numeric_stats = (
        corr_with_target,
        values.min(),
        values.max(),
        values.mean(),
        values.median(),
        values.quantile(0.5),
      )
    else:
      # Non-numeric columns: leave the numeric statistics blank.
      numeric_stats = ('',) * len(numeric_stat_names)
    for stat_name, stat_value in zip(numeric_stat_names, numeric_stats):
      summary[stat_name].append(stat_value)

  explore_df = pd.DataFrame({'Columns': input_df.columns, **summary})
  explore_df = explore_df.set_index('Columns')
  return order_columns_alphabetically(explore_df.transpose())

# Notes

In [None]:
# application_{train|test}.csv --> main table, static data for all applications. One row represents one loan in our data sample.
# bureau.csv --> client's previous credits, for every loan in our sample, there are as many rows as number of credits the client had.
# bureau_balance.csv --> monthly balances of previous credits, one row for each month.
# POS_CASH_balance.csv --> monthly balance snapshots of previous point of sales and cash loans that the applicant had, one row for each month.
# credit_card_balance.csv --> monthly balance snapshots of previous credit cards, one row for each month.
# previous_application.csv --> all previous applications for Home Credit loans of clients who have loans.
# installments_payments.csv --> repayment history for the previously disbursed credits.

# Presumably the competition's column dictionary (TODO confirm the file is
# present after the download step). It is only consulted by ad-hoc lookups such
# as the commented-out query below, which filters on 'Table' and sorts by 'Row'.
columns_descriptions_df = pd.read_csv('HomeCredit_columns_description.csv', engine='python')
# columns_descriptions_df[columns_descriptions_df['Table'] == 'previous_application.csv'].sort_values(by=['Row'])

- The number of hidden neurons should be between the size of the input layer and the size of the output layer.					
- The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.					
- The number of hidden neurons should be less than twice the size of the input layer.
- I = 2000
- H = 1321
- O = 1

# SET HYPERPARAMETERS

In [10]:
# Fraction of the training rows held out for validation (train_test_split test_size).
hp_test_size = 0.2
# Number of passes over the training loader.
hp_epochs = 20
# Mini-batch size for the train and validation DataLoaders.
# NOTE: spelled `hr_` (not `hp_`); the DataLoader cells and the sheet-logging
# cell reference this exact name, so rename it everywhere or not at all.
hr_batch_size = 320
# Learning rate passed to the Adam optimizer.
hp_lr= 0.000005
# Dropout probability applied to the concatenated embeddings inside Model.
hp_emb_drop = 0.04
# Width of each hidden Linear layer, in order.
hp_layers = [800, 350]
# Dropout probability after each hidden layer (indexed in step with hp_layers).
hp_ps = [0.001,0.01]

# LOAD DATA

In [11]:
# Shuffle the training rows with a fixed seed so repeated runs start from the
# same row order (the unseeded shuffle made every run differ before any model
# randomness was involved).
application_train_df = pd.read_csv('application_train.csv').sample(frac=1, random_state=42)
application_test_df = pd.read_csv('application_test.csv')
previous_application_df = pd.read_csv('previous_application.csv')
# The remaining tables are not used yet; their archives are not extracted either.
# bureau_df = pd.read_csv('bureau.csv')
# bureau_balance_df = pd.read_csv('bureau_balance.csv')
# pos_cash_balance_df = pd.read_csv('POS_CASH_balance.csv')
# credit_card_balance_df = pd.read_csv('credit_card_balance.csv')
# installments_payments_df = pd.read_csv('installments_payments.csv')

In [12]:
# Tag every row with the file it came from so the combined frame can be split
# back into train and test after the shared preprocessing steps.
application_train_df['CSV_SOURCE'] = 'application_train.csv'
application_test_df['CSV_SOURCE'] = 'application_test.csv'
# Stack train on top of test; test rows have no TARGET value.
df = pd.concat([application_train_df, application_test_df])

In [None]:
# Correlation of each EXT_SOURCE_* score with TARGET, sorted ascending
# (the TARGET entry is its correlation with itself).
correlations = df[['EXT_SOURCE_1','EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']].corr()['TARGET'].sort_values()
correlations

# MANAGE previous_application.csv

In [14]:
# One row per applicant (SK_ID_CURR) holding the distinct statuses of all of
# their previous applications, comma-joined. The statuses are sorted so the
# joined string is the same on every run (iteration order of a set of strings
# can differ between interpreter sessions), and the former join/split
# round-trip is dropped because it did not change the set of statuses.
temp_previous_df = previous_application_df.groupby('SK_ID_CURR', as_index=False).agg(
  {'NAME_CONTRACT_STATUS': lambda statuses: ','.join(sorted(set(statuses)))})
# '1' when 'Approved' is the applicant's only distinct status.
temp_previous_df['has_only_approved'] = np.where(temp_previous_df['NAME_CONTRACT_STATUS'] == 'Approved', '1', '0')
# '1' when at least one previous application has status 'Refused'.
temp_previous_df['has_been_rejected'] = np.where(temp_previous_df['NAME_CONTRACT_STATUS'].str.contains('Refused'), '1', '0')

# JOIN DATA

In [15]:
# Left join: every application row is kept; applicants without any previous
# application get NaN in the joined columns.
df = pd.merge(df, temp_previous_df, on='SK_ID_CURR', how='left')

# CREATE CUSTOM COLUMNS

In [16]:
#################################################### total_amt_req_credit_bureau
# Weighted sum of the credit-bureau enquiry counts; the shorter the look-back
# window, the larger the weight.
df['total_amt_req_credit_bureau'] = (
  df['AMT_REQ_CREDIT_BUREAU_YEAR'] * 1 +
  df['AMT_REQ_CREDIT_BUREAU_QRT'] * 2 +
  df['AMT_REQ_CREDIT_BUREAU_MON'] * 8 +
  df['AMT_REQ_CREDIT_BUREAU_WEEK'] * 16 +
  df['AMT_REQ_CREDIT_BUREAU_DAY'] * 32 +
  df['AMT_REQ_CREDIT_BUREAU_HOUR'] * 64)

#######################################################################  has_job
# '0' for the income types that imply no current job, '1' otherwise.
# (The flag used to be inverted: pensioners, students and the unemployed were
# marked '1'.)
df['has_job'] = np.where(df['NAME_INCOME_TYPE'].isin(['Pensioner', 'Student', 'Unemployed']), '0', '1')

#######################################################################  has_children
# '1' when the applicant reports at least one child.
df['has_children'] = np.where(df['CNT_CHILDREN'] > 0, '1', '0')

####################################################### clusterise_days_employed
def clusterise_days_employed(x):
    """Map the 'DAYS_EMPLOYED' entry of x to a coarse tenure label.

    x is anything indexable by 'DAYS_EMPLOYED' (the notebook passes DataFrame
    rows). Positive values give 'not available'; otherwise the absolute value
    is bucketed. A value for which every comparison is False (e.g. NaN) also
    gives 'not available'.
    """
    days = x['DAYS_EMPLOYED']
    if days > 0:
        return 'not available'

    elapsed = abs(days)
    # (exclusive upper bound in days, label), checked in ascending order.
    buckets = (
        (30, 'less 1 month'),
        (180, 'less 6 months'),
        (365, 'less 1 year'),
        (1095, 'less 3 years'),
        (1825, 'less 5 years'),
        (3600, 'less 10 years'),
        (7200, 'less 20 years'),
    )
    for upper_bound, label in buckets:
        if elapsed < upper_bound:
            return label
    if elapsed >= 7200:
        return 'more 20 years'
    # Only reached when none of the comparisons above held (e.g. NaN).
    return 'not available'
# Row-wise apply (axis=1) because clusterise_days_employed takes a whole row
# and reads its 'DAYS_EMPLOYED' entry.
df['cluster_days_employed'] = df.apply(clusterise_days_employed, axis=1)

#######################################################################  custom_ext_source_3
def clusterise_ext_source(x):
    """Bucket a score into the labels 'less 0.1' ... 'less 0.9', 'less 1'.

    A value whose string form is 'nan' gives 'not available'. A value above 1
    matches no bucket and the function returns None.
    """
    if str(x) == 'nan':
        return 'not available'

    # (exclusive upper bound, label), checked in ascending order.
    buckets = (
        (0.1, 'less 0.1'),
        (0.2, 'less 0.2'),
        (0.3, 'less 0.3'),
        (0.4, 'less 0.4'),
        (0.5, 'less 0.5'),
        (0.6, 'less 0.6'),
        (0.7, 'less 0.7'),
        (0.8, 'less 0.8'),
        (0.9, 'less 0.9'),
    )
    for upper_bound, label in buckets:
        if x < upper_bound:
            return label
    if x <= 1:
        return 'less 1'
    return None
# One bucketed (categorical) column per external score.
df['clusterise_ext_source_1'] = df['EXT_SOURCE_1'].apply(clusterise_ext_source)
df['clusterise_ext_source_2'] = df['EXT_SOURCE_2'].apply(clusterise_ext_source)
df['clusterise_ext_source_3'] = df['EXT_SOURCE_3'].apply(clusterise_ext_source)

# MANAGE COLUMNS (NUMERICAL VS CATEGORICAL)

In [17]:
"""
numerical_columns = [
  'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'AMT_INCOME_TOTAL',
  'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION',
  'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'DAYS_EMPLOYED', 'DAYS_LAST_PHONE_CHANGE',
  'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
  'TOTAL_AMT_REQ_CREDIT_BUREAU_YEAR']
categorical_columns = [
  'CODE_GENDER', 'CSV_SOURCE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
  'FLAG_EMAIL', 'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'LIVE_CITY_NOT_WORK_CITY',
  'LIVE_REGION_NOT_WORK_REGION', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE',
  'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
  'REGION_RATING_CLIENT_W_CITY', 'has_only_approved', 'has_been_rejected']

  #########

numerical_columns = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH'] 
categorical_columns = [
  'CODE_GENDER', 'CSV_SOURCE', 'FLAG_OWN_CAR', 'NAME_EDUCATION_TYPE',
  'FLAG_OWN_REALTY', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
  'has_only_approved', 'has_been_rejected', 'has_job', 'cluster_days_employed']

"""

# Features fed to the network as continuous inputs.
numerical_columns = [
  'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH']
# Features fed to the network through embeddings. 'CSV_SOURCE' is listed only
# so that it survives the column selection below; it is removed from this list
# before label encoding and is used to split train from test.
categorical_columns = [
  'CODE_GENDER', 'CSV_SOURCE', 'FLAG_OWN_CAR', 'NAME_EDUCATION_TYPE', 'FLAG_OWN_REALTY', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
  'has_only_approved', 'has_been_rejected', 'has_job', 'has_children', 'cluster_days_employed',
  'clusterise_ext_source_1', 'clusterise_ext_source_2', 'clusterise_ext_source_3']


target_column = ['TARGET']
# Keep only the selected columns; every other column of df is discarded here.
df = df[numerical_columns + categorical_columns + target_column]

In [None]:
smart_overview(df)

# MANAGE MISSING VALUES

In [18]:
# Numerical columns: when a column has gaps, first record them in a
# '<column>_isnull' flag column ('1'/'0'), then fill the gaps with the median.
# The filled Series is assigned back instead of calling fillna(inplace=True)
# on df[column]: the in-place form acts on a temporary selection, which pandas
# warns about and which does not update df under copy-on-write.
for numerical_column in numerical_columns:
  if df[numerical_column].isnull().values.any():
    df[numerical_column + '_isnull'] = np.where(df[numerical_column].isnull(), '1', '0')
  df[numerical_column] = df[numerical_column].fillna(df[numerical_column].median())

# Categorical columns: missing values become their own 'NULL' category.
for categorical_column in categorical_columns:
  df[categorical_column] = df[categorical_column].fillna('NULL')

# STANDARDISE

In [19]:
# Min-max scale the numerical columns; the scaler is fitted on the combined
# train + test frame.
min_max_scaler = preprocessing.MinMaxScaler()
# Assign the scaled ndarray directly. Wrapping it in a new DataFrame gives it a
# fresh 0..n-1 index, and the assignment then aligns on index labels, which is
# only correct while df happens to carry exactly that index.
df[numerical_columns] = min_max_scaler.fit_transform(df[numerical_columns])

# CONVERT CATEGORICAL COLUMNS INTO TYPE "CATEGORY"

In [20]:
# 'CSV_SOURCE' only tells train rows from test rows; it is not a model feature.
# Rebuild the list instead of calling .remove(), which raises ValueError when
# this cell is run again after the entry is already gone.
categorical_columns = [column for column in categorical_columns if column != 'CSV_SOURCE']

# Replace each categorical value with an integer label, then mark the column as
# 'category' dtype so the tensor and embedding cells can use `.cat`.
for column in categorical_columns:
  df[column] = LabelEncoder().fit_transform(df[column].astype(str))
  df[column] = df[column].astype('category')

In [None]:
smart_overview(df)

# SPLIT DATA INTO TRAIN vs TEST

In [21]:
# Undo the earlier concat: 'CSV_SOURCE' records which file each row came from.
train_df = df.loc[df['CSV_SOURCE'] == 'application_train.csv']
# Labels as a one-column frame, sharing train_df's index.
train_output_df = train_df[['TARGET']]

test_df = df.loc[df['CSV_SOURCE'] == 'application_test.csv']

# REMOVE NOT USEFUL COLUMNS

In [22]:
# Remove the bookkeeping column and the label from both feature frames.
# Reassigning (instead of inplace=True) and ignoring already-missing columns
# lets this cell be re-run without a KeyError; the redundant axis=0 that sat
# next to columns= is gone.
train_df = train_df.drop(columns=['CSV_SOURCE', 'TARGET'], errors='ignore')
test_df = test_df.drop(columns=['CSV_SOURCE', 'TARGET'], errors='ignore')

# CREATE VALIDATION SET

In [23]:
# Hold out hp_test_size of the labelled rows for validation; the fixed
# random_state makes this split repeatable for a given row order.
x_train, x_validation, y_train, y_validation = train_test_split(train_df, train_output_df, test_size=hp_test_size, random_state=42)

# CREATE TENSORS

In [24]:
def create_tensors(input_df):
  """Convert every column of input_df into one float tensor.

  int64/float64 columns are used as they are (cast to float64); any other
  column must be of 'category' dtype and contributes its integer category
  codes. Columns are stacked along dimension 1, so the result has one row per
  input row and one column per input column.
  """
  def column_values(name):
    column_dtype = input_df.dtypes[name]
    if column_dtype == np.int64 or column_dtype == np.float64:
      return input_df[name].astype(np.float64)
    return input_df[name].cat.codes.values

  per_column = [column_values(name) for name in input_df.columns]
  return torch.tensor(np.stack(per_column, 1), dtype=torch.float)

# Categorical and numerical features are kept in separate tensors because the
# model embeds the former and batch-normalises the latter. All tensors are
# moved to `device` up front.
tensor_x_train_cat = create_tensors(x_train[categorical_columns]).float().to(device)
tensor_x_train_num = create_tensors(x_train[numerical_columns]).float().to(device)
# Labels flattened to 1-D float, the target shape passed to the loss function.
tensor_y_train = torch.tensor(y_train.values).flatten().float().to(device)

tensor_x_valid_cat = create_tensors(x_validation[categorical_columns]).float().to(device)
tensor_x_valid_num = create_tensors(x_validation[numerical_columns]).float().to(device)
tensor_y_valid = torch.tensor(y_validation.values).flatten().float().to(device)

# The test set has no labels, hence no y tensor.
tensor_x_test_cat = create_tensors(test_df[categorical_columns]).float().to(device)
tensor_x_test_num = create_tensors(test_df[numerical_columns]).float().to(device)

# CREATE CATEGORICAL EMBEDDING SIZES

In [25]:
# Number of distinct categories per categorical column (counted on the combined
# train + test frame).
categorical_columns_size = [len(df[column].dtype.categories) for column in categorical_columns]
# (number of categories, embedding width) per column; the width is half the
# cardinality rounded up, capped at 50.
categorical_embedding_sizes = [
  (cardinality, min(50, (cardinality + 1) // 2))
  for cardinality in categorical_columns_size
]

# DEFINE NEURAL NETWORK MODEL

![](https://yashuseth.files.wordpress.com/2018/07/model1.png)


In [26]:
class Model(nn.Module):
  """Feed-forward network over categorical embeddings plus numerical features.

  Each categorical column gets its own nn.Embedding; the embeddings are
  concatenated, passed through dropout and joined with the batch-normalised
  numerical features. The result goes through a stack of
  Linear -> ReLU -> BatchNorm1d -> Dropout blocks and a final Linear layer
  with a single output. No sigmoid is applied here: forward returns the raw
  output of that last Linear layer.

  Args:
    embedding_size: iterable of (num_categories, embedding_dim) pairs, one per
      categorical column, in column order.
    input_size: width of the concatenated input, i.e. the sum of all
      embedding_dim values plus num_numerical_cols.
    num_numerical_cols: number of numerical feature columns.
    layers: widths of the hidden Linear layers; may be empty, in which case
      the output layer is applied directly to the concatenated input.
    ps: dropout probability for each hidden layer; needs at least
      len(layers) entries.
    emb_drop: dropout probability for the concatenated embeddings. Defaults to
      the notebook-level `hp_emb_drop` when omitted, which is what the class
      always used before this parameter existed.
  """
  def __init__(self, embedding_size, input_size, num_numerical_cols, layers, ps, emb_drop=None):
    super().__init__()

    if emb_drop is None:
      # Backward-compatible default: fall back to the global hyperparameter.
      emb_drop = hp_emb_drop

    self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
    self.emb_drop = nn.Dropout(emb_drop)

    self.bn_cont = nn.BatchNorm1d(num_numerical_cols)

    layerlist = []
    for i, elem in enumerate(layers):
      layerlist.append(nn.Linear(input_size, elem))
      layerlist.append(nn.ReLU(inplace=True))
      layerlist.append(nn.BatchNorm1d(elem))
      layerlist.append(nn.Dropout(ps[i]))
      input_size = elem
    # input_size now holds the width feeding the output layer; using it instead
    # of layers[-1] gives the same value and also works for an empty `layers`.
    layerlist.append(nn.Linear(input_size, 1))

    self.layers = nn.Sequential(*layerlist)

  def forward(self, x_c, x_n):
    """Return a (batch, 1) tensor of raw outputs.

    x_c holds one category code per categorical column (cast to long before
    the embedding lookup); x_n holds the numerical features.
    """
    # Column i of x_c is looked up in the i-th embedding table.
    embeddings = [e(x_c[:,i].long()) for i, e in enumerate(self.all_embeddings)]

    x = torch.cat(embeddings, 1)
    x = self.emb_drop(x)

    x_n = self.bn_cont(x_n)

    x = torch.cat([x, x_n], 1)
    x = self.layers(x)

    return x

# INSTANTIATE NEURAL NETWORK MODEL

In [27]:
# Discard any model / loss / optimizer left over from an earlier run before
# the next cell builds new ones. If a name does not exist yet, `del` raises
# NameError, which is expected and ignored.
try: del model, loss_function, optimizer
except NameError: pass

In [28]:
# Number of numerical feature columns (second dimension of the tensor).
num_numerical_cols = tensor_x_train_num.shape[1]

# NOTE: despite its name this is the summed embedding width (sum of the second
# element of every (categories, width) pair), not a count of columns.
num_categorical_cols = sum((nf for ni, nf in categorical_embedding_sizes))
# Width of the concatenated [embeddings, numerical features] vector fed to the
# first Linear layer.
initial_input_size = num_categorical_cols + num_numerical_cols

model = Model(categorical_embedding_sizes, initial_input_size, num_numerical_cols, layers=hp_layers, ps=hp_ps)
# The model returns raw outputs; the train/validation cells pass them through
# this sigmoid before handing them to BCELoss.
sigmoid = nn.Sigmoid()
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=hp_lr)
model.to(device)
# Per-epoch history; the training cell appends one entry per epoch to each.
tot_losses = []
tot_auc = []

# TRAIN NEURAL NETWORK MODEL

In [None]:
train_tensor_dataset = TensorDataset(tensor_x_train_cat, tensor_x_train_num, tensor_y_train)
train_loader = DataLoader(dataset=train_tensor_dataset, batch_size=hr_batch_size, shuffle=True)

start_training = generate_timestamp()

model.train()

for epoch in range(hp_epochs):
  train_losses = []
  # Targets and raw model outputs of every batch, collected for the epoch AUC.
  epoch_targets = []
  epoch_outputs = []
  for x_cat, x_num, y in train_loader:
    # Clear the gradients of the previous step. Without this call they keep
    # accumulating, so every update would use the sum of all gradients seen so
    # far instead of the current batch's gradient.
    optimizer.zero_grad()

    # squeeze(1) drops only the trailing output dimension, so a batch holding
    # a single row still matches the 1-D target.
    batch_outputs = model(x_cat, x_num).squeeze(1)
    single_loss = loss_function(sigmoid(batch_outputs), y)
    single_loss.backward()
    optimizer.step()

    train_losses.append(single_loss.item())
    epoch_targets.append(y)
    # Detach so the stored outputs do not keep each batch's autograd graph
    # alive until the end of the epoch.
    epoch_outputs.append(batch_outputs.detach())
  epoch_loss = 1.0 * sum(train_losses) / len(train_losses)
  tot_losses.append(epoch_loss)
  # AUC depends only on the ranking of the scores, so the raw outputs are used
  # directly.
  epoch_auc = roc_auc_score(torch.cat(epoch_targets).cpu().numpy(), torch.cat(epoch_outputs).cpu().numpy())
  tot_auc.append(epoch_auc)
  print("epoch: " + str(epoch) + "\tloss: " + str(epoch_loss) + "\tauc: " + str(epoch_auc))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
axes[0].plot(tot_losses)
axes[0].set_title('train loss per epoch')
axes[1].plot(tot_auc)
axes[1].set_title('train AUC per epoch')
fig.tight_layout()

# Recorded for the run log written by the sheet-logging cell.
last_train_loss = epoch_loss
end_training = generate_timestamp()

# VALIDATE NEURAL NETWORK

In [None]:
validation_tensor_dataset = TensorDataset(tensor_x_valid_cat, tensor_x_valid_num, tensor_y_valid)
# No shuffling for evaluation: with a fixed order the batches (and therefore
# the mean of the per-batch losses reported below) are the same on every run.
validation_loader = DataLoader(dataset=validation_tensor_dataset, batch_size=hr_batch_size, shuffle=False)

valid_losses = []
valid_targets = []
valid_outputs = []

# Inference mode for dropout and batch norm.
model.eval()

with torch.no_grad():
  for x_cat, x_num, y in validation_loader:
    # squeeze(1) drops only the trailing output dimension, so a batch holding
    # a single row still matches the 1-D target.
    batch_outputs = model(x_cat, x_num).squeeze(1)
    validation_loss = loss_function(sigmoid(batch_outputs), y)
    valid_losses.append(validation_loss.item())

    valid_targets.append(y)
    valid_outputs.append(batch_outputs)

# Mean of the per-batch losses, rounded for the run log.
valid_loss = round(1.0 * sum(valid_losses) / len(valid_losses), 5)
print("loss: " + str(valid_loss))
# roc_auc_score(y_true, y_score): targets first, model outputs second.
valid_auc = roc_auc_score(torch.cat(valid_targets).cpu().numpy(), torch.cat(valid_outputs).cpu().numpy())
print("auc: " + str(valid_auc))

# MAKE PREDICTIONS

In [32]:
# Put dropout and batch norm in inference mode here as well, so the predictions
# do not depend on the validation cell (which calls model.eval()) having run.
model.eval()
with torch.no_grad():
  # Raw model outputs for the whole test set in one forward pass.
  y_test = model(tensor_x_test_cat, tensor_x_test_num)

# VERIFY PREDICTION DISTRIBUTION

In [None]:
# Move the raw model outputs to host memory first, so pandas/scikit-learn get a
# plain NumPy array whatever device the model ran on.
test_outputs = y_test.detach().cpu().numpy().astype("float").reshape(-1, 1)
# Rescale the outputs to [0, 1] with a scaler of their own. Re-fitting the
# shared `min_max_scaler` here would overwrite the parameters it learned from
# the feature columns.
prediction_scaler = preprocessing.MinMaxScaler()
x_scaled = prediction_scaler.fit_transform(test_outputs)
# Test rows kept their original order through preprocessing, so ids and
# predictions are paired positionally.
nn_prediction_df = pd.DataFrame({
  'SK_ID_CURR': application_test_df['SK_ID_CURR'].values,
  'TARGET': x_scaled.ravel(),
})
nn_prediction_df['TARGET'].hist(bins=1000)

In [None]:
# Round the predictions to one decimal, then summarise them: the overall mean
# (as a string, for the run log) and the number of rows per rounded value.
temp_prediction_df = nn_prediction_df.assign(TARGET=nn_prediction_df['TARGET'].round(1))
test_target_mean = str(round(temp_prediction_df['TARGET'].mean(), 3))
test_distrbution = temp_prediction_df.groupby(by=['TARGET'])['TARGET'].count()
print("test_target_mean:", test_target_mean)
print(test_distrbution)

# SAVE PREDICTIONS TO CSV

In [35]:
# Random 8-character run id (uppercase letters and digits); it names the CSV
# file and identifies the run in the tracking sheet.
id_alphabet = string.ascii_uppercase + string.digits
nn_prediction_id = ''.join(random.choice(id_alphabet) for _ in range(8))
submission_path = '/drive/My Drive/Notebooks/kaggle/HomeCreditDefaultRisk/submissions/' + nn_prediction_id + '.csv'
nn_prediction_df.to_csv(submission_path, index=False)

# SAVE DATA TO SHEET



In [None]:
# One spreadsheet row per run. Insertion order matters: the values are written
# left to right into columns A..S (19 entries for 19 columns).
model_values_dict = {
  'ID': nn_prediction_id,
  'start_training': start_training,
  'end_training': end_training,
  'perc_test_size': hp_test_size,
  'emb_drop': hp_emb_drop,
  'layers': '\n'.join(map(str, hp_layers)),
  'ps': '\n'.join(map(str, hp_ps)),
  'lr': hp_lr,
  'epochs': hp_epochs,
  'batch_size': hr_batch_size,
  'train_losses': '\n'.join(str(round(loss, 5)) for loss in tot_losses),
  'last_train_loss': last_train_loss,
  'valid_loss': valid_loss,
  # Relative gap between validation loss and the last epoch's training loss.
  'Δloss%': str(round((valid_loss / epoch_loss - 1) * 100, 3)) + '%',
  'test_target_mean': test_target_mean,
  'test_distrbution': test_distrbution.to_string(header=False),
  'numerical_columns': len(numerical_columns),
  'categorical_columns': len(categorical_columns),
  'model_parameters': str(model.parameters)
}

# Fill the row selected by get_next_row, then push all cells in one request.
next_row = get_next_row(sheet)
cells = sheet.range('A{0}:S{0}'.format(next_row))
for cell, value in zip(cells, model_values_dict.values()):
  cell.value = value
sheet.update_cells(cells)

# XGBOOST

In [None]:
"""
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier

ratio = (train_output_df == 0).sum() / (train_output_df == 1).sum()[0]
"""

In [None]:
"""
lbl = preprocessing.LabelEncoder()
train_df['CODE_GENDER'] = lbl.fit_transform(train_df['CODE_GENDER'].astype(str))
train_df['FLAG_OWN_CAR'] = lbl.fit_transform(train_df['FLAG_OWN_CAR'].astype(str))
train_df['NAME_EDUCATION_TYPE'] = lbl.fit_transform(train_df['NAME_EDUCATION_TYPE'].astype(str))
train_df['FLAG_OWN_REALTY'] = lbl.fit_transform(train_df['FLAG_OWN_REALTY'].astype(str))
train_df['OCCUPATION_TYPE'] = lbl.fit_transform(train_df['OCCUPATION_TYPE'].astype(str))
train_df['ORGANIZATION_TYPE'] = lbl.fit_transform(train_df['ORGANIZATION_TYPE'].astype(str))
train_df['has_only_approved'] = lbl.fit_transform(train_df['has_only_approved'].astype(str))
train_df['has_been_rejected'] = lbl.fit_transform(train_df['has_been_rejected'].astype(str))
train_df['has_job'] = lbl.fit_transform(train_df['has_job'].astype(str))
train_df['cluster_days_employed'] = lbl.fit_transform(train_df['cluster_days_employed'].astype(str))
"""

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(train_df, train_output_df, test_size=0.2, stratify=train_output_df, random_state=1)

In [None]:
"""
clf = XGBClassifier(n_estimators=1000, objective='binary:logistic', gamma=0.1, subsample=0.5, scale_pos_weight=ratio[0])
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='auc', early_stopping_rounds=10)
"""

In [None]:
# Disabled: the code below is a string literal, so none of it runs.
# NOTE(review): if re-enabled, every fit_transform call re-fits the encoder on
# the test frame, so the integer codes are not guaranteed to match the ones
# assigned to the same values in train_df.
"""
test_df['CODE_GENDER'] = lbl.fit_transform(test_df['CODE_GENDER'].astype(str))
test_df['FLAG_OWN_CAR'] = lbl.fit_transform(test_df['FLAG_OWN_CAR'].astype(str))
test_df['NAME_EDUCATION_TYPE'] = lbl.fit_transform(test_df['NAME_EDUCATION_TYPE'].astype(str))
test_df['FLAG_OWN_REALTY'] = lbl.fit_transform(test_df['FLAG_OWN_REALTY'].astype(str))
test_df['OCCUPATION_TYPE'] = lbl.fit_transform(test_df['OCCUPATION_TYPE'].astype(str))
test_df['ORGANIZATION_TYPE'] = lbl.fit_transform(test_df['ORGANIZATION_TYPE'].astype(str))
test_df['has_only_approved'] = lbl.fit_transform(test_df['has_only_approved'].astype(str))
test_df['has_been_rejected'] = lbl.fit_transform(test_df['has_been_rejected'].astype(str))
test_df['has_job'] = lbl.fit_transform(test_df['has_job'].astype(str))
test_df['cluster_days_employed'] = lbl.fit_transform(test_df['cluster_days_employed'].astype(str))
"""

In [None]:
"""
xgboost_prediction_df = clf.predict_proba(test_df)[:, 1]
xgboost_prediction_df = pd.DataFrame({'SK_ID_CURR': application_test_df['SK_ID_CURR'].values, 'TARGET': xgboost_prediction_df})
"""

# ENSEMBLE

In [None]:
# Disabled: the code below is a string literal, so none of it runs.
# NOTE(review): as written it would merge with `nn_prediction_id` (the run-id
# string) rather than `nn_prediction_df`, and `submission_df` is not defined in
# any visible cell. Fix both before re-enabling.
"""
ensemble_df = pd.merge(submission_df, nn_prediction_id, on='SK_ID_CURR', how='left')
ensemble_df['TARGET'] = (ensemble_df['TARGET_x'] + ensemble_df['TARGET_y']) / 2
ensemble_df = ensemble_df[['SK_ID_CURR', 'TARGET']]
"""

# DOWNLOAD RESULTS

In [None]:
# namestr(df, globals())[0]
# nn_prediction_df
# xgboost_prediction_df
# ensemble_df
# ensemble_df.to_csv('submission.csv', index=False)
# files.download('submission.csv')