In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('train_dataset.csv')
df_test = pd.read_csv('public_dataset_without_gt.csv')

# Data Preprocessing

In [3]:
# Merge Dataset
df = pd.concat([df_train, df_test])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67854 entries, 0 to 11536
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   no                             67854 non-null  int64  
 1   APP_date                       67854 non-null  object 
 2   APP_Area                       67854 non-null  object 
 3   APP_Province                   67854 non-null  object 
 4   APP_Shop Name                  67854 non-null  object 
 5   gender                         67854 non-null  object 
 6   date_of_birth_week             67854 non-null  int64  
 7   date_of_birth                  67854 non-null  object 
 8   marital_status                 67854 non-null  int64  
 9   number_of_children             67854 non-null  int64  
 10  postal_code                    67853 non-null  float64
 11  tel_category                   67854 non-null  int64  
 12  number_of_resident             67854 non-null  int6

## Identify

In [4]:
label = 'default_12month' # target to predict

In [5]:
categoricals = ["APP_Area", "APP_Province", "gender", "marital_status",
                "type_of_residence",
                "c_business_type", 'c_position', 'c_occupation','c_employment_status',
                'c_salary_payment_methods', 'media', 'place_for_sending_information',
                'r_propose', 'r_generalcode3', 'apply']

numericals = ["number_of_children", "number_of_resident", "living_period_year",
             "c_number_of_employee", 'c_monthly_salary', 'c_number_of_working_year',
             'r_expected_credit_limit', 'r_allloan_case', 'r_allloan_amount', 'r_additional_income', 'r_spouse_income', 'FICO_Score']

date = ["date_of_birth", "APP_date"]

# Feature engineering

## Transform

### Date

In [6]:
df['APP_date'] = pd.to_datetime(df['APP_date'])
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
current_date = pd.to_datetime('today')

df['applicant_age'] = (current_date - df['date_of_birth']).dt.days
df['application_age'] = (current_date - df['APP_date']).dt.days

df = df.drop(['APP_date'], axis=1)
df = df.drop(['date_of_birth'], axis=1)

### FICO Score
https://www.linkedin.com/pulse/understanding-fico-score-comprehensive-guide-financemagnates/

In [7]:
df['fico_score_category'] = pd.cut(df['FICO_Score'], 
                                   bins=[0, 300, 579, 669, 739, 799, 850], 
                                   labels=['No Credit Info', 'Poor', 'Fair', 'Good', 'Very Good', 'Exceptional'])

fico_mapping = {'No Credit Info': 0, 'Poor': 1, 'Fair': 2, 'Good': 3, 'Very Good': 4, 'Exceptional': 5}
df['fico_score_category_numerical'] = df['fico_score_category'].map(fico_mapping)

df = df.drop(['FICO_Score'], axis=1)

## New

- credit_utilization \
ref. https://www.investopedia.com/terms/c/credit-utilization-rate.asp#toc-how-credit-utilization-impacts-borrowers

In [8]:
# numericals
df['employment_duration'] = df['c_number_of_working_year'] + df['c_number_of_working_month'] / 12
df['residence_duration'] = df['living_period_year'] + df['living_period_month'] / 12
df['income_per_person'] = df['c_monthly_salary'] / (df['number_of_resident'] + 1)
df['total_income'] = df['c_monthly_salary'] + df['r_additional_income'] + df['r_spouse_income']
df['credit_utilization'] = df['r_expected_credit_limit'] / df['total_income']
df['income_to_creditlimit'] = df['c_monthly_salary'] / df['r_expected_credit_limit']

# categoricals
df['applicant_age_group'] = pd.cut(df['applicant_age'], 
                                   bins=[0, 25, 45, 65, 100], 
                                   labels=['young', 'middle-aged', 'senior', 'old'])

categoricals += ['applicant_age_group']

In [9]:
df = df.drop(['c_monthly_salary','r_additional_income','r_spouse_income'], axis=1)

## Drop

In [10]:
drop_cols = ['no', 'r_generalcode1', 'r_generalcode2',
             'date_of_birth_week', 'c_date_of_salary_payment',
             'postal_code', 'c_postal_code', 'APP_Shop Name',
             "c_number_of_working_month", "living_period_month","tel_category"]
df = df.drop(drop_cols, axis=1)

## Encoding

In [11]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=categoricals)

# Modeling

In [12]:
# Data Splitting
len_train = len(df_train)

df_train = df[:len_train]
df_test = df[len_train:]

In [15]:
time_limit = 60*15

predictor = TabularPredictor(label=label, eval_metric='roc_auc')
predictor.fit(df_train,
              time_limit=time_limit,
              presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20231215_043118"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 900 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20231215_043118/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 225s
AutoGluon will save models to "AutogluonModels/ag-20231215_043118/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.13
Operating System:   D

[1000]	valid_set's binary_logloss: 0.369483
[1000]	valid_set's binary_logloss: 0.371527
[1000]	valid_set's binary_logloss: 0.367541
[1000]	valid_set's binary_logloss: 0.368909
[1000]	valid_set's binary_logloss: 0.370224
[1000]	valid_set's binary_logloss: 0.367149
[1000]	valid_set's binary_logloss: 0.368841


	0.6526	 = Validation score   (roc_auc)
	43.72s	 = Training   runtime
	0.66s	 = Validation runtime
Fitting model: NeuralNetTorch_r22_BAG_L1 ... Training model for up to 28.33s of the 249.47s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
  adjusted = values - mean
		Input X contains infinity or a value too large for dtype('float64').
Detailed Traceback:
Traceback (most recent call last):
  File "/Users/pupipatsingkhorn/miniconda3/envs/autogluon/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1817, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/Users/pupipatsingkhorn/miniconda3/envs/autogluon/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1763, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
  Fil

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x17dfd84f0>

In [None]:
predictor.model_best

'WeightedEnsemble_L2'

# Evaluation

In [None]:
predictor.leaderboard(df_train)

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr_BAG_L1,1.0,0.599117,roc_auc,0.51258,1.244007,2.790808,0.51258,1.244007,2.790808,1,True,9
1,KNeighborsDist_BAG_L1,1.0,0.522642,roc_auc,0.538638,0.673441,0.058111,0.538638,0.673441,0.058111,1,True,2
2,RandomForestEntr_BAG_L1,1.0,0.610846,roc_auc,0.595241,1.676784,4.720774,0.595241,1.676784,4.720774,1,True,6
3,RandomForestGini_BAG_L1,1.0,0.611409,roc_auc,0.71067,1.636994,4.951693,0.71067,1.636994,4.951693,1,True,5
4,ExtraTreesGini_BAG_L1,1.0,0.598495,roc_auc,0.778111,1.449417,3.013999,0.778111,1.449417,3.013999,1,True,8
5,WeightedEnsemble_L2,0.986121,0.658284,roc_auc,17.445586,6.602455,869.389773,0.005929,0.005234,7.131934,2,True,14
6,LightGBMLarge_BAG_L1,0.8911,0.645725,roc_auc,0.901451,0.146131,15.289297,0.901451,0.146131,15.289297,1,True,13
7,KNeighborsUnif_BAG_L1,0.846093,0.521566,roc_auc,0.543591,0.635107,0.053948,0.543591,0.635107,0.053948,1,True,1
8,XGBoost_BAG_L1,0.793673,0.649393,roc_auc,1.610055,0.31556,28.119481,1.610055,0.31556,28.119481,1,True,11
9,LightGBM_BAG_L1,0.770672,0.648829,roc_auc,0.866965,0.118988,8.266713,0.866965,0.118988,8.266713,1,True,4


# Submission

In [16]:
df_test_nolabel = df_test.drop(label, axis=1)
public_dataset = pd.read_csv('public_dataset_without_gt.csv') #for no_column (index)

In [None]:
# ...!brk # break run

## Single Export

In [None]:
# model_name = 'CatBoost_BAG_L1'

# y_pred = predictor.predict_proba(df_test_nolabel, model=model_name)

# result_df = pd.DataFrame({'no': public_dataset['no'], 'default_12month': y_pred[1]})

# output_name = 'output_' + str(time_limit//60)+'min_'+ model_name + '.csv'
# # Export
# result_df.to_csv(output_name, index=False, header=['no', 'default_12month'])

## Multi Export

In [17]:
# models_name = ['WeightedEnsemble_L3', 'WeightedEnsemble_L2',
#                'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2',
#                'CatBoost_BAG_L2', 'CatBoost_r177_BAG_L1', 'CatBoost_BAG_L1'
#                'XGBoost_BAG_L2', 'XGBoost_BAG_L1']
models_name = predictor.model_names()

for model_name in models_name:
    
    y_pred = predictor.predict_proba(df_test_nolabel, model=model_name)
    
    result_df = pd.DataFrame({'no': public_dataset['no'], 'default_12month': y_pred[1]})
    
    output_name = 'output_' + str(time_limit//60)+'min_'+ model_name + '.csv'
    # Export
    result_df.to_csv(output_name, index=False, header=['no', 'default_12month'])

# Feature Importance

In [None]:
# feaImp = predictor.feature_importance(df_test, 
#                              model='CatBoost_BAG_L1', 
#                              time_limit=60*1)