In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('train_dataset.csv')
df_test = pd.read_csv('public_dataset_without_gt.csv')

# Data Preprocessing

In [3]:
# Merge Dataset
df = pd.concat([df_train, df_test])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67854 entries, 0 to 11536
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   no                             67854 non-null  int64  
 1   APP_date                       67854 non-null  object 
 2   APP_Area                       67854 non-null  object 
 3   APP_Province                   67854 non-null  object 
 4   APP_Shop Name                  67854 non-null  object 
 5   gender                         67854 non-null  object 
 6   date_of_birth_week             67854 non-null  int64  
 7   date_of_birth                  67854 non-null  object 
 8   marital_status                 67854 non-null  int64  
 9   number_of_children             67854 non-null  int64  
 10  postal_code                    67853 non-null  float64
 11  tel_category                   67854 non-null  int64  
 12  number_of_resident             67854 non-null  int6

## Identify

In [4]:
label = 'default_12month' # target to predict

In [5]:
categoricals = ["APP_Area", "APP_Province", "gender", "marital_status",
                "tel_category", "type_of_residence",
                "c_business_type", 'c_position', 'c_occupation','c_employment_status',
                'c_salary_payment_methods', 'media', 'place_for_sending_information',
                'r_propose', 'r_generalcode3', 'apply']

numericals = ["number_of_children", "number_of_resident", "living_period_year",
             "c_number_of_employee", 'c_monthly_salary', 'c_number_of_working_year',
             'r_expected_credit_limit', 'r_allloan_case', 'r_allloan_amount', 'r_additional_income', 'r_spouse_income', 'FICO_Score']

date = ["date_of_birth", "APP_date"]

# Feature engineering

## Transform

### Date

In [6]:
df['APP_date'] = pd.to_datetime(df['APP_date'])
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
current_date = pd.to_datetime('today')

df['applicant_age'] = (current_date - df['date_of_birth']).dt.days
df['application_age'] = (current_date - df['APP_date']).dt.days

df = df.drop(['APP_date'], axis=1)
df = df.drop(['date_of_birth'], axis=1)

### FICO Score
https://www.linkedin.com/pulse/understanding-fico-score-comprehensive-guide-financemagnates/

In [7]:
df['fico_score_category'] = pd.cut(df['FICO_Score'], 
                                   bins=[0, 300, 579, 669, 739, 799, 850], 
                                   labels=['No Credit Info', 'Poor', 'Fair', 'Good', 'Very Good', 'Exceptional'])

fico_mapping = {'No Credit Info': 0, 'Poor': 1, 'Fair': 2, 'Good': 3, 'Very Good': 4, 'Exceptional': 5}
df['fico_score_category_numerical'] = df['fico_score_category'].map(fico_mapping)

df = df.drop(['FICO_Score'], axis=1)

## New

- credit_utilization \
ref. https://www.investopedia.com/terms/c/credit-utilization-rate.asp#toc-how-credit-utilization-impacts-borrowers

In [8]:
# numericals
df['employment_duration'] = df['c_number_of_working_year'] + df['c_number_of_working_month'] / 12
df['residence_duration'] = df['living_period_year'] + df['living_period_month'] / 12
df['income_per_person'] = df['c_monthly_salary'] / (df['number_of_resident'] + 1)
df['total_income'] = df['c_monthly_salary'] + df['r_additional_income'] + df['r_spouse_income']
df['credit_utilization'] = df['r_expected_credit_limit'] / df['total_income']

# df['total_loan_amount'] = df.groupby('no')['r_allloan_amount'].transform('sum')
# df['total_loan_count'] = df.groupby('no')['r_allloan_case'].transform('sum')

# categoricals
df['applicant_age_group'] = pd.cut(df['applicant_age'], 
                                   bins=[0, 25, 45, 65, 100], 
                                   labels=['young', 'middle-aged', 'senior', 'old'])
# df['salary_class'] = pd.cut(df['c_monthly_salary'], 
#                               bins=[0, 50000, 100000, 150000, 200000, np.inf], 
#                               labels=['<50k', '50k-100k', '100k-150k', '150k-200k', '200k+'])
categoricals += ['applicant_age_group']
# categoricals += ['applicant_age_group', 'salary_class']

## Drop

In [9]:
drop_cols = ['no', 'r_generalcode1', 'r_generalcode2',
             'date_of_birth_week', 'c_date_of_salary_payment',
             'postal_code', 'c_postal_code', 'APP_Shop Name',
             "living_period_month", "c_number_of_working_month"]
# drop_cols = ["no", "date_of_birth_week", 
#              "c_date_of_salary_payment", "c_number_of_working_month", "APP_Shop Name", 
#              "living_period_month"]
df = df.drop(drop_cols, axis=1)

## Encoding

In [10]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=categoricals)

# Modeling

In [11]:
# Data Splitting
len_train = len(df_train)

df_train = df[:len_train]
df_test = df[len_train:]

In [12]:
time_limit = 60*60*2

predictor = TabularPredictor(label=label, eval_metric='roc_auc')
predictor.fit(df_train,
              time_limit=time_limit,
              presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20231214_204952"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 7200 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20231214_204952/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20231214_204952/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.13
Operating System:  

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1267cfe80>

In [13]:
predictor.model_best

'WeightedEnsemble_L3'

# Evaluation

In [14]:
predictor.leaderboard(df_train)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini_BAG_L1,1.0,0.606476,roc_auc,0.762546,2.224701,5.113862,0.762546,2.224701,5.113862,1,True,8
1,RandomForestEntr_BAG_L1,1.0,0.622826,roc_auc,0.791641,2.572822,7.591283,0.791641,2.572822,7.591283,1,True,6
2,KNeighborsDist_BAG_L1,1.0,0.517257,roc_auc,1.448216,1.461036,0.02207,1.448216,1.461036,0.02207,1,True,2
3,ExtraTreesEntr_BAG_L1,1.0,0.605404,roc_auc,0.744335,2.303722,5.2678,0.744335,2.303722,5.2678,1,True,9
4,RandomForestGini_BAG_L1,1.0,0.623469,roc_auc,2.816296,2.551339,7.013403,2.816296,2.551339,7.013403,1,True,5
5,RandomForestGini_BAG_L2,0.999825,0.63224,roc_auc,39.831403,16.149542,229.211847,2.613971,2.535881,9.301091,2,True,17
6,ExtraTreesGini_BAG_L2,0.999796,0.634035,roc_auc,37.951101,16.113504,225.97542,0.733669,2.499843,6.064664,2,True,20
7,ExtraTreesEntr_BAG_L2,0.999367,0.636668,roc_auc,37.938596,16.134504,226.38022,0.721164,2.520843,6.469464,2,True,21
8,RandomForestEntr_BAG_L2,0.997379,0.639312,roc_auc,37.773215,16.109741,229.163184,0.555783,2.49608,9.252428,2,True,18
9,WeightedEnsemble_L2,0.99419,0.659141,roc_auc,22.672148,5.40761,167.332371,0.003563,0.00583,6.8805,2,True,14


# Submission

In [15]:
df_test_nolabel = df_test.drop(label, axis=1)
public_dataset = pd.read_csv('public_dataset_without_gt.csv') #for no_column (index)

In [16]:
# ...!brk # break run

## Single Export

In [23]:
# model_name = 'CatBoost_BAG_L1'

# y_pred = predictor.predict_proba(df_test_nolabel, model=model_name)

# result_df = pd.DataFrame({'no': public_dataset['no'], 'default_12month': y_pred[1]})

# output_name = 'output_' + str(time_limit//60)+'min_'+ model_name + '.csv'
# # Export
# result_df.to_csv(output_name, index=False, header=['no', 'default_12month'])

## Multi Export

In [20]:
# models_name = ['WeightedEnsemble_L3', 'WeightedEnsemble_L2',
#                'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2',
#                'CatBoost_BAG_L2', 'CatBoost_r177_BAG_L1', 'CatBoost_BAG_L1'
#                'XGBoost_BAG_L2', 'XGBoost_BAG_L1']

# for model_name in models_name:
    
#     y_pred = predictor.predict_proba(df_test_nolabel, model=model_name)
    
#     result_df = pd.DataFrame({'no': public_dataset['no'], 'default_12month': y_pred[1]})
    
#     output_name = 'output_' + str(time_limit//60)+'min_'+ model_name + '.csv'
#     # Export
#     result_df.to_csv(output_name, index=False, header=['no', 'default_12month'])

NetworkXError: The node XGBoost_BAG_L2 is not in the digraph.

# Feature Importance

In [24]:
feaImp = predictor.feature_importance(df_test, 
                             model='CatBoost_BAG_L1', 
                             time_limit=60*1)

These features in provided data are not utilized by the predictor and will be ignored: ['fico_score_category_numerical', 'total_loan_amount', 'total_loan_count', 'APP_Province_Thaicredit', 'APP_Province_webpak', 'c_business_type_17', 'c_occupation_12', 'c_employment_status_3', 'c_employment_status_4', 'c_salary_payment_methods_3', 'media_10', 'r_generalcode3_0.0', 'r_generalcode3_8.0', 'c_number_of_working_month_12', 'living_period_month_12', 'applicant_age_group_young', 'applicant_age_group_middle-aged', 'applicant_age_group_senior', 'applicant_age_group_old', 'salary_bracket_150k-200k']
Computing feature importance via permutation shuffling for 187 features using 0 rows with 10 shuffle sets... Time limit: 60s...
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.