In [1]:
import os
import shutil

while not os.path.isfile("README.md"):
    %cd ..

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import compute_sample_weight
import lightgbm as lgb
from lib.preprocess import get_data

pd.set_option('display.max_colwidth', None)

/home/nakagawa/fdua_finance_competition


In [2]:
FILE_NAME = "lgbm_baseline"
OFFICIAL_DATA_DIR = "data/official"

In [3]:
X_train, y_train, X_test = get_data(OFFICIAL_DATA_DIR)

In [5]:
X_train.head(5)

Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural
0,163,21,1.0,0,0,1,N,N,31-Jan-98,0,22-Sep-06,2006,PHOENIX,AZ,SD,"$80,000.00","$80,000.00","$68,000.00",0
1,84,6,1.0,4,0,0,0,N,31-Oct-93,62,30-Jun-92,1992,MCALESTER,OK,OK,"$287,000.00","$287,000.00","$229,600.00",0
2,242,45,1.0,4,90,0,N,N,31-Aug-01,42,18-Apr-01,2001,HAWTHORNE,NJ,NJ,"$31,983.00","$30,000.00","$15,000.00",1
3,237,4,1.0,0,0,0,N,N,31-Aug-07,33,6-Oct-03,2004,NASHVILLE,TN,SD,"$229,000.00","$229,000.00","$229,000.00",0
4,184,0,1.0,0,0,0,N,N,8-Jun-83,0,17-Dec-99,2000,POMONA,CA,CA,"$525,000.00","$525,000.00","$393,750.00",0


In [6]:
object_columns = X_train.select_dtypes(include=["object"]).columns # get all categorical feature columns
date_columns = ["ApprovalDate", "DisbursementDate"]
money_columns = ["DisbursementGross", "GrAppv", "SBA_Appv"]
categorical_columns = ['RevLineCr', 'LowDoc', 'City', 'State', 'BankState']

In [7]:
day_map = lambda s: float(s)
month_map = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12
}
year_map = lambda s: float("20"+s) if float(s) < 50 else float("19"+s) if float(s)>50 else s
datemaps_dict = {
    "day": day_map,
    "month": month_map,
    "year": year_map,
}

In [8]:
for datecol in date_columns:
    for i, (key, mapfunc) in enumerate(datemaps_dict.items()):
        X_train[datecol + "_" + key] = X_train[datecol].str.split("-").str[i].map(mapfunc)
        X_train[datecol + "_" + key] = X_train[datecol + "_" + key].astype(float)
        X_test[datecol + "_" + key] = X_test[datecol].str.split("-").str[i].map(mapfunc)
        X_test[datecol + "_" + key] = X_test[datecol + "_" + key].astype(float)

In [9]:
X_train[money_columns]= X_train[money_columns].applymap(lambda x: x.strip().replace('$', '').replace(',', '')).astype(float)
X_test[money_columns]= X_test[money_columns].applymap(lambda x: x.strip().replace('$', '').replace(',', '')).astype(float)

In [10]:
y_train.value_counts(normalize=True)

MIS_Status
1    0.892689
0    0.107311
Name: proportion, dtype: float64

In [11]:
def mean_f1score(preds:np.ndarray,eval_data: lgb.Dataset):
    y_true = eval_data.get_label()
    weight = eval_data.get_weight()
    preds = preds.reshape(len(np.unique(y_true)), -1)
    preds = preds.argmax(axis = 0)
    f1 = f1_score(y_true,preds,average='macro',sample_weight=weight)
    return 'f1',f1,True

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {
    'objective': 'multiclass',
    'metric': None,
    'num_class': 2,
    "verbosity": -1,
    'seed': 42,
    "boosting_type": "gbdt",
}

In [13]:
best_score = 0 
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    # split
    print(f"Fold {fold + 1}")
    X_trn = X_train.iloc[trn_idx].copy()
    y_trn = y_train.iloc[trn_idx].copy()
    X_val = X_train.iloc[val_idx].copy()
    y_val = y_train.iloc[val_idx].copy()

    # preprocess
    X_trn[categorical_columns] = X_trn[categorical_columns].fillna("NaN")
    X_val[categorical_columns] = X_val[categorical_columns].fillna("NaN")
    oe_dict = dict()
    for col in categorical_columns:
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_trn.loc[:,col] = oe.fit_transform(X_trn.loc[:,col].values.reshape(-1, 1))
        X_val.loc[:,col] = oe.transform(X_val.loc[:,col].values.reshape(-1, 1))
        X_trn[col] = X_trn[col].astype(float)
        X_val[col] = X_val[col].astype(float)
        oe_dict[col] = oe
    X_trn = X_trn.drop(date_columns, axis=1)
    X_val = X_val.drop(date_columns, axis=1)

    # train
    train_set = lgb.Dataset(X_trn, y_trn)# ,weight=compute_sample_weight(class_weight='balanced',y=y_trn))
    val_set = lgb.Dataset(X_val, y_val, reference=train_set)
    model = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        valid_sets=[train_set, val_set],
        feval=mean_f1score,
    )

    # inference
    preds = model.predict(X_val)
    preds = preds.reshape(len(np.unique(y_val)), -1)
    preds = preds.argmax(axis = 0)
    print("0, 1: ",sum(preds==0), sum(preds==1))
    score = f1_score(y_val, preds, average='macro')
    if score > best_score:
        best_score = score
        best_oe = oe_dict
        best_model = model
    print(f"Score: {score}")
    print("-" * 100)

Fold 1
0, 1:  4231 4231
Score: 0.404923970637904
----------------------------------------------------------------------------------------------------
Fold 2
0, 1:  4231 4231
Score: 0.40827729881463337
----------------------------------------------------------------------------------------------------
Fold 3
0, 1:  4238 4223
Score: 0.40995707276863413
----------------------------------------------------------------------------------------------------
Fold 4
0, 1:  4248 4213
Score: 0.412832460811002
----------------------------------------------------------------------------------------------------
Fold 5
0, 1:  4224 4237
Score: 0.4101686965386072
----------------------------------------------------------------------------------------------------


In [14]:
# submission
X_test[categorical_columns] = X_test[categorical_columns].fillna("NaN")
for col in categorical_columns:
    oe = best_oe[col]
    X_test[col] = oe.transform(X_test[col].values.reshape(-1, 1))
    X_test[col] = X_test[col].astype(float)
X_test = X_test.drop(date_columns, axis=1)
preds = best_model.predict(X_test)
preds = preds.reshape(len(np.unique(y_train)), -1)
preds = preds.argmax(axis = 0)
preds = pd.DataFrame(preds, columns=["pred"], index=X_test.index)

In [15]:
preds["pred"].value_counts(normalize=True)

pred
0    0.5
1    0.5
Name: proportion, dtype: float64

In [18]:
result_dir = f"results/{FILE_NAME}"
if os.path.exists(result_dir):
    idx = 1
    result_dir += f"_v{idx}"
    while os.path.exists(result_dir):
        idx += 1
        result_dir = result_dir.split('_v')[0]
        result_dir += f"_v{idx}"
        
os.makedirs(result_dir, exist_ok=False)
shutil.copytree("lib", os.path.join(result_dir, "lib"))
shutil.copyfile(f"notebooks/{FILE_NAME}.ipynb", os.path.join(result_dir, f"{FILE_NAME}.ipynb"))
preds.to_csv(os.path.join(result_dir, f"submission.csv"), index=True, header=False)