# First Model

In this notebook, we create a simple model using LightGBM. The features included in this model are:
- all float (or int but not category) variables as it is:
    - `RevLineCr', 'NoEmp', 'CreateJob', `RetainedJob`, `ApprovalFY`. `DisbursementGross`, `GrAppv`, `SBA_Appv`
- some categorical variables as it is:
    - `NewExist`, `RevLineCr`, `LowDoc`, `UrbanRural`
- Some date objects as daystamp:
    - `DisbursementDate_daystamp`, `ApprovalDate_daystamp`
- Some categorical varibles with coarse labeling:
    - `FranchiseCode`(0,1,or others)
- Some categorical variables with holdout target encoding:
    - `Sector`, `State`, `BankState`

Note that `City` is not used in this model

In [152]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import  KFold
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import lightgbm as lgb

In [153]:
# read data
data = pd.read_csv(f"/Users/yuke/Documents/Codes/DSC/DSC_STUDY/FinDataChallenge/takazawa/data/train.csv", index_col=0)

In [154]:
# convert data as in eda.ipynb
# import addfips
# af = addfips.AddFIPS()

categorical_cols = ['FranchiseCode','RevLineCr', 'LowDoc', 'Sector', 'UrbanRural', 'NewExist']
date_cols = ["DisbursementDate", "ApprovalDate"]
dollar_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
for col in categorical_cols:
    data[col] = data[col].astype('category')
    if data[col].isnull().sum():
        data[col] = data[col].cat.add_categories("NAN").fillna("NAN")
for col in date_cols:
    data[col] = pd.to_datetime(data[col], format="%d-%b-%y")
    # add date cols
    data[col + "_year"] = pd.DatetimeIndex(data[col]).year
    data[col + "_month"] = pd.DatetimeIndex(data[col]).month
    data[col + "_day"] = pd.DatetimeIndex(data[col]).day
    data[col + "_daystamp"] = (data[col] - data[col].min()).dt.days
for col in dollar_cols:
    data[col] = data[col].str.replace("[$,]", "", regex=True)
    data[col] = data[col].astype(float)

## I want to run Codes below but currently not possible due to access limit??
# all_states = data['State'].to_numpy()
# all_state_fips = [af.get_state_fips(item) for item in all_states]
# data['State_FIPS'] = all_state_fips
# county_fips = [county_FIPS(item['City'], item['State'], item['State_FIPS']) for i, item in data.iterrows()]
# data['County_FIPS'] = county_fips

In [155]:
data.dtypes

Term                                  int64
NoEmp                                 int64
NewExist                           category
CreateJob                             int64
RetainedJob                           int64
FranchiseCode                      category
RevLineCr                          category
LowDoc                             category
DisbursementDate             datetime64[ns]
MIS_Status                            int64
Sector                             category
ApprovalDate                 datetime64[ns]
ApprovalFY                            int64
City                                 object
State                                object
BankState                            object
DisbursementGross                   float64
GrAppv                              float64
SBA_Appv                            float64
UrbanRural                         category
DisbursementDate_year               float64
DisbursementDate_month              float64
DisbursementDate_day            

In [156]:
num_cols = ['NoEmp', 'CreateJob', 'RetainedJob', 'ApprovalFY', 'DisbursementGross', 'GrAppv', 'SBA_Appv']
retained_cat_cols = ['NewExist', 'RevLineCr', 'LowDoc', 'UrbanRural']
timestamp_cols = ['DisbursementDate_daystamp', 'ApprovalDate_daystamp']

# coarse franchise col
data['FranchiseCode1'] = (data['FranchiseCode']==1).astype("category")
data['FranchiseCode0'] = (data['FranchiseCode']==1).astype("category")
franchise_cols = ['FranchiseCode1', 'FranchiseCode0']

In [157]:
# devide into training data and test data
X = data.drop("MIS_Status", axis=1)
y = data["MIS_Status"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [180]:
# target encoding: function
def Holdout_target_encoding(X,y, column,folds):
  df = X; df['target']=y
  df[column + "_target"] = 0.9
  tmp = df[[column, column + "_target"]]
  for idx1, idx2 in folds:
    train = df.iloc[idx1]
    #val = df.iloc[idx2]
    mean = train.groupby(column)['target'].mean()
    for ind, v in tmp.iloc[idx2].iterrows():
      try:
        tmp.loc[ind,column+"_target"] = mean.loc[tmp.loc[ind, column]]
      except:
        continue
  df[column+ "_target"] = tmp[column + "_target"]
  return df

In [184]:
def target_encode_test(train_X, train_y, test_X, column):
    df = train_X; df['target'] = train_y
    mean = train_X.groupby(column)['target'].mean()
    test_X[column + "_target"] = 0.9
    for ind in mean.index:
        test_X.loc[test_X[column] == ind, column + "_target"] = mean[ind]
    return test_X

In [185]:
target_encode_cols = ['Sector', 'State', 'BankState']
kf = KFold(n_splits=3, shuffle=True, random_state=1000)
# for train
kf_iter_train = kf.split(X_train)
folds_train = []
for train_idx, test_idx in kf.split(X_train):
    folds_train.append((train_idx, test_idx))
for col in target_encode_cols:
    X_train = Holdout_target_encoding(X_train, y_train, col, folds_train)
# for validation, we use target encoding of train data
for col in target_encode_cols:
    X_test = target_encode_test(X_train, y_train, X_test, col)
target_encoded_cols = [item + "_target" for item in target_encode_cols]

In [186]:
all_cols = num_cols + retained_cat_cols + timestamp_cols + franchise_cols + target_encoded_cols
X_train[all_cols].head()

Unnamed: 0,NoEmp,CreateJob,RetainedJob,ApprovalFY,DisbursementGross,GrAppv,SBA_Appv,NewExist,RevLineCr,LowDoc,UrbanRural,DisbursementDate_daystamp,ApprovalDate_daystamp,FranchiseCode1,FranchiseCode0,Sector_target,State_target,BankState_target
6863,2,0,0,2000,75000.0,75000.0,63750.0,1.0,N,N,1,11713.0,9505,False,False,0.83703,0.900193,0.873469
30454,1,0,0,1998,286000.0,286000.0,286000.0,2.0,0,N,0,9765.0,8829,False,False,0.882296,0.902727,0.920869
8111,1,0,10,1998,50000.0,50000.0,25000.0,1.0,Y,N,1,13391.0,9112,True,True,0.912308,0.942085,0.916484
22811,5,0,0,1995,4000.0,4000.0,3400.0,1.0,0,A,0,7907.0,7947,False,False,0.914838,0.885714,0.895833
4428,3,0,0,2006,40000.0,40000.0,32000.0,1.0,N,N,0,10951.0,11990,False,False,0.83703,0.914692,0.928109


In [187]:
def Macrof1(preds, eval_dataset):
    y_true = eval_dataset.get_label()
    y_pred = (preds>0.5).astype(int)
    score = f1_score(y_true, y_pred, average='macro')
    return 'Macrof1', score, True

## LightGBM

In [190]:
params = {
    'objective': 'binary',
    'metric': 'custom',  # Use custom to use the custom metric for evaluation
    'verbose': 1,
}
dataset = lgb.Dataset(X_train[all_cols], label=y_train)

# Define CV parameters
cv_results = lgb.cv(
    params,
    dataset,
    num_boost_round=1000,
    nfold=5,
    feval=Macrof1,  # Custom evaluation function
    stratified=False,
    seed=42,
)

[LightGBM] [Info] Number of positive: 24180, number of negative: 2896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1979
[LightGBM] [Info] Number of data points in the train set: 27076, number of used features: 18
[LightGBM] [Info] Number of positive: 24128, number of negative: 2948
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1979
[LightGBM] [Info] Number of data points in the train set: 27076, number of used features: 18
[LightGBM] [Info] Number of positive: 24157, number of negative: 2919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [191]:
cv_results['valid Macrof1-mean']

[0.4714761015334649,
 0.4714761015334649,
 0.4714761015334649,
 0.4714761015334649,
 0.4714761015334649,
 0.4714761015334649,
 0.47390416664770213,
 0.4841575308308891,
 0.5021155078704285,
 0.5193846760740946,
 0.5335591475791671,
 0.557697972831976,
 0.5946696652641943,
 0.6124640716585642,
 0.6186372492145982,
 0.624315621021919,
 0.6271811227790562,
 0.631707042151707,
 0.6317210314366559,
 0.6344367979307284,
 0.6351014344503148,
 0.6363364008595997,
 0.637444188615001,
 0.6397325507178875,
 0.6405662612659442,
 0.6408304629849193,
 0.6417989050365803,
 0.6417997259476971,
 0.642595264981003,
 0.64335250853935,
 0.6438728031146447,
 0.6446812658097889,
 0.6444218028204404,
 0.645248515475851,
 0.6449523425043957,
 0.6452634286394696,
 0.6449910189785788,
 0.6452454110194353,
 0.6452885642530576,
 0.6455566322580933,
 0.645789152030275,
 0.6454332184864059,
 0.6454870885462916,
 0.6459950412137283,
 0.646116997380334,
 0.646426440731094,
 0.6464576371061336,
 0.6474370316198245,
 0

## Training with all data

In [197]:
# target encode
target_encode_cols = ['Sector', 'State', 'BankState']
kf = KFold(n_splits=3, shuffle=True, random_state=100)
folds = []
for train_idx, test_idx in kf.split(X):
    folds.append((train_idx, test_idx))
for col in target_encode_cols:
    data = Holdout_target_encoding(data,data['MIS_Status'],col,folds)

In [198]:
data

Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,...,ApprovalDate_year,ApprovalDate_month,ApprovalDate_day,ApprovalDate_daystamp,FranchiseCode1,FranchiseCode0,target,Sector_target,State_target,BankState_target
0,163,21,1.0,0,0,1,N,N,1998-01-31,1,...,2006,9,22,12028,True,True,1,0.940968,0.926978,0.938422
1,84,6,1.0,4,0,0,0,N,1993-10-31,1,...,1992,6,30,6831,False,False,1,0.900000,0.904899,0.913333
2,242,45,1.0,4,90,0,N,N,2001-08-31,1,...,2001,4,18,10045,False,False,1,0.897122,0.957386,0.970803
3,237,4,1.0,0,0,0,N,N,2007-08-31,1,...,2003,10,6,10946,False,False,1,0.913703,0.930355,0.940112
4,184,0,1.0,0,0,0,N,N,1983-06-08,1,...,1999,12,17,9557,False,False,1,0.941393,0.880574,0.877026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,283,14,1.0,0,0,1,N,N,1998-01-31,1,...,1995,3,2,7806,True,True,1,0.938835,0.908805,0.887340
42303,53,2,1.0,0,0,0,Y,N,1991-04-03,1,...,2007,6,6,12285,False,False,1,0.897089,0.879886,0.943820
42304,59,6,2.0,0,0,1,N,N,2003-02-28,1,...,2003,3,14,10740,True,True,1,0.897122,0.894410,0.875810
42305,295,18,1.0,0,8,0,N,N,1997-12-10,1,...,1989,8,23,5789,False,False,1,0.897122,0.807867,0.785448


In [200]:
params = {
    'objective': 'binary',
    'metric': 'custom',  # Use custom to use the custom metric for evaluation
    'verbose': 1,
}
dataset = lgb.Dataset(data[all_cols], label=data['MIS_Status'])

# Define CV parameters
cv_results = lgb.cv(
    params,
    dataset,
    num_boost_round=100,
    nfold=5,
    feval=Macrof1,  # Custom evaluation function
    stratified=False,
    seed=42,
    return_cvbooster=True,
)

[LightGBM] [Info] Number of positive: 30191, number of negative: 3653
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1996
[LightGBM] [Info] Number of data points in the train set: 33844, number of used features: 18
[LightGBM] [Info] Number of positive: 30222, number of negative: 3622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1996
[LightGBM] [Info] Number of data points in the train set: 33844, number of used features: 18
[LightGBM] [Info] Number of positive: 30198, number of negative: 3646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [204]:
boosters = cv_results['cvbooster'].boosters

In [220]:
# make encoding for test data
testdata=pd.read_csv("../data/test.csv", index_col=0)
categorical_cols = ['FranchiseCode','RevLineCr', 'LowDoc', 'Sector', 'UrbanRural', 'NewExist']
date_cols = ["DisbursementDate", "ApprovalDate"]
dollar_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
for col in categorical_cols:
    testdata[col] = testdata[col].astype('category')
    if testdata[col].isnull().sum():
        testdata[col] = testdata[col].cat.add_categories("NAN").fillna("NAN")
for col in date_cols:
    testdata[col] = pd.to_datetime(testdata[col], format="%d-%b-%y")
    # add date cols
    testdata[col + "_year"] = pd.DatetimeIndex(testdata[col]).year
    testdata[col + "_month"] = pd.DatetimeIndex(testdata[col]).month
    testdata[col + "_day"] = pd.DatetimeIndex(testdata[col]).day
    testdata[col + "_daystamp"] = (testdata[col] - testdata[col].min()).dt.days
for col in dollar_cols:
    testdata[col] = testdata[col].str.replace("[$,]", "", regex=True)
    testdata[col] = testdata[col].astype(float)

testdata['FranchiseCode1'] = (testdata['FranchiseCode']==1).astype("category")
testdata['FranchiseCode0'] = (testdata['FranchiseCode']==1).astype("category")


test_X = testdata
for col in target_encode_cols:
    test_X = target_encode_test(data, data['MIS_Status'], test_X, col)

In [225]:
pred_per_cv = [item.predict(test_X[all_cols]) for item in boosters]
pred_average = np.mean(pred_per_cv, axis=0)
testdata['predict'] = (pred_average > 0.5).astype(int)

In [226]:
testdata['predict'].to_csv("fisrtmodel.csv", header=False)

In [227]:
print(testdata)

       Term  NoEmp NewExist  CreateJob  RetainedJob FranchiseCode RevLineCr  \
42307     5      2      1.0          1            0             0         T   
42308   235     13      1.0          9           14         77725         Y   
42309    31      5      2.0          0            0             0         N   
42310   120      4      1.0          0            1             0         Y   
42311    63     13      1.0          0            8             1         N   
...     ...    ...      ...        ...          ...           ...       ...   
84610   243     10      1.0          3           14             0         N   
84611   178      0      2.0          0            0             1         N   
84612    42      1      2.0          3            9             0         Y   
84613    76     15      1.0          0            0             0         N   
84614    35      3      2.0          1            4         18150         Y   

      LowDoc DisbursementDate Sector  ... ApprovalD