In [1]:
!pip install lightgbm==3.1.1

Collecting lightgbm==3.1.1
  Using cached lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl (1.8 MB)
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.3.1
    Uninstalling lightgbm-2.3.1:
      Successfully uninstalled lightgbm-2.3.1
Successfully installed lightgbm-3.1.1


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
import gc
from tqdm.notebook import tqdm
pd.options.display.max_columns = 100

In [3]:
DATA_DIR = 'data/'

In [4]:
%%time
train_bureau = pd.read_csv(DATA_DIR+'train_bureau.csv')
test_bureau = pd.read_csv(DATA_DIR+'test_bureau.csv')
train = pd.read_csv(DATA_DIR+'train_Data.csv')
test = pd.read_csv(DATA_DIR+'test_Data.csv')
# data_dict = pd.read_csv(DATA_DIR+'data_dict.csv')
# ss = pd.read_csv(DATA_DIR+'sample_submission_ejm25Dc.csv')

CPU times: user 3.05 s, sys: 168 ms, total: 3.22 s
Wall time: 3.22 s


In [5]:
ID_COL, TARGET_COL = 'ID', 'Top-up Month'
target_mapper = pd.Series({'No Top-up Service': 0,
                           '12-18 Months': 1,
                           '18-24 Months': 2,
                           '24-30 Months': 3,
                           '30-36 Months': 4,
                           '36-48 Months': 5,
                           ' > 48 Months': 6,
 })
target_inv_mapper = pd.Series(index = target_mapper.values, data = target_mapper.index)
train[TARGET_COL] = train[TARGET_COL].map(target_mapper)

In [6]:
df = train.append(test).reset_index(drop = True)
date_cols = ['DisbursalDate', 'MaturityDAte', 'AuthDate']
df[date_cols] = df[date_cols].apply(lambda x: pd.to_datetime(x))

del df['AssetID']
gc.collect()

0

In [7]:
df_bureau = train_bureau.append(test_bureau).reset_index(drop = True)
df_bureau = df_bureau.drop_duplicates(['ID','DISBURSED-DT'],keep='first')
date_cols = ['DATE-REPORTED', 'DISBURSED-DT', 'CLOSE-DT', 'LAST-PAYMENT-DATE']
df_bureau[date_cols] = df_bureau[date_cols].apply(lambda x: pd.to_datetime(x, errors = 'coerce'))
df_bureau = df_bureau.sort_values(by = ['ID', 'DISBURSED-DT']).reset_index(drop = True)
df_bureau['app_dd'] = df_bureau['ID'].map(df.set_index('ID')['DisbursalDate'])

In [8]:
def clean(x):
    try:
        return float(str(x).replace(',', ''))
    except:
        return np.nan
    
df_bureau['DISBURSED-AMT/HIGH CREDIT'] = df_bureau['DISBURSED-AMT/HIGH CREDIT'].apply(lambda x: clean(x))
df_bureau['DISBURSED-DT_days_since_start'] = (df_bureau['DISBURSED-DT'] - df_bureau['DISBURSED-DT'].min()).dt.days

In [9]:
use_acct_types = ['Gold Loan', 'Personal', 'Tractor Loan', 'Overdraft', 'Business Loan Priority Sector  Agriculture', 
                 'Auto Loan (Personal)', 'Kisan Credit Card', 'Consumer Loan', 'Business Loan']

In [10]:
######## ACCT-TYPE features
use_acct_types = ['Gold Loan', 'Tractor Loan', 'Overdraft', 'Business Loan Priority Sector  Agriculture',
                 'Commercial Vehicle Loan']


for a in tqdm(use_acct_types):
    fltr = df_bureau['ACCT-TYPE'] == a
    df[f'ACCT-TYPE_{a}_DISBURSED_DT_max'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_start'].max())
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT'].max())
    df[f'ACCT-TYPE_{a}_DISBURSED_DT_max - DisbursalDate'] = (df['tmp'] - df['DisbursalDate']).dt.days
    
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_max'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].max())
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last - DisbursalAmount'] = df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last'] - df['DisbursalAmount']
    
    
    
del df['tmp']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [11]:
for a in tqdm(use_acct_types):
    fltr = df_bureau['ACCT-TYPE'] == a
    df[f'ACCT-TYPE_{a}_DISBURSED_DT_max'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_start'].max())
    fltr2 = fltr & (df_bureau['DISBURSED-DT'] > df_bureau['app_dd'])
    
    df[f'ACCT-TYPE_{a}_DISBURSED_DT_next_after_loan'] = df['ID'].map(df_bureau[fltr2].groupby('ID')['DISBURSED-DT_days_since_start'].first())
    df['tmp'] = df['ID'].map(df_bureau[df_bureau['DISBURSED-DT'] == df_bureau['app_dd']].set_index('ID')['DISBURSED-DT_days_since_start'])
    
    f = f'ACCT-TYPE_{a}_DISBURSED_DT_next_after_loan - DisbursalDate'
    df[f] = (df[f'ACCT-TYPE_{a}_DISBURSED_DT_next_after_loan'] - df['tmp'])     
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT'].max())
    df[f'ACCT-TYPE_{a}_DISBURSED_DT_max - DisbursalDate'] = (df['tmp'] - df['DisbursalDate']).dt.days

    
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_max'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].max())
    df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last - DisbursalAmount'] = df[f'ACCT-TYPE_{a}_DISBURSED-AMT/HIGH CREDIT_last'] - df['DisbursalAmount']
    
    
del df['tmp']
gc.collect()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




50

In [12]:
tmp = df_bureau[df_bureau['DISBURSED-DT'] > df_bureau['app_dd']]

f = f'TOTAL_DISBURSED-AMT/HIGH CREDIT_after_DisbursalDate'
df[f] = df['ID'].map(tmp.groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].sum())

f = f'MAX_DISBURSED-AMT/HIGH CREDIT_after_DisbursalDate'
df[f] = df['ID'].map(tmp.groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].max())

f = f'LAST_DISBURSED-AMT/HIGH CREDIT'
df[f] = df['ID'].map(df_bureau.groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())

In [13]:
df_bureau['first_DISBURSED-DT'] = df_bureau.groupby("ID")['DISBURSED-DT'].transform('first')
df_bureau['DISBURSED-DT_days_since_first_loan'] = (df_bureau['DISBURSED-DT'] - df_bureau['first_DISBURSED-DT']).dt.days
df_bureau['DISBURSED-DT_days_since_DisbursalDate'] = (df_bureau['DISBURSED-DT'] - df_bureau['app_dd']).dt.days

In [14]:
for a in tqdm(df_bureau['SELF-INDICATOR'].unique()):
    fltr = df_bureau['SELF-INDICATOR'] == a
    df[f'SELF-INDICATOR-TYPE_{a}_DISBURSED_DT_max'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_start'].max())
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT'].max())
    df[f'SELF-INDICATOR_{a}_DISBURSED_DT_max - DisbursalDate'] = (df['tmp'] - df['DisbursalDate']).dt.days
    
    df[f'SELF-INDICATOR_{a}_DISBURSED-AMT/HIGH CREDIT_last'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())

    
del df['tmp']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [15]:
for s in tqdm(df_bureau['SELF-INDICATOR'].unique()):
    for c in tqdm(['Personal Loan', 'Gold Loan', 'Kisan Credit Card', 'Consumer Loan']):
        fltr = (df_bureau['SELF-INDICATOR'] == s) & (df_bureau[f'ACCT-TYPE'] == c)
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_max_days_since_first_loan'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_first_loan'].max())
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_max_days_since_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].max())
        
        fltr_add = (df_bureau['DISBURSED-DT'] > df_bureau['app_dd']) & fltr
        
        
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_just_after_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr_add].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(0))
        
        
        
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_2nd_last_days_since_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-2))             
        
        
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_3rd_last_days_since_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-3))
        
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_4th_last_days_since_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-4))
        
        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_DISBURSED_DT_5th_last_days_since_DisbursalDate'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-5))

        df[f'SELF-INDICATOR-TYPE_{a}_ACCT-TYPE_{c}_last_DISBURSED-AMT/HIGH CREDIT'] = \
        df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))





In [16]:
for s in tqdm(df_bureau['SELF-INDICATOR'].unique()):

    fltr = (df_bureau['SELF-INDICATOR'] == s) & (df_bureau[f'ACCT-TYPE'].isin(['Personal Loan', 'Gold Loan']))
    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_max_days_since_first_loan'] = df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_first_loan'].max())

    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_max_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].max())


    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_just_after_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr_add].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(0))


    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_2nd_last_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-2))

    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_3rd_last_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-3))

    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_4th_last_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-4))
    
    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_DISBURSED_DT_5th_last_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(-5))

    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan_last_DISBURSED-AMT/HIGH CREDIT'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].last())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [17]:
for s in tqdm(df_bureau['SELF-INDICATOR'].unique()):

    fltr = (df_bureau['SELF-INDICATOR'] == s) & (df_bureau[f'ACCT-TYPE'].isin(['Business Loan Priority Sector  Agriculture', 'Tractor Loan']))

    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan2_DISBURSED_DT_max_days_since_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].max())


    df[f'SELF-INDICATOR-TYPE_{a}_Non Tractor Loan2_DISBURSED_DT_just_after_DisbursalDate'] = \
    df['ID'].map(df_bureau[fltr_add].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(0))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [18]:
col = 'ACCOUNT-STATUS'
for a in ['Tractor Loan', 'Gold Loan']:
    for o in tqdm(df_bureau[col].unique()):
        for s in df_bureau['SELF-INDICATOR'].unique():

            fltr = (df_bureau['SELF-INDICATOR'] == s) & (df_bureau[f'ACCT-TYPE'] == a) & (df_bureau[col] == o)

            df[f'ACCT_TYPE_{a}_SELF-INDICATOR_{s}_{col}_{o}_DISBURSED_DT_max_days_since_DisbursalDate'] = \
            df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].max())


            df[f'ACCT_TYPE_{a}_SELF-INDICATOR_{s}_{col}_{o}_DISBURSED_DT_just_after_DisbursalDate'] = \
            df['ID'].map(df_bureau[fltr_add].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(0))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [19]:
col = 'CONTRIBUTOR-TYPE'
for a in ['Tractor Loan', 'Gold Loan']:
    for o in tqdm(df_bureau[col].unique()):
        for s in df_bureau['SELF-INDICATOR'].unique():

            fltr = (df_bureau['SELF-INDICATOR'] == s) & (df_bureau[f'ACCT-TYPE'] == a) & (df_bureau[col] == o)

            df[f'ACCT_TYPE_{a}_SELF-INDICATOR_{s}_{col}_{o}_DISBURSED_DT_max_days_since_DisbursalDate'] = \
            df['ID'].map(df_bureau[fltr].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].max())


            df[f'ACCT_TYPE_{a}_SELF-INDICATOR_{s}_{col}_{o}_DISBURSED_DT_just_after_DisbursalDate'] = \
            df['ID'].map(df_bureau[fltr_add].groupby('ID')['DISBURSED-DT_days_since_DisbursalDate'].nth(0))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [20]:
for i in range(1, 3):
    fltr = df_bureau['DISBURSED-DT'] >= df_bureau['app_dd']
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby("ID")['DISBURSED-DT'].nth(i))
    df[f'DisbursalDT - DISBURSED-DT_next_{i}'] = (df['DisbursalDate'] - df['tmp']).dt.days
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby("ID")['DISBURSED-DT'].nth(-i))
    df[f'DisbursalDT - DISBURSED-DT_last_{i}'] = (df['DisbursalDate'] - df['tmp']).dt.days
    
    df['tmp'] = df['ID'].map(df_bureau[fltr].groupby("ID")['DISBURSED-AMT/HIGH CREDIT'].nth(-i))
    df[f'DisbursalAmount - DISBURSED-AMT/HIGH CREDIT_last_{i}'] = (df['DisbursalAmount'] - df['tmp'])
    
del df['tmp']

In [21]:
tmp = df_bureau[df_bureau['DISBURSED-DT'] > df_bureau['app_dd']]
for c in tqdm(['Personal Loan', 'Gold Loan', 'Kisan Credit Card', 'Consumer Loan', 'Business Loan']):
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_mean'] = df['ID'].map(tmp[tmp['ACCT-TYPE'] == c].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].mean())
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_max'] = df['ID'].map(tmp[tmp['ACCT-TYPE'] == c].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].max())
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_first'] = df['ID'].map(tmp[tmp['ACCT-TYPE'] == c].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].first())
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_second'] = df['ID'].map(tmp[tmp['ACCT-TYPE'] == c].groupby('ID')['DISBURSED-AMT/HIGH CREDIT'].nth(1))
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_max - DisbursalAmount'] = df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_max'] - df['DisbursalAmount']
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_first - DisbursalAmount'] = df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_first'] - df['DisbursalAmount']
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_second - DisbursalAmount'] = df[f'ACCT-TYPE_{c}_after_DisbursalDate_DISBURSED-AMT/HIGH CREDIT_second'] - df['DisbursalAmount']
    df[f'ACCT-TYPE_{c}_after_DisbursalDate_total_loans'] = df['ID'].map(tmp[tmp['ACCT-TYPE'] == c].groupby('ID').size())
    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [22]:
df['ID_freq'] = df['ID'].map(df_bureau['ID'].value_counts())
df['ID_freq'] = (df['ID_freq'] == 1)

In [23]:
fltr = df_bureau['app_dd'] == df_bureau['DISBURSED-DT']
df_bureau['total_loan_days'] = (df_bureau['CLOSE-DT'] - df_bureau['DISBURSED-DT']).dt.days
df['total_loan_days'] = df['ID'].map(df_bureau[fltr].set_index('ID')['total_loan_days'])
df['total_loan_days_max'] = df['ID'].map(df_bureau.groupby('ID')['total_loan_days'].max())
df['total_loan_days_min'] = df['ID'].map(df_bureau.groupby('ID')['total_loan_days'].min())
df['total_loan_days_first'] = df['ID'].map(df_bureau.groupby('ID')['total_loan_days'].first())
df['total_loan_days_last'] = df['ID'].map(df_bureau.groupby('ID')['total_loan_days'].last())
df['total_loan_days_range'] = df['total_loan_days_max'] - df['total_loan_days_min']

In [24]:
df['DisbursalDate - MaturityDAte'] = (df['DisbursalDate'] - df['MaturityDAte']).dt.days

In [25]:
for c in ['ACCT-TYPE', 'DISBURSED-AMT/HIGH CREDIT']:
    df[c + '_first'] = df['ID'].map(df_bureau.groupby('ID')[c].first())
    df[c + '_last'] = df['ID'].map(df_bureau.groupby('ID')[c].last())
    df[c + '_second_last'] = df['ID'].map(df_bureau.groupby('ID')[c].nth(-2))
    
    
df['ACCT-TYPE'] = df['ID'].map(df_bureau[df_bureau['DISBURSED-DT'] > df_bureau['app_dd']].groupby("ID")['ACCT-TYPE'].first())

In [26]:
fltr = df_bureau['DISBURSED-DT'] > df_bureau['app_dd']
tmp = df_bureau[fltr]
for c in df_bureau['SELF-INDICATOR'].unique():
    df_bureau['tmp'] = (df_bureau['SELF-INDICATOR'] == c)*1
    df[f'mean_{c}_all'] = df['ID'].map(df_bureau.groupby('ID')['tmp'].mean())
    df[f'sum_{c}_all'] = df['ID'].map(df_bureau.groupby('ID')['tmp'].sum())

In [27]:
cat_cols = df.head().select_dtypes('object').columns.tolist()
cat_cols = [c for c in cat_cols if c not in ['DisbursalDate', 'MaturityDAte', 'AuthDate']]
print(cat_cols)
for c in cat_cols:
    df[c] = pd.factorize(df[c])[0]
    


['Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode', 'Area', 'SEX', 'City', 'State', 'ACCT-TYPE_first', 'ACCT-TYPE_last', 'ACCT-TYPE_second_last', 'ACCT-TYPE']


In [28]:
vc = (df.isnull().sum()/df.shape[0]).sort_values(ascending = False)
to_drop = vc[vc == 1].index.tolist()
len(to_drop)

63

In [29]:
train, test = df[:train.shape[0]].reset_index(drop = True), df[train.shape[0]:].reset_index(drop = True)
print(train.shape, test.shape)

fts = [c for c in train.columns if c not in [ID_COL,TARGET_COL, 'DisbursalDate', 'MaturityDAte', 'AuthDate'] + to_drop]
print(len(fts))

(128655, 352) (14745, 352)
284


In [31]:
def lgb_f1_score(y_true, y_pred):
    y_pred = y_pred.reshape(7, -1).T
    y_pred = y_pred.argmax(axis=1) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_pred, average='macro'), True

In [32]:
np.random.seed(52)
N_SPLITS = 11
folds = StratifiedKFold(N_SPLITS, shuffle = True, random_state = 2)
oofs = np.zeros((len(train), 7))
preds = np.zeros((len(test), 7))
fi_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train[TARGET_COL])):
    
    X_trn, y_trn = train[fts].iloc[trn_idx], train[TARGET_COL].iloc[trn_idx]
    X_val, y_val = train[fts].iloc[val_idx], train[TARGET_COL].iloc[val_idx]
    
    params = {
        'learning_rate': np.random.choice([0.05, 0.06, 0.08, 0.04, 0.1])/2,
        'colsample_bytree': np.random.choice([0.25, 0.5, 0.65, 0.8, 0.95, 0.7, 0.68]),
        'reg_alpha': np.random.choice([0.22, 0.85, 0.65, 0.7, 0.1]),
        'reg_lambda': np.random.choice([0.7, 0.1, 0.22, 0.85, 0.65, 0.05]),
        'max_depth': np.random.choice([4, 6, 8, 12, -1]),
        'n_estimators': 10000000000,
        'metric': 'None',
        'subsample_for_bin': 100000000000,
        'objective':'binary',
        'num_leaves': 61,
        'min_child_samples': 10,
    }

    print(params['max_depth'])
    
    clf = LGBMClassifier(**params)
    clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], verbose = 50, early_stopping_rounds = 300, eval_metric = lgb_f1_score)
    
    vp = clf.predict_proba(X_val)
    val_score = round(f1_score(y_val, vp.argmax(axis = 1), average = 'macro'), 5)
    
    print(f'\nFold {fold_+1} Val score: {val_score}\n')
    
    tp = clf.predict_proba(test[fts])
    
    oofs[val_idx] = vp
    preds += tp/N_SPLITS
        
    fi_df = fi_df.append(pd.DataFrame({'fold': fold_, 'feature': fts, 'importance': clf.feature_importances_}))
    
oof_score = round(f1_score(train[TARGET_COL], oofs.argmax(axis=1), average = 'macro'), 6)
print(f'\nOOF Auc is : {oof_score}')

8
Training until validation scores don't improve for 300 rounds
[50]	valid_0's f1: 0.711039
[100]	valid_0's f1: 0.716937
[150]	valid_0's f1: 0.715414
[200]	valid_0's f1: 0.719044
[250]	valid_0's f1: 0.718292
[300]	valid_0's f1: 0.720048
[350]	valid_0's f1: 0.71725
[400]	valid_0's f1: 0.718888
[450]	valid_0's f1: 0.721683
[500]	valid_0's f1: 0.719671
[550]	valid_0's f1: 0.719729
[600]	valid_0's f1: 0.719162
[650]	valid_0's f1: 0.720899
[700]	valid_0's f1: 0.7196
[750]	valid_0's f1: 0.71893
Early stopping, best iteration is:
[458]	valid_0's f1: 0.722205

Fold 1 Val score: 0.7222

12
Training until validation scores don't improve for 300 rounds
[50]	valid_0's f1: 0.71359
[100]	valid_0's f1: 0.732159
[150]	valid_0's f1: 0.735049
[200]	valid_0's f1: 0.737695
[250]	valid_0's f1: 0.738638
[300]	valid_0's f1: 0.735943
[350]	valid_0's f1: 0.734269
[400]	valid_0's f1: 0.735387
[450]	valid_0's f1: 0.737282
[500]	valid_0's f1: 0.73727
Early stopping, best iteration is:
[242]	valid_0's f1: 0.740716

In [33]:
oof_score = round(f1_score(train[TARGET_COL], oofs.argmax(axis=1), average = 'macro'), 6)
print(f'\nOOF Auc is : {oof_score}')


OOF Auc is : 0.73041


In [34]:
train2 = oofs.copy()
test2 = preds.copy()


In [35]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train[TARGET_COL])):
    
    X_trn, y_trn = train2[trn_idx], train[TARGET_COL].iloc[trn_idx]
    X_val, y_val = train2[val_idx], train[TARGET_COL].iloc[val_idx]
    
    params = {
        'learning_rate': 0.02,#np.random.choice([0.05, 0.06, 0.08, 0.04, 0.1])/5,
        'colsample_bytree': 0.8,#np.random.choice([0.25, 0.5, 0.65, 0.8, 0.95, 0.7, 0.68]),
        'reg_alpha': 0.3,#np.random.choice([0.22, 0.85, 0.65, 0.7, 0.1]),
        'reg_lambda': 0.3,#np.random.choice([0.7, 0.1, 0.22, 0.85, 0.65, 0.05]),
        'max_depth': -1,#np.random.choice([4, 6, 8, 12, -1]),
        'n_estimators': 10000000000,
        'metric': 'None',
        'subsample_for_bin': 100000000000,
        'objective':'binary',
        'num_leaves': 31,
        'min_child_samples': 7,
    }

    
    clf = LGBMClassifier(**params)
    clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], verbose = 50, early_stopping_rounds = 200, eval_metric = lgb_f1_score)
    
    vp = clf.predict_proba(X_val)
    val_score = round(f1_score(y_val, vp.argmax(axis = 1), average = 'macro'), 5)
    
    print(f'\nFold {fold_+1} Val score: {val_score}\n')
    
    tp = clf.predict_proba(test2)
    
    oofs[val_idx] = vp
    preds += tp/N_SPLITS

    
oof_score = round(f1_score(train[TARGET_COL], oofs.argmax(axis=1), average = 'macro'), 6)
print(f'\nOOF Auc is : {oof_score}')

Training until validation scores don't improve for 200 rounds
[50]	valid_0's f1: 0.715424
[100]	valid_0's f1: 0.726623
[150]	valid_0's f1: 0.725932
[200]	valid_0's f1: 0.727193
[250]	valid_0's f1: 0.725912
[300]	valid_0's f1: 0.725957
[350]	valid_0's f1: 0.725906
Early stopping, best iteration is:
[193]	valid_0's f1: 0.72727

Fold 1 Val score: 0.72727

Training until validation scores don't improve for 200 rounds
[50]	valid_0's f1: 0.736366
[100]	valid_0's f1: 0.733128
[150]	valid_0's f1: 0.732927
[200]	valid_0's f1: 0.735188
[250]	valid_0's f1: 0.735297
Early stopping, best iteration is:
[53]	valid_0's f1: 0.73845

Fold 2 Val score: 0.73845

Training until validation scores don't improve for 200 rounds
[50]	valid_0's f1: 0.717226
[100]	valid_0's f1: 0.721783
[150]	valid_0's f1: 0.722363
[200]	valid_0's f1: 0.720736
[250]	valid_0's f1: 0.720244
[300]	valid_0's f1: 0.719309
Early stopping, best iteration is:
[112]	valid_0's f1: 0.723476

Fold 3 Val score: 0.72348

Training until validat

In [36]:
tmp = oofs.copy()
oof_score = f1_score(train[TARGET_COL], tmp.round(4).argmax(axis=1), average = 'macro')
print(oof_score)
pd.DataFrame(oofs).round(4).to_csv('oofs_seed_2_nikhil_cv_0.733.csv')

0.7333645335429434


In [37]:
tmp = preds.copy()
pred_labels = pd.Series(tmp.argmax(axis = 1))
preds_actual = pred_labels.map(target_inv_mapper)
pd.DataFrame(preds).round(4).to_csv('preds_seed_2_nikhil_cv_0.733.csv')