# Load files

In [None]:
# Locations of the competition's training and test CSV files.
filename_train, filename_test = (
    '../input/cat-in-the-dat-ii/train.csv',
    '../input/cat-in-the-dat-ii/test.csv',
)

In [None]:
import pandas as pd

# Read both CSV files the same way, using the 'id' column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (filename_train, filename_test)
)

In [None]:
# Bare expression: shows the training frame as this cell's output.
df_train

In [None]:
# Bare expression: shows the test frame as this cell's output.
df_test

# Preprocessing

## Explanatory variables

In [None]:
# Name of the label column: dropped from the training features and used as the
# training target further down.
obj_col = 'target'

In [None]:
# Training features: every column of the training frame except the label column.
X_train = df_train.drop(columns=[obj_col])
X_train

In [None]:
# Use the test frame unchanged as the test feature matrix (nothing is dropped here).
# NOTE: this binds the same object rather than a copy, so any in-place change made
# through X_test is also visible through df_test.
X_test = df_test
X_test

## Objective variable

In [None]:
# Training labels: the obj_col column of the training frame.
y_train = df_train[obj_col]
y_train

## Convert object-dtype columns to category dtype

In [None]:
def to_category(df):
    """Convert the string-like columns of ``df`` to pandas ``category`` dtype.

    A column is converted when its dtype is ``object`` or a pandas
    ``StringDtype``; every other column is left untouched.

    The conversion happens in place: the DataFrame passed in is modified.

    Args:
        df: DataFrame whose string-like columns should become categorical.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Snapshot the (column, dtype) pairs up front because columns are
    # reassigned inside the loop.
    for col, dtype in list(df.dtypes.items()):
        # pandas may store text either as plain ``object`` or as its dedicated
        # string extension dtype; both are treated as categorical candidates.
        if dtype == object or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')

    return df

In [None]:
# Apply the same categorical conversion to the training and test features.
X_train, X_test = (to_category(frame) for frame in (X_train, X_test))

# Modeling

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

def acc_score(preds, data):
    """Custom evaluation function for LightGBM: accuracy at a 0.5 cut-off.

    Args:
        preds: Array of predicted scores; values strictly above 0.5 are
            counted as class 1, everything else as class 0.
        data: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        A ``(metric_name, value, is_higher_better)`` tuple:
        ``('acc', <accuracy>, True)``.
    """
    labels = data.get_label()
    hard_preds = (preds > 0.5).astype(int)
    return 'acc', accuracy_score(labels, hard_preds), True

# Wrap the training features and labels in LightGBM's dataset container.
lgb_train = lgb.Dataset(X_train, y_train)

params = {
    'objective': 'binary',
    'min_data_in_leaf': 15,
    'force_row_wise': True,
}

# 5-fold cross-validation with at most 10000 boosting rounds and early stopping
# after 20 rounds without improvement.
# Early stopping is requested through a callback instead of the
# `early_stopping_rounds` keyword: that keyword was removed from lgb.cv in
# LightGBM 4.0, whereas the callback is accepted by both 3.x and 4.x.
# verbose=False keeps the callback as quiet as the keyword form was.
cv_rslt = lgb.cv(
    params,
    lgb_train,
    nfold=5,
    num_boost_round=10000,
    callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
    feval=acc_score,
    return_cvbooster=True,
)

In [None]:
# Boosters trained during cross-validation; present in the result because
# return_cvbooster=True was passed to lgb.cv.
cvbooster = cv_rslt['cvbooster']

In [None]:
# Per-round mean accuracy across the folds.
# LightGBM 3.x stores it under 'acc-mean', while 4.x prefixes the key with the
# dataset name ('valid acc-mean'), so look the entry up by its suffix.
next(history for key, history in cv_rslt.items() if key.endswith('acc-mean'))

# Prediction

In [None]:
# One prediction array per cross-validation booster, each truncated to the
# best iteration found during CV.
y_preds = cvbooster.predict(X_test, num_iteration=cvbooster.best_iteration)
# Average the boosters' predictions element-wise and label the result 'target'.
y_pred = pd.Series(np.mean(y_preds, axis=0), name='target')
y_pred

In [None]:
# Turn the test index into a column re-keyed on 0..n-1 so it lines up
# row-by-row with y_pred, then place the two side by side.
id_column = df_test.index.to_series().reset_index(drop=True)
answer = pd.concat([id_column, y_pred], axis=1)
answer

In [None]:
# Write the submission to the working directory, leaving out the row index.
filename_output = './submission.csv'
answer.to_csv(path_or_buf=filename_output, index=False)