In [None]:
from collections import Counter
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import plot_importance

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, SCORERS

import matplotlib.pyplot as plt

In [None]:
# Input CSV locations, relative to the notebook directory.
TRAIN_PATH = "../data/train.csv"
TEST_PATH = "../data/test.csv"

In [None]:
# Both files are read as semicolon-separated CSV.
train_df = pd.read_csv(TRAIN_PATH, sep=";")
test_df = pd.read_csv(TEST_PATH, sep=";")
print(train_df.shape, test_df.shape)

In [None]:
# Summary statistics of the training frame (pandas reports numeric columns only by default).
train_df.describe()

**Feature engineering**

In [None]:
# How many columns of each dtype the train and test frames contain.
tuple(Counter(frame.dtypes) for frame in (train_df, test_df))

In [None]:
# define different groups of columns
target_column_name = "TARGET"
id_columns_names = ["id_contract", "id_client"]
bool_columns_names = ["IP_flag", "FLAG_DISQUALIFICATION"]
date_columns_names = ["SIGN_DATE", "DATEFIRSTREG", "TAXREG_REGDATE", "TAXREGPAY_REGDATE", "BIRTHDATE"]

# FLAG_DISQUALIFICATION and BIRTHDATE are not good columns: drop them from every group
remove_columns_names = ["FLAG_DISQUALIFICATION", "BIRTHDATE"]

# Names that must never end up among the plain real / categorical features.
# Built BEFORE the bool/date lists are pruned below, so the removed columns
# are excluded as well.
non_feature_columns = set(
    id_columns_names
    + bool_columns_names
    + date_columns_names
    + [target_column_name]
    + remove_columns_names
)

# Filtering (instead of list.remove) does not raise when one of the names is
# missing from test_df or was read with an unexpected dtype.
real_columns_names = [
    name
    for name in test_df.select_dtypes(include=['int64', 'float64']).columns
    if name not in non_feature_columns
]
cat_columns_names = [
    name
    for name in test_df.select_dtypes(include=['object']).columns
    if name not in non_feature_columns
]

# Finally prune the removed columns from the bool and date groups themselves.
bool_columns_names = [name for name in bool_columns_names if name not in remove_columns_names]
date_columns_names = [name for name in date_columns_names if name not in remove_columns_names]

In [None]:
def preproc_data(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    target_column: Optional[str] = None,
    bool_columns: Optional[List[str]] = None,
    real_columns: Optional[List[str]] = None,
    cat_columns: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
        Turn the raw train / test frames into model-ready feature matrices.

        Neither input frame is modified, so the calling cell can be re-run.

        Processing per column group:
            * boolean: NaN -> 0, cast to int;
            * real: the raw columns plus every pairwise product (squares
              included), standardised with the TRAIN mean / std. NaNs are
              left in place;
            * categorical: NaN -> "Unknown", one-hot encoded over train and
              test together so both matrices get the same dummy columns.
        Only these three groups end up in the output; date features are
        still a TODO.

        args:
            train_data: training frame, must contain the target column.
            test_data: test frame, must contain every non-target column
                of train_data.
            target_column, bool_columns, real_columns, cat_columns:
                column groups to use. When omitted they fall back to the
                notebook-level target_column_name, bool_columns_names,
                real_columns_names and cat_columns_names.

        returns:
            X_train, X_test, y_train
    """
    # fall back to the notebook-level column groups
    if target_column is None:
        target_column = target_column_name
    if bool_columns is None:
        bool_columns = bool_columns_names
    if real_columns is None:
        real_columns = real_columns_names
    if cat_columns is None:
        cat_columns = cat_columns_names

    # get target (drop() returns a new frame, the caller's frame keeps its target)
    y_train = train_data[target_column].values
    train_data = train_data.drop(columns=[target_column])

    # give the test frame the same columns, in the same order, as train
    test_data = test_data[train_data.columns]

    # boolean features
    train_bool_columns = train_data[bool_columns].fillna(0).astype("int")
    test_bool_columns = test_data[bool_columns].fillna(0).astype("int")

    # real features
    base_train_real = train_data[real_columns]
    base_test_real = test_data[real_columns]

    # pairwise products: column i is multiplied with itself and with every
    # column to its right, so each unordered pair appears exactly once.
    # The pieces are collected and concatenated once at the end.
    train_parts = [base_train_real]
    test_parts = [base_test_real]
    for i, column in enumerate(base_test_real.columns):
        test_products = base_test_real.iloc[:, i:].mul(base_test_real[column], axis=0)
        test_products.columns = [x + "*" + column for x in base_test_real.columns[i:]]
        test_parts.append(test_products)

        train_products = base_train_real.iloc[:, i:].mul(base_train_real[column], axis=0)
        train_products.columns = [x + "*" + column for x in base_train_real.columns[i:]]
        train_parts.append(train_products)
    train_real_columns = pd.concat(train_parts, axis=1)
    test_real_columns = pd.concat(test_parts, axis=1)

    # normalization: both sets use the TRAIN statistics
    # (a column that is constant in train has std == 0 and becomes NaN / inf)
    train_mean = train_real_columns.mean()
    train_std = train_real_columns.std()
    test_real_columns = (test_real_columns - train_mean) / train_std
    train_real_columns = (train_real_columns - train_mean) / train_std

    # categorical features: encode train and test together, then split back
    train_set_len = len(train_data)
    cat_features = pd.concat([train_data[cat_columns], test_data[cat_columns]])
    # fill NaN
    cat_features = cat_features.fillna("Unknown")
    # one-hot encoding
    cat_features = pd.get_dummies(cat_features)
    # split back by position: the first train_set_len rows are the train rows
    train_cat_columns = cat_features.iloc[:train_set_len]
    test_cat_columns = cat_features.iloc[train_set_len:]

    # datetime features
    # TODO

    X_train = pd.concat([train_bool_columns, train_real_columns, train_cat_columns], axis=1)
    X_test = pd.concat([test_bool_columns, test_real_columns, test_cat_columns], axis=1)
    return X_train, X_test, y_train

In [None]:
# Build the feature matrices for both sets and the training target.
X_train, X_test, y_train = preproc_data(train_df, test_df)

In [None]:
# Sanity check: feature-matrix shapes and the number of training labels.
X_train.shape, X_test.shape, len(y_train)

In [None]:
# Frequency of each target value in the training set.
Counter(y_train)

In [None]:
# Preview only the first rows: the engineered matrix is very wide.
X_train.head()

**Models:**

In [None]:
# Seed passed as random_state to the LightGBM models below.
RANDOM_SEED = 422

In [None]:
# Hyper-parameter grid explored for the LightGBM classifier.
params_grid = {
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    # LightGBM applies `subsample` (bagging_fraction) only when bagging is
    # enabled through a non-zero `subsample_freq` (bagging_freq). With the
    # default of 0 the three `subsample` values would give identical fits.
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [4, 6, 8, 10, 12, 14],
    'n_estimators': [100, 200, 500, 1000]
}

model = lgb.LGBMClassifier(random_state=RANDOM_SEED)

# Exhaustive 5-fold cross-validated search, scored by negative log loss.
# With the default refit=True the best configuration is retrained on all of
# X_train and exposed as grid_cv.best_estimator_.
grid_cv = GridSearchCV(model, params_grid, cv=5, scoring="neg_log_loss")
grid_cv.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination and its mean cross-validated score (negative log loss).
grid_cv.best_params_, grid_cv.best_score_

In [None]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training set, so reuse that estimator instead of
# fitting an identically configured model a second time.
model = grid_cv.best_estimator_

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (presumably the positive class of a 0/1 target -- verify).
preds = model.predict_proba(X_test)[:, 1]

In [None]:
# Plot the 20 most important features of the fitted model.
plot_importance(model, max_num_features=20)

In [None]:
# Assemble the submission (contract id + predicted probability) and save it as CSV.
df = pd.DataFrame({
    'id_contract': test_df['id_contract'].values,
    'TARGET': preds,
})
df.to_csv('../submissions/subm_01.csv', sep=',', index=False)