In [None]:
from collections import Counter
from typing import Tuple, List

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, plot_importance, early_stopping, log_evaluation
from dateparser import parse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import SCORERS 
import matplotlib.pyplot as plt
# import optuna.integration.lightgbm as lgb
import optuna

In [None]:
# Relative locations of the test and train CSV files read by the loading cell.
TEST_PATH = "../data/test.csv"
TRAIN_PATH = "../data/train.csv"

In [None]:
# Both files are read with ';' as the field separator.
train_df = pd.read_csv(TRAIN_PATH, sep=";")
test_df = pd.read_csv(TEST_PATH, sep=";")
# Quick sanity check of the loaded sizes: (rows, columns) of train, then test.
print(train_df.shape, test_df.shape)

In [None]:
# Summary statistics of the training frame (pandas `describe` with default arguments).
train_df.describe()

**Feature engineering**

In [None]:
# Count how many columns of each dtype the train and test frames contain.
Counter(train_df.dtypes), Counter(test_df.dtypes)

In [None]:
# Column groups consumed by the preprocessing function.
# Every feature column ends up in exactly one of: bool / date / real / categorical.
target_column_name = "TARGET"
id_columns_names = ["id_contract", "id_client"]

# FLAG_DISQUALIFICATION and BIRTHDATE are not good features: they are dropped from every group
remove_columns_names = ["FLAG_DISQUALIFICATION", "BIRTHDATE"]

# float or int dtypes, but categorical features
hand_cat_columns_names = ["OKATO_FED", "OKATO_REGIONCODE"]


def _without(columns, excluded):
    """Return `columns` as an order-preserving, de-duplicated list minus `excluded`."""
    excluded = set(excluded)
    return [name for name in dict.fromkeys(columns) if name not in excluded]


bool_columns_names = _without(
    ["IP_flag", "FLAG_DISQUALIFICATION", "EGRPOINCLUDED"],
    remove_columns_names,
)
date_columns_names = _without(
    ["SIGN_DATE", "DATEFIRSTREG", "TAXREG_REGDATE", "TAXREGPAY_REGDATE", "BIRTHDATE"],
    remove_columns_names,
)

# Columns that must never be picked up by the dtype-based real / categorical selection,
# whatever dtype pandas inferred for them (e.g. string ids, an all-NaN date column).
_reserved_columns_names = (
    id_columns_names + bool_columns_names + date_columns_names
    + [target_column_name] + remove_columns_names
)

# numeric columns that are genuine real-valued features
real_columns_names = _without(
    test_df.select_dtypes(include=['int64', 'float64']).columns,
    _reserved_columns_names + hand_cat_columns_names,
)

# object columns plus the hand-picked numeric categoricals (OKVED_CODE needs preprocessing);
# _without also de-duplicates in case a hand-picked column is already of object dtype
cat_columns_names = _without(
    list(test_df.select_dtypes(include=['object']).columns) + hand_cat_columns_names,
    _reserved_columns_names,
)

In [None]:
def _add_pairwise_products(real_columns: pd.DataFrame) -> pd.DataFrame:
    """Append the product of every pair of columns (squares included) to `real_columns`.

    For column i and every column j >= i a new column named "<j>*<i>" is added.
    All new blocks are concatenated once at the end instead of growing the frame in a loop.
    """
    blocks = [real_columns]
    for i, column in enumerate(real_columns.columns):
        products = real_columns.iloc[:, i:].mul(real_columns[column], axis=0)
        products.columns = [x + "*" + column for x in real_columns.columns[i:]]
        blocks.append(products)
    return pd.concat(blocks, axis=1)


def _extract_years(date_columns: pd.DataFrame) -> pd.DataFrame:
    """Turn every date column into a float year taken from characters 5..8 of its text form.

    Values whose slice is empty or whitespace-only (this includes NaN/None, whose text
    form is shorter than six characters) become NaN.
    # NOTE(review): the [5:9] slice presumably targets dates like "01JAN2015:00:00:00" -- confirm the format.
    """
    def _to_year(value) -> float:
        year = str(value)[5:9]
        return float(year) if year.strip() else np.nan

    years = date_columns.copy()
    for date_column in years.columns:
        years[date_column] = years[date_column].apply(_to_year)
    return years.astype("float")


def preproc_data(train_data: pd.DataFrame, test_data: pd.DataFrame, fill_real_nan: bool = True,
                 enable_fe: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
        Build model-ready feature matrices from the raw train / test frames.

        Uses the module-level column groups (target_column_name, bool_columns_names,
        real_columns_names, cat_columns_names, date_columns_names). The input frames
        are not modified.

        args:
            train_data: raw training frame, must contain the target column
            test_data: raw test frame; the target column may be absent
            fill_real_nan: fill NaN in real columns with the train mean of that column
            enable_fe: add pairwise products of the real columns as extra features

        returns:
            X_train, X_test, y_train
    """
    # get target
    y_train = train_data[target_column_name].values

    # NOTE: an experiment with a per-client mean-target feature ("id_target") used to live
    # here as commented-out code; it is disabled and not part of the returned matrices.

    # use the train column order for the test frame; only shared columns are taken so that
    # a test frame without the target column does not raise KeyError
    shared_columns = [name for name in train_data.columns if name in test_data.columns]
    test_data = test_data[shared_columns]

    # boolean features: missing flag -> 0
    train_bool_columns = train_data[bool_columns_names].fillna(0).astype("int")
    test_bool_columns = test_data[bool_columns_names].fillna(0).astype("int")

    # real features
    train_real_columns = train_data[real_columns_names].copy()
    test_real_columns = test_data[real_columns_names].copy()
    # fill NaN (train statistics are used for both frames)
    if fill_real_nan:
        train_fill_values = train_real_columns.mean()
        test_real_columns = test_real_columns.fillna(train_fill_values)
        train_real_columns = train_real_columns.fillna(train_fill_values)
    # some feature generation...
    if enable_fe:
        test_real_columns = _add_pairwise_products(test_real_columns)
        train_real_columns = _add_pairwise_products(train_real_columns)
    # normalization with train statistics; a zero std (constant column) is replaced by 1
    # so that the column becomes 0 instead of NaN/inf
    train_mean = train_real_columns.mean()
    train_std = train_real_columns.std().replace(0, 1.0)
    test_real_columns = (test_real_columns - train_mean) / train_std
    train_real_columns = (train_real_columns - train_mean) / train_std

    # categorical features: train and test are encoded together so that both frames
    # get the same dummy columns
    train_set_len = len(train_real_columns)
    cat_columns = pd.concat([train_data[cat_columns_names], test_data[cat_columns_names]])
    # preprocessing for OKVED_CODE: keep only the part before the first dot;
    # missing values are left missing so that they are labelled "Unknown" below
    if "OKVED_CODE" in cat_columns.columns:
        cat_columns["OKVED_CODE"] = cat_columns["OKVED_CODE"].map(
            lambda x: str(x).split(".", maxsplit=1)[0], na_action="ignore")
    # fill NaN
    cat_columns = cat_columns.fillna("Unknown")
    # one-hot encoding; `columns` is passed explicitly because by default get_dummies
    # skips numeric-dtype columns, which would leave the numeric categoricals un-encoded
    cat_columns = pd.get_dummies(cat_columns, columns=list(cat_columns.columns))
    # split back into train / test parts
    train_cat_columns = cat_columns.iloc[:train_set_len]
    test_cat_columns = cat_columns.iloc[train_set_len:]

    # datetime features: year -> real feature
    train_date_columns = _extract_years(train_data[date_columns_names])
    test_date_columns = _extract_years(test_data[date_columns_names])

    X_train = pd.concat([train_bool_columns, train_real_columns,
                         train_cat_columns, train_date_columns], axis=1)
    X_test = pd.concat([test_bool_columns, test_real_columns,
                        test_cat_columns, test_date_columns], axis=1)
    return X_train, X_test, y_train

In [None]:
# Build the model inputs: NaN in real columns filled with train means, no generated product features.
X_train, X_test, y_train = preproc_data(
    train_df,
    test_df,
    fill_real_nan=True,
    enable_fe=False,
)

In [None]:
# Sanity check: both matrices should have the same number of columns, and y_train one label per train row.
X_train.shape, X_test.shape, len(y_train)

In [None]:
# Number of training rows per target value.
Counter(y_train)

In [None]:
# Preview the first rows of the preprocessed training matrix.
X_train.head()

**Models**

In [None]:
# Seed reused for the CV splitter and for every LightGBM model in this notebook.
RANDOM_SEED = 5478435

In [None]:
# Shuffled 5-fold splitter with a fixed seed; the tuning objective passes it as `cv`.
kfolds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
# NOTE(review): 1 is lower than optuna.logging.DEBUG, so presumably every Optuna log record is emitted -- confirm this is intended.
optuna.logging.set_verbosity(1)

In [None]:
def tune(objective, n_trials=100, seed=None):
    """Run an Optuna search that maximizes `objective` and return the best parameters.

    args:
        objective: callable taking an Optuna trial and returning the score to maximize
        n_trials: number of trials to run (default 100, the value previously hard-coded)
        seed: seed for the TPE sampler; when None the notebook-wide RANDOM_SEED is used,
            so a Restart & Run All repeats the same sequence of trials

    returns:
        dict with the best hyper-parameters found (study.best_params)
    """
    if seed is None:
        seed = RANDOM_SEED
    # TPE is Optuna's default sampler; it is created explicitly only to be able to seed it
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params

In [None]:
def LGBM_objective(trial):
    """Optuna objective: mean cross-validated negative log-loss of an LGBMClassifier.

    Five hyper-parameters are sampled from `trial` (boosting type, number of leaves,
    learning rate on a log scale, max depth, number of estimators). The classifier is
    scored on the notebook-level X_train / y_train with the shared `kfolds` splitter.
    feature_fraction, bagging_fraction, bagging_freq and min_child_samples are
    currently not part of the search.
    """
    # the dict is filled in the same order in which the parameters are requested from the trial
    sampled_params = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["dart", "goss", "gbdt"]),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "n_estimators": trial.suggest_int("n_estimators", 100, 800),
    }
    classifier = LGBMClassifier(**sampled_params, random_state=RANDOM_SEED)

    # neg_log_loss is "higher is better", which matches a maximizing study
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=kfolds, scoring="neg_log_loss")
    return fold_scores.mean()

In [None]:
# Expensive step: hyper-parameter search only. The classifier is constructed from
# LGBM_params in the following cell, so it can be rebuilt without repeating the search.
LGBM_params = tune(LGBM_objective)

In [None]:
# Final classifier configured with the tuned hyper-parameters and the notebook seed.
model = LGBMClassifier(**LGBM_params, random_state=RANDOM_SEED)

In [None]:
# Train on the full preprocessed training set.
model.fit(X_train, y_train)

In [None]:
# Top-20 feature importances of the fitted model. The figure size is passed to
# plot_importance directly instead of changing plt.rcParams for the whole notebook.
plot_importance(model, max_num_features=20, figsize=(13, 8))
plt.show()

In [None]:
# Keep the second predict_proba column -- presumably the probability of TARGET == 1 (i.e. model.classes_[1]); verify.
preds = model.predict_proba(X_test)[:, 1]

In [None]:
# Submission table: one row per test contract with its predicted probability,
# written as a comma-separated file without the index column.
df = pd.DataFrame(
    {
        "id_contract": test_df["id_contract"].values,
        "TARGET": preds,
    }
)
df.to_csv("../submissions/subm_01.csv", sep=",", index=False)