In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import seaborn as sns


def run_lgbm(X_train, y_train, group_df, categorical_cols=None):
    """Train one LightGBM regressor per GroupKFold fold and collect OOF predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix. Rows are selected by position, so any index
        (not only a default RangeIndex) is accepted.
    y_train : array-like
        Targets aligned row-for-row with ``X_train``.
    group_df : array-like
        One group label per row, forwarded to ``GroupKFold.split`` as ``groups``.
    categorical_cols : list, optional
        Forwarded to ``lgb.Dataset`` as ``categorical_feature``.
        ``None`` (the default) is treated as an empty list.

    Returns
    -------
    oof_train : numpy.ndarray
        Array of length ``len(X_train)``; each validation row holds the
        prediction of the model trained on the other folds.
    models : list
        The boosters returned by ``lgb.train``, one per fold, in fold order.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    categorical_cols = [] if categorical_cols is None else list(categorical_cols)

    # GroupKFold.split yields *positional* indices, so the targets are indexed
    # as a plain array and the features with .iloc below.
    y_values = np.asarray(y_train)

    models = []
    oof_train = np.zeros((len(X_train),))
    cv = GroupKFold(n_splits=5)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 14,
        "max_depth": 4,
        "feature_fraction": 0.8,
        "subsample_freq": 1,
        "bagging_fraction": 0.7,
        "min_data_in_leaf": 10,
        "learning_rate": 0.2,
        "boosting": "gbdt",
        "lambda_l1": 0.4,
        "lambda_l2": 0.4,
        "verbosity": -1,
        "random_state": 42,
    }

    for train_index, valid_index in cv.split(X_train, groups=group_df):
        X_tr = X_train.iloc[train_index]
        X_val = X_train.iloc[valid_index]
        y_tr = y_values[train_index]
        y_val = y_values[valid_index]

        lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_cols)

        # reference=lgb_train ties the validation set to the training Dataset.
        lgb_eval = lgb.Dataset(
            X_val, y_val, reference=lgb_train, categorical_feature=categorical_cols
        )

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
        )

        # Predict with the early-stopped iteration rather than the last one.
        oof_train[valid_index] = model.predict(
            X_val, num_iteration=model.best_iteration
        )

        models.append(model)

    return oof_train, models


def visualize_importance(models, X_train):
    """Draw a boxen plot of per-fold feature importances for the top features.

    Parameters
    ----------
    models : sequence
        Trained models exposing ``feature_importance()``; one entry per fold.
    X_train : pandas.DataFrame
        Frame whose ``columns`` label the importance values of each model.

    Returns
    -------
    fig, ax
        The matplotlib figure and axes holding the plot.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    # Build one small frame per fold and concatenate once, instead of growing
    # a DataFrame inside the loop.
    per_fold_frames = [
        pd.DataFrame(
            {
                "feature_importance": model.feature_importance(),
                "column": X_train.columns,
                "fold": fold,
            }
        )
        for fold, model in enumerate(models, start=1)
    ]
    if not per_fold_frames:
        raise ValueError("models must contain at least one trained model")

    feature_importance_df = pd.concat(per_fold_frames, axis=0, ignore_index=True)

    # Rank features by importance summed over folds; keep at most the top 50.
    order = (
        feature_importance_df.groupby("column")[["feature_importance"]]
        .sum()
        .sort_values("feature_importance", ascending=False)
        .index[:50]
    )

    # Figure width grows with the number of plotted features (minimum 6).
    fig, ax = plt.subplots(figsize=(max(6, len(order) * 0.4), 7))
    sns.boxenplot(
        data=feature_importance_df,
        x="column",
        y="feature_importance",
        order=order,
        ax=ax,
        palette="viridis",
    )
    ax.tick_params(axis="x", rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [None]:
# Competition input directory and the number of training rows to read.
# Only the first N_TRAIN_ROWS rows of train.csv are loaded.
INPUT_DIR = "../input/ubiquant-market-prediction"
N_TRAIN_ROWS = 1000000

train = pd.read_csv(f"{INPUT_DIR}/train.csv", nrows=N_TRAIN_ROWS)
example_test = pd.read_csv(f"{INPUT_DIR}/example_test.csv")
example_sample_submission = pd.read_csv(f"{INPUT_DIR}/example_sample_submission.csv")
print(train.shape, example_test.shape, example_sample_submission.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the example test frame.
example_test.head()

In [None]:
# Preview the first rows of the example submission frame.
example_sample_submission.head()

In [None]:
# Separate the grouping key, the regression target, and the feature matrix.
# The target and the two identifier columns are excluded from the features.
group_df = train["time_id"]
y_train = train["target"].values
X_train = train.drop(columns=["target", "row_id", "time_id"])

In [None]:
# Train the cross-validated models, grouping folds by `group_df`; keeps the
# out-of-fold predictions and the list of fitted models.
oof_train, models = run_lgbm(X_train, y_train, group_df)

In [None]:
# Plot the per-fold feature importances of the trained models.
visualize_importance(models, X_train)

In [None]:
# Correlation between the true targets and the out-of-fold predictions,
# read from the 2x2 correlation matrix of the two columns.
(
    pd.DataFrame({"y_train": y_train, "oof_train": oof_train})
    .corr()
    .loc["oof_train", "y_train"]
)

In [None]:
import ubiquant


# Select exactly the columns the models were trained on, in the same order.
# Dropping only "row_id" would pass through any other column the test frame
# carries, which would not match the training feature matrix.
feature_cols = list(X_train.columns)

env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df[feature_cols]
    # One prediction vector per fold model, each at its early-stopped iteration.
    y_preds = [model.predict(X_test, num_iteration=model.best_iteration) for model in models]
    # Submit the unweighted mean of the fold predictions.
    sample_prediction_df["target"] = sum(y_preds) / len(y_preds)
    env.predict(sample_prediction_df)

In [None]:
# Inspect the last batch handled by the inference loop. `sample_prediction_df`
# is the loop variable left over from that loop, so it is undefined if the
# iterator yielded nothing.
sample_prediction_df.head()