In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import numpy as np
import pandas as pd
from math import sqrt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [None]:
# Seed shared by the train/validation split and both regressors.
RANDOM_STATE: int = 2021
# Number of cross-validation folds used by the grid search.
CROSS_VALIDATION: int = 3

In [None]:
# Directory that holds the competition's train.csv and test.csv.
data_dir = "/kaggle/input/tabular-playground-series-jan-2021"

In [None]:
# Load the training data, index it by the 'id' column and let pandas pick the
# best-fitting dtypes. For a quick smoke run, chain `.sample(frac=0.01)` here.
train_csv = f"{data_dir}/train.csv"
df = pd.read_csv(train_csv)
df = df.set_index('id').convert_dtypes()
display(df.shape)
df.head(2)

**Split the data.**  
80% for training and 20% for validation.

In [None]:
# Separate the features from the 'target' column without touching `df`.
X = df.drop(columns='target')
y = df['target'].copy()

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
split_options = {
    'train_size': 0.8,
    'test_size': 0.2,
    'random_state': RANDOM_STATE,
}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

**Preprocessor pipeline.**  
**'imputer'**: fill NaN values with the column mean (the `SimpleImputer` default strategy).  
**'log'**: transform every feature with `log1p`, i.e. log(1 + x).  
**'scaler'**: standardize features with z = (x - mean) / std.  

In [None]:
# Preprocessing chain applied to every feature, in order:
# impute missing values -> log1p transform -> standardize.
preprocessing_steps = [
    ('imputer', SimpleImputer()),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

**Create a pipeline with the following steps.**  
**'preprocessor'**: the preprocessor pipeline defined above.  
**'variance_drop'**: removes all low-variance features.  
**'voting'**: a `passthrough` placeholder; the actual models are supplied later through the grid-search parameters.

In [None]:
# Variance cut-off written as p * (1 - p) with p = 0.95.
# NOTE(review): this step runs after the StandardScaler in `preprocessor`, so
# it presumably only removes (near-)constant columns - confirm this is intended.
variance_p = 0.95
variance_cutoff = variance_p * (1 - variance_p)

# Full modelling pipeline; 'voting' is a placeholder that the grid search
# replaces with the actual estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('variance_drop', VarianceThreshold(threshold=variance_cutoff)),
    ('voting', 'passthrough'),
]
pipeline = Pipeline(pipeline_steps)

**Prepare parameters for GridSearch**  
**'voting'**: voting regressor with `LGBM` and `XGB`.  

In [None]:
# For efficiency the grid holds a single candidate per hyper-parameter.
# The values here are only an example; widen the lists to run a real search.

# LightGBM settings (keys are prefixed with 'voting__lgbm__' below).
lgbm_grid = {
    'n_estimators': [2000],       # e.g. range(500, 3000, 1000)
    'max_depth': [12],            # e.g. range(4, 16, 4)
    'learning_rate': [0.01],
    'num_leaves': [256],
    'min_child_weight': [12],
    'feature_fraction': [0.4],    # e.g. np.arange(0.1, 1, 0.1)
    'bagging_fraction': [0.7],    # e.g. np.arange(0.1, 1, 0.1)
    'bagging_freq': [5],
    'min_child_samples': [32],
    'lambda_l1': [9],
    'lambda_l2': [0.13],
}

# XGBoost settings (keys are prefixed with 'voting__xgb__' below).
xgb_grid = {
    'n_estimators': [2000],       # e.g. range(500, 3000, 1000)
    'max_depth': [12],            # e.g. range(4, 16, 4)
    'learning_rate': [0.01],
    'alpha': [5],
    'gamma': [3],
    'lambda': [3],
    'subsample': [0.8],
    'colsample_bytree': [0.4],
}

# Ensemble that fills the pipeline's 'voting' placeholder.
voting_regressor = VotingRegressor([
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(random_state=RANDOM_STATE)),
])

parameters = [{
    'voting': [voting_regressor],
    **{f'voting__lgbm__{name}': values for name, values in lgbm_grid.items()},
    **{f'voting__xgb__{name}': values for name, values in xgb_grid.items()},
}]

In [None]:
# Total fits = number of parameter combinations x number of CV folds.
n_candidates = len(ParameterGrid(parameters))
total = n_candidates * CROSS_VALIDATION
display(f"Number of combination that will be run by the GridSearch: {total}")

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The root is taken explicitly instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, so this form works on both old and new versions.
    """
    return sqrt(mean_squared_error(y_true, y_pred))


# RMSE is an error (lower is better), so the scorer reports its negation.
custom_scoring = make_scorer(rmse, greater_is_better=False)

In [None]:
# Search configuration: custom RMSE scorer, CROSS_VALIDATION folds, every
# available core, and progress messages switched on.
search_options = dict(
    param_grid=parameters,
    cv=CROSS_VALIDATION,
    scoring=custom_scoring,
    n_jobs=-1,
    verbose=True,
)
grid_search = GridSearchCV(pipeline, **search_options)

In [None]:
# Run the cross-validated grid search on the training split only; the
# validation split stays untouched for the check further down.
grid_search.fit(X_train, y_train)

In [None]:
# The scorer was built with greater_is_better=False, so best_score_ is the
# negated RMSE; abs() shows it as a positive error next to the winning params.
best_rmse = abs(grid_search.best_score_)
display(best_rmse, grid_search.best_params_)

Check our model on the validation data.

In [None]:
# Score the best pipeline on the held-out validation split.
preds = grid_search.best_estimator_.predict(X_valid)
# Take the root explicitly: mean_squared_error's `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
sqrt(mean_squared_error(y_valid, preds))

# Submission

In [None]:
# Load the competition test set the same way as the training data:
# indexed by 'id', with pandas-inferred dtypes.
test_csv = f"{data_dir}/test.csv"
X_test = pd.read_csv(test_csv)
X_test = X_test.set_index('id').convert_dtypes()
display(X_test.shape)
X_test.head(2)

In [None]:
# Predict the test set with the best pipeline found by the grid search.
preds_test = grid_search.best_estimator_.predict(X_test)

# The identifier column is named 'id' (lowercase) in the input files, so the
# submission uses the same header rather than 'Id'.
# NOTE(review): confirm against the competition's sample submission.
output = pd.DataFrame({'id': X_test.index, 'target': preds_test})
output.to_csv("submission.csv", index=False)