In [None]:
import gc
from itertools import cycle
import random

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import interp
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from optuna.integration import lightgbm as lgb
#import lightgbm as lgb

In [None]:
SEED = 42


def fix_seed(seed):
    """Seed the global random number generators used in this notebook.

    Parameters
    ----------
    seed : int
        Value passed to both ``random.seed`` and ``np.random.seed``.

    Returns
    -------
    None
    """
    # Same order as before: stdlib ``random`` first, then NumPy's legacy global RNG.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


fix_seed(SEED)

In [None]:
!ls ../input/tabular-playground-series-feb-2021

# Load data

In [None]:
# Directory holding the competition CSV files.
DATA = "../input/tabular-playground-series-feb-2021/"

# Read the three competition files in the same order as before:
# labelled training rows, unlabelled test rows, and the submission template.
train, test, sub = (
    pd.read_csv(DATA + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the test frame: column dtypes, non-null counts and memory usage.
test.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Feature engineering

In [None]:
# Stack train and test row-wise (train rows first) so the label encoders that
# follow are fitted on the categories present in either frame.
dataset = pd.concat(objs=[train, test], axis=0)

In [None]:
# Names of the categorical feature columns: cat0 .. cat9.
train_cat_cols = ["cat" + str(idx) for idx in range(10)]
# Names of the continuous feature columns: cont0 .. cont13.
train_num_col = ["cont" + str(idx) for idx in range(14)]

In [None]:
# Integer-encode every categorical column with an encoder fitted on train+test.
#
# The codes are computed from `dataset` (the concatenated copy, train rows
# first), not from `train[col]` / `test[col]`. This cell overwrites those
# columns in place, so reading from them would make a re-run call `transform`
# on already-encoded integers with an encoder fitted on the original values,
# which raises. Reading from the untouched copy keeps the cell safe to re-run.
n_train = len(train)
assert len(dataset) == n_train + len(test), "dataset must be train followed by test"

for col in train_cat_cols:
    le = LabelEncoder()
    codes = le.fit_transform(dataset[col])
    # `dataset` is train followed by test, so the split is positional.
    train[col] = codes[:n_train]
    test[col] = codes[n_train:]

In [None]:
# Feature matrix: every column of `train` that is a declared categorical or
# continuous feature, kept in the order the columns appear in `train`.
X = train[
    [name for name in train.columns
     if name in train_cat_cols or name in train_num_col]
]
# Regression target.
y = train["target"]

In [None]:
# Hold out 33% of the labelled rows for validation.
# NOTE: `X_test` / `y_test` are a validation split of `train`; the competition
# test set is the separate `test` frame.
#
# The split is taken from `train` rather than from `X` / `y`. This cell
# rebinds `X` and `y`, so splitting those names would re-split the already
# reduced training split on every re-run and silently shrink the data. With a
# fixed `random_state`, the result on a first run is the same as before.
X, X_test, y, y_test = (
    part.reset_index(drop=True)  # positional 0..n-1 index on every piece
    for part in train_test_split(
        train[[col for col in train.columns if col in (train_cat_cols + train_num_col)]],
        train["target"],
        test_size=0.33,
        random_state=SEED,
    )
)

# Train

In [None]:
# LightGBM training configuration: gradient-boosted trees with a regression
# objective, RMSE as the evaluation metric.
params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
)

In [None]:
# Options shared by both datasets: mark the `cat*` columns as categorical and
# keep the raw data attached to the Dataset object.
_dataset_opts = dict(categorical_feature=train_cat_cols, free_raw_data=False)

# Training split.
d_training = lgb.Dataset(X, label=y, **_dataset_opts)
# Held-out validation split (not the competition test set).
d_test = lgb.Dataset(X_test, label=y_test, **_dataset_opts)

In [None]:
# `lgb` is `optuna.integration.lightgbm` (see the import cell), so this call
# goes through Optuna's wrapper around LightGBM training rather than calling
# `lightgbm.train` directly.
# Up to 1000 boosting rounds, validated on the held-out split `d_test`;
# progress is logged every 25 rounds and training stops after 50 rounds
# without improvement on the validation metric.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments
# that newer LightGBM releases replaced with callbacks. Confirm the installed
# library versions before upgrading.
# NOTE(review): `SEED` is not forwarded to the tuner, so tuning trials may
# differ between runs. Confirm whether that is intended.
model = lgb.train(
    params,
    train_set=d_training,
    num_boost_round=1000,
    valid_sets=[d_test],
    verbose_eval=25,
    early_stopping_rounds=50,
)

In [None]:
def print_tuned_params(model):
    """Print a short report of a trained model between two separator lines.

    Parameters
    ----------
    model : object
        Any object exposing ``params``, ``best_iteration`` and ``best_score``
        attributes (here, the booster returned by ``lgb.train``).

    Returns
    -------
    None
        The report is written to standard output.
    """
    rule = "---------------------"
    print(rule)
    # One "<name>: <value>" line per attribute, in a fixed order.
    for field in ("params", "best_iteration", "best_score"):
        print(f"{field}:", getattr(model, field))
    print(rule)

# Report the trained model's parameters, best iteration and best validation score.
print_tuned_params(model)

In [None]:
# Plot the model's feature importances, limited to the top 15 features,
# on a 10x10 inch figure.
lgb.plot_importance(
    model,
    figsize=(10, 10),
    max_num_features=15,
)
plt.show()

# Inference

In [None]:
# Predict on the competition test set and write the submission file.
#
# The feature columns are taken from the training matrix `X` instead of being
# rebuilt by hand. `X` gets its column order from `train.columns`, so reusing
# `X.columns` guarantees the model sees the same features in the same order
# it was trained on, even if the feature lists are edited later.
pred = model.predict(test[X.columns])
sub["target"] = pred
sub.to_csv('sub.csv', index=False)