In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import copy
import warnings
import joblib
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import datatable as dt
import gc
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
warnings.filterwarnings(action='ignore', category=UserWarning)
import os
import time
import random

# Custom Metrics

In [None]:
from sklearn.metrics import mean_squared_error
def my_metrics(y_true, y_pred):
    """Return the root-mean-squared error (RMSE) between targets and predictions.

    The value is computed directly with NumPy instead of calling
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and has since been removed, so the old call
    fails on current releases.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values. Both are flattened before the
        comparison, so a column vector and a flat vector holding the same
        number of values are compared element by element.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not hold the same number of values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values, "
            f"got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

# Config

In [None]:
# Cross-validation / training configuration shared by the tuning and fit cells.
folds = 7                        # number of KFold splits
seed_list = list(range(20, 21))  # seeds used for both KFold shuffling and the model
early_stopping = 200             # early-stopping patience, in boosting rounds

# Data

In [None]:
# Read the training table with datatable and convert it straight to pandas;
# the intermediate datatable frame is not kept around.
train = dt.fread("../input/tabular-playground-series-aug-2021/train.csv").to_pandas()

# Test features used at inference time (the `id` column is not a feature).
to_test = pd.read_csv(
    "../input/tabular-playground-series-aug-2021/test.csv"
).drop(columns = "id")

# Pseudo Labeling

The pseudo-label files are taken from the following public notebooks:
* https://www.kaggle.com/aayush26/tps-aug-2021-simple-weighted-ensemble by Aayush Kumar Singha
* https://www.kaggle.com/alexryzhkov/lightautoml-classifier-regressor-mix by Alexander Ryzhkov
* https://www.kaggle.com/somayyehgholami/3-tps-aug-21-results-rmse-evaluation by Somayyeh Gholami and Mehran Kazeminia

In [None]:
# Pseudo labeling: attach externally predicted `loss` values to the test rows
# and append those rows to the training data.
fake = pd.read_csv("../input/tps08-nn-file/7.85017.csv")
test = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")

if "id" in fake.columns:
    # Pair labels with rows by `id` rather than by row position, so a
    # re-ordered or partial prediction file cannot silently mislabel rows.
    test = test.merge(fake[["id", "loss"]], on = "id", how = "left", validate = "one_to_one")
else:
    # No key to join on: fall back to positional pairing, but only when the
    # row counts agree.
    if len(fake) != len(test):
        raise ValueError(
            f"pseudo-label file has {len(fake)} rows but test.csv has {len(test)}"
        )
    test["loss"] = fake["loss"].to_numpy()

# A missing label means the prediction file does not cover every test row.
if test["loss"].isna().any():
    raise ValueError("pseudo-label file does not provide a loss for every test row")

# NOTE: re-running this cell appends the test rows to `train` again.
train = pd.concat([train, test]).reset_index(drop = True)

In [None]:
# Target column and feature matrix (everything except the row id and the target).
y = train["loss"]
X = train.drop(columns = ["id", "loss"])

# Hyperparameter Optimization

In [None]:
def objective(trial , X = X , y = y):
    """Optuna objective: cross-validated RMSE of a LightGBM regressor.

    For every seed in ``seed_list`` the data is split into ``folds`` shuffled
    KFold partitions; a model is trained on each training part with early
    stopping monitored on the held-out part, and the held-out RMSE is averaged
    over all folds and seeds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : pandas.DataFrame
        Feature matrix. Defaults to the notebook-level ``X`` as it exists when
        this function is defined.
    y : pandas.Series
        Target values. Defaults to the notebook-level ``y`` as it exists when
        this function is defined.

    Returns
    -------
    float
        Mean validation RMSE over all folds and seeds (lower is better).
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; `log=True` keeps the log-uniform sampling.
    params = {
        'reg_alpha' : trial.suggest_float('reg_alpha' , 0.47 , 0.5, log = True),
        'reg_lambda' : trial.suggest_float('reg_lambda' , 0.32 , 0.33, log = True),
        'num_leaves' : trial.suggest_int('num_leaves' , 50 , 70),
        'learning_rate' : trial.suggest_float('learning_rate' , 0.03 , 0.04),
        'max_depth' : trial.suggest_int('max_depth', 30 , 40),
        'n_estimators' : trial.suggest_int('n_estimators', 100 , 6100),
        'min_child_weight' : trial.suggest_float('min_child_weight', 0.015 , 0.02, log = True),
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect when a bagging frequency (`subsample_freq`) > 0 is set; none
        # is set here, so this dimension may be inert -- confirm.
        'subsample' : trial.suggest_float('subsample' , 0.7 , 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.52 , 1, log = True),
        'min_child_samples' : trial.suggest_int('min_child_samples', 76, 80),
        'metric' : 'rmse',
        'device_type' : 'gpu',
    }
    score = 0
    for seed in seed_list:
        kf = KFold(n_splits = folds, random_state = seed, shuffle = True)
        for idx_train, idx_test in kf.split(X, y):
            X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
            y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
            model = lgb.LGBMRegressor(**params, random_state = seed, n_jobs = -1)
            # Early stopping is requested through a callback: the
            # `early_stopping_rounds` / `verbose` keywords of `fit()` were
            # removed in LightGBM 4.0.
            model.fit(
                X_train, y_train.values.ravel(),
                eval_set = [(X_test, y_test.values.ravel())],
                eval_metric = 'rmse',
                callbacks = [lgb.early_stopping(stopping_rounds = early_stopping, verbose = False)],
            )
            y_pred = model.predict(X_test)
            # Each fold contributes an equal share of the overall mean.
            score += (my_metrics(y_test.values.ravel(), y_pred) / folds) / len(seed_list)
    return score
import optuna

# Search for the hyperparameters that minimise the value returned by
# `objective`; no pruner is configured for this study.
study = optuna.create_study(study_name = 'lgbm', direction = 'minimize')
study.optimize(objective, n_trials = 14)

# Report the outcome of the search.
finished_trials = len(study.trials)
print('numbers of the finished trials:', finished_trials)
print('the best params:', study.best_trial.params)
print('the best value:', study.best_value)
print("done")

In [None]:
# Hyperparameters used for the final fit.
# Recorded search result: best is trial 0 with value 6.203933623898198.
params = dict(
    reg_alpha = 0.4844428103930398,
    reg_lambda = 0.3212929260836738,
    num_leaves = 52,
    learning_rate = 0.033482626431611054,
    max_depth = 37,
    n_estimators = 4040,
    min_child_weight = 0.017690546165775478,
    subsample = 0.836321023865464,
    colsample_bytree = 0.8998887705274462,
    min_child_samples = 80,
)

# Fit

In [None]:
# Train one LightGBM model per (seed, fold), save each model to disk, and
# accumulate the mean validation RMSE and the per-model feature importances.
show_feature_importance = 1   # collect feature importances for the plots below
show_time = 1                 # print per-fold timing and a remaining-time estimate
score = 0
if show_feature_importance:
    features = X.columns
    # One frame per fitted model; concatenated once after the loop instead of
    # growing a DataFrame inside it.
    fold_importances = []
for seed in seed_list:
    kf = KFold(n_splits = folds, random_state = seed, shuffle = True)
    for count, (idx_train, idx_test) in enumerate(kf.split(X, y), start = 1):
        print("=" * 40)
        print("seed", seed)
        print("fold", count)
        print("=" * 30)
        if show_time:
            start_time = time.time()
        X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
        y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
        model = lgb.LGBMRegressor(**params, random_state = seed, n_jobs = -1, metric = 'rmse', device_type = 'gpu')
        # Early stopping is requested through a callback: the
        # `early_stopping_rounds` / `verbose` keywords of `fit()` were removed
        # in LightGBM 4.0.
        model.fit(
            X_train, y_train,
            eval_set = [(X_test, y_test.values.ravel())],
            eval_metric = 'rmse',
            callbacks = [lgb.early_stopping(stopping_rounds = early_stopping, verbose = False)],
        )
        cv_score = my_metrics(y_test.values.ravel(), model.predict(X_test))
        score += (cv_score / folds) / len(seed_list)
        # The inference cell loads every '*.pkl' it finds in the working directory.
        joblib.dump(model, f'LGBM seed_{seed}_fold_{count}_cv_score_{round(cv_score, 5)}.pkl') # save model
        if show_feature_importance:
            fold_importances.append(pd.DataFrame({
                'Feature': features,
                'Importance': model.feature_importances_,
                'fold': count,
            }))
        if show_time:
            end_time = time.time()
            run_time = round(end_time - start_time)
            print ("fold", count, "took", run_time , "seconds to run")
            # Rough estimate: assumes the remaining folds take as long as this one.
            print ("The estimated remaining training time in the current seed", seed, "is",\
                   round(((folds - count) * run_time) / 60, 3), "minutes")
        print("Validation score", cv_score)
if show_feature_importance:
    if fold_importances:
        features_importance = pd.concat(fold_importances, axis = 0)
    else:
        # No model was fitted (empty seed list): keep an empty table with the expected columns.
        features_importance = pd.DataFrame({'Feature': [], 'Importance': []})
# The metric is RMSE (see `my_metrics`), not RMSPE.
print("Mean RMSE validation score of", folds, "folds", score)


# Feature Importance

In [None]:
if show_feature_importance:
    import seaborn as sns
    from matplotlib import pyplot as plt

    feature_importance_df_ = features_importance
    # Mean importance per feature over all fitted models, sorted ascending,
    # so the first ten rows are the ten LEAST important features.
    lowest_ten = (
        feature_importance_df_[["Feature", "Importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="Importance", ascending=True)
        [:10]
    )
    cols = lowest_ten.index
    best_features = lowest_ten.reset_index()
    print(best_features.dtypes)

    plt.figure(figsize=(10, 4))
    sns.barplot(data=best_features, x="Importance", y="Feature")
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

In [None]:
if show_feature_importance:
    feature_importance_df_ = features_importance
    # Mean importance per feature over all fitted models, sorted descending,
    # so the first ten rows are the ten MOST important features.
    highest_ten = (
        feature_importance_df_[["Feature", "Importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="Importance", ascending=False)
        [:10]
    )
    cols = highest_ten.index
    best_features = highest_ten.reset_index()
    print(best_features.dtypes)

    # Relies on `plt` and `sns` having been imported by the previous cell.
    plt.figure(figsize=(10, 4))
    sns.barplot(data=best_features, x="Importance", y="Feature")
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

# Infer

In [None]:
# Average the test-set predictions of every model saved in the working directory.
# Recorded result of an earlier run: mean RMSE validation score of 7 folds 6.203931071616227
model_paths = sorted(glob.glob('./*.pkl'))  # sorted so the averaging order is deterministic
if not model_paths:
    # Without this guard the average below fails with an opaque ZeroDivisionError.
    raise FileNotFoundError("no '*.pkl' model files found in the working directory; run the Fit cell first")

output = []
for filepath in model_paths:
    model = joblib.load(filepath)
    # Predict with the iteration count selected by early stopping.
    output.append(model.predict(to_test, num_iteration = model.best_iteration_))
    del model  # release the model before loading the next one
y_pred = sum(output) / len(output)

In [None]:
# Start from the sample submission and overwrite its `loss` column with the
# averaged model predictions.
final_pred = pd.read_csv(
    "../input/tabular-playground-series-aug-2021/sample_submission.csv"
)
final_pred["loss"] = y_pred
final_pred

In [None]:
# Write the submission file without the DataFrame index column.
final_pred.to_csv(path_or_buf = 'submission.csv', index = False)