In [None]:
import copy
import gc
import glob
import math
import os
import time
import warnings

import datatable as dt
import joblib
import lightgbm as lgb
import numpy as np # linear algebra
import optuna
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from tqdm import tqdm
def my_metrics(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE) of a prediction.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))`` with
    vectorised NumPy operations instead of a per-element Python loop; this
    function is called on every boosting round through ``rmsle``, so the loop
    was a needless hot spot.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed target values.
    y_pred : array-like of numbers
        Predicted values, same number of elements as ``y_true``.

    Returns
    -------
    float
        The RMSLE over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty, have different numbers of elements, or contain
        a value <= -1 (the logarithm of ``value + 1`` is undefined there).
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    # A length mismatch used to be silently accepted when y_true was the longer
    # input (the sum ran over y_pred but was divided by len(y_true)).
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("RMSLE is undefined for empty input")
    # Keep the ValueError the math.log based version raised for such values,
    # rather than letting NumPy return nan/-inf with only a warning.
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("math domain error: RMSLE requires every value to be greater than -1")
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))
def rmsle(y_true, y_pred):
    """Custom evaluation metric passed to ``LGBMRegressor.fit(eval_metric=...)``.

    Returns the tuple ``(metric_name, metric_value, is_higher_better)``; the
    value is computed by ``my_metrics`` and lower values are better.
    """
    return 'rmsle', my_metrics(y_true, y_pred), False
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The ``squared=False`` argument of ``mean_squared_error`` is deprecated in
    scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here.
    Taking the root per output column and then averaging reproduces what
    ``squared=False`` returned (for 1-D targets this is simply ``sqrt(MSE)``).

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        The RMSE.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))
# Silence every UserWarning for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)

In [None]:
# Cross-validation settings shared by the tuning and training cells.
folds = 5                        # number of KFold splits
seed_list = list(range(14, 15))  # seeds the whole CV is repeated with (currently just 14)
early_stopping = 200             # passed to LightGBM fit() as early_stopping_rounds

In [None]:
# Read the competition files and stack them so the timestamp feature is built
# once for both train and test.
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
all_data = pd.concat([train, test])

# Turn `date_time` into a number of hours. The calendar is approximated with
# 365-day years and 30-day months, and the result is then expressed as the
# distance from the fixed reference point (2021 * 365 + 7 * 30 + 1) * 24.
all_data['date_time'] = pd.to_datetime(all_data['date_time'])
all_data['date_time'] = 24 * (
    all_data['date_time'].dt.year * 365
    + all_data['date_time'].dt.month * 30
    + all_data['date_time'].dt.day
    + all_data['date_time'].dt.hour / 24
)
all_data['date_time'] = (2021 * 365 + 7 * 30 + 1) * 24 - all_data['date_time']

# Split the stacked frame back into its train and test parts by position.
n_train_rows = len(train)
train = all_data.iloc[:n_train_rows]
test = all_data.iloc[n_train_rows:]

# Separate the three target columns from the features.
target = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
X = train.drop(columns=target)
y = train[target]
to_test = test.drop(columns=target)

In [None]:
# Keep an independent copy of the full target frame; the training cells rebind
# `y` to a single column selected from this copy.
y_saved = y.copy()

In [None]:
def objective(trial , X = X , y = y_saved.iloc[:,0]):
    """Optuna objective for the first target column: cross-validated RMSLE.

    Parameters
    ----------
    trial : object exposing ``suggest_loguniform`` / ``suggest_uniform`` / ``suggest_int``
        Used to sample one LightGBM hyper-parameter set.
    X : DataFrame
        Feature matrix (defaults to the notebook-level ``X``).
    y : Series
        Target values (defaults to column 0 of ``y_saved``).

    Returns
    -------
    float
        Validation RMSLE (``my_metrics``) averaged over every fold of every
        seed in ``seed_list``.
    """
    # The suggest_* calls are kept in this exact order.
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero, which is not set here —
    # confirm whether tuning `subsample` changes anything.
    params = {
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.04, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 3, 9),
        'num_leaves': trial.suggest_int('num_leaves', 40, 80),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.05),
        'max_depth': trial.suggest_int('max_depth', 30, 60),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.04, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 60),
        'metric': 'rmse',
        'device_type': 'gpu',
    }
    score = 0
    for seed in seed_list:
        splitter = KFold(n_splits=folds, random_state=seed, shuffle=True)
        for fit_idx, val_idx in splitter.split(X, y):
            X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
            y_fit = y.iloc[fit_idx].values.ravel()
            y_val = y.iloc[val_idx].values.ravel()
            model = lgb.LGBMRegressor(**params, random_state=seed, n_jobs=-1)
            model.fit(
                X_fit,
                y_fit,
                eval_set=[(X_val, y_val)],
                eval_metric=rmsle,
                early_stopping_rounds=early_stopping,
                verbose=300,
            )
            fold_rmsle = my_metrics(y_val, model.predict(X_val))
            # Each fold contributes its share of the mean over folds and seeds.
            score += (fold_rmsle / folds) / len(seed_list)
    # Drop the reference to the last fitted model before returning.
    del model
    return score
# Run the hyper-parameter search with the `objective` defined directly above
# (first target column). `optuna` is imported once in the notebook's import
# cell rather than re-imported in every tuning cell.
study = optuna.create_study(direction='minimize', study_name='lgbm')
study.optimize(objective, n_trials=100)

# Report the outcome of the search.
print('numbers of the finished trials:' , len(study.trials))
print('the best params:' , study.best_trial.params)
print('the best value:' , study.best_value)
print("done")

In [None]:
# LightGBM hyper-parameters used by the training cell for target column 0.
# Best is trial 26 with value: 0.10700507123161773.
params = {
    'reg_alpha': 0.9983348199602602,
    'reg_lambda': 3.7977035151026133,
    'num_leaves': 43,
    'learning_rate': 0.018110216441437378,
    'max_depth': 43,
    'n_estimators': 4465,
    'min_child_weight': 0.08971736720140458,
    'subsample': 0.8077831485462772,
    'colsample_bytree': 0.5101790470372234,
    'min_child_samples': 33,
}

In [None]:
# Train one LightGBM model per (seed, fold) for target column 0, score it with
# RMSLE on the held-out fold and save it to disk with joblib.
y = y_saved.iloc[:,0]
score = 0
for seed in seed_list:
    kf = KFold(n_splits = folds ,random_state= seed, shuffle=True)
    # `count` is the 1-based number of the fold currently being trained.
    for count, (idx_train, idx_test) in enumerate(kf.split(X, y), start=1):
        print("=" * 40)
        print("seed", seed)
        print("fold", count)
        print("=" * 30)
        start_time = time.time()
        X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
        y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
        model = lgb.LGBMRegressor(**params, random_state = seed, n_jobs = -1, metric = 'rmse', device_type = 'gpu')
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
        # removed in LightGBM 4.0; this cell presumably targets LightGBM 3.x — confirm.
        model.fit(X_train, y_train, eval_set = [(X_test , y_test.values.ravel())], eval_metric = rmsle,
                  early_stopping_rounds = early_stopping, verbose = False)
        cv_score = my_metrics(y_test.values.ravel(), model.predict(X_test))
        # Each fold contributes its share of the mean over folds and seeds.
        score += (cv_score / folds) / len(seed_list)
        joblib.dump(model, f'target 0 seed_{seed}_fold_{count}_cv_score_{round(cv_score, 3)}.pkl') # save model
        run_time = round(time.time() - start_time)
        print ("fold", count, "took", run_time , "seconds to run")
        # After fold `count` has finished, `folds - count` folds are left. The
        # previous version incremented the counter first, so the estimate was one
        # fold short and became negative after the last fold.
        print ("The estimated remaining training time in the current seed", seed, "are",
               round(((folds - count) * run_time) / 60, 3), "minutes")
        print("Validation score", cv_score)
# The metric computed above is RMSLE (my_metrics), not RMSPE.
print("Mean RMSLE validation score of", folds, "folds", score)

In [None]:
def objective(trial , X = X , y = y_saved.iloc[:, 1]):
    """Optuna objective for the second target column: cross-validated RMSLE.

    Parameters
    ----------
    trial : object exposing ``suggest_loguniform`` / ``suggest_uniform`` / ``suggest_int``
        Used to sample one LightGBM hyper-parameter set.
    X : DataFrame
        Feature matrix (defaults to the notebook-level ``X``).
    y : Series
        Target values (defaults to column 1 of ``y_saved``).

    Returns
    -------
    float
        Validation RMSLE (``my_metrics``) averaged over every fold of every
        seed in ``seed_list``.
    """
    # The suggest_* calls are kept in this exact order.
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero, which is not set here —
    # confirm whether tuning `subsample` changes anything.
    params = {
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.04, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 3, 9),
        'num_leaves': trial.suggest_int('num_leaves', 40, 80),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.05),
        'max_depth': trial.suggest_int('max_depth', 30, 60),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.04, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 60),
        'metric': 'rmse',
        'device_type': 'gpu',
    }
    score = 0
    for seed in seed_list:
        splitter = KFold(n_splits=folds, random_state=seed, shuffle=True)
        for fit_idx, val_idx in splitter.split(X, y):
            X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
            y_fit = y.iloc[fit_idx].values.ravel()
            y_val = y.iloc[val_idx].values.ravel()
            model = lgb.LGBMRegressor(**params, random_state=seed, n_jobs=-1)
            model.fit(
                X_fit,
                y_fit,
                eval_set=[(X_val, y_val)],
                eval_metric=rmsle,
                early_stopping_rounds=early_stopping,
                verbose=300,
            )
            fold_rmsle = my_metrics(y_val, model.predict(X_val))
            # Each fold contributes its share of the mean over folds and seeds.
            score += (fold_rmsle / folds) / len(seed_list)
    # Drop the reference to the last fitted model before returning.
    del model
    return score
# Run the hyper-parameter search with the `objective` defined directly above
# (second target column). `optuna` is imported once in the notebook's import
# cell rather than re-imported in every tuning cell.
study = optuna.create_study(direction='minimize', study_name='lgbm')
study.optimize(objective, n_trials=33)

# Report the outcome of the search.
print('numbers of the finished trials:' , len(study.trials))
print('the best params:' , study.best_trial.params)
print('the best value:' , study.best_value)
print("done")

In [None]:
# LightGBM hyper-parameters used by the training cell for target column 1.
# Best is trial 29 with value: 0.08656920519236028.
params2 = {
    'reg_alpha': 0.08954418283105867,
    'reg_lambda': 4.34037322003328,
    'num_leaves': 69,
    'learning_rate': 0.01605276385426573,
    'max_depth': 42,
    'n_estimators': 6484,
    'min_child_weight': 0.05758104491837273,
    'subsample': 0.769111408198851,
    'colsample_bytree': 0.6678960189167248,
    'min_child_samples': 30,
}

In [None]:
# Train one LightGBM model per (seed, fold) for target column 1, score it with
# RMSLE on the held-out fold and save it to disk with joblib.
y = y_saved.iloc[:, 1]
score = 0
for seed in seed_list:
    kf = KFold(n_splits = folds ,random_state= seed, shuffle=True)
    # `count` is the 1-based number of the fold currently being trained.
    for count, (idx_train, idx_test) in enumerate(kf.split(X, y), start=1):
        print("=" * 40)
        print("seed", seed)
        print("fold", count)
        print("=" * 30)
        start_time = time.time()
        X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
        y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
        model = lgb.LGBMRegressor(**params2, random_state = seed, n_jobs = -1, metric = 'rmse', device_type = 'gpu')
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
        # removed in LightGBM 4.0; this cell presumably targets LightGBM 3.x — confirm.
        model.fit(X_train, y_train, eval_set = [(X_test , y_test.values.ravel())], eval_metric = rmsle,
                  early_stopping_rounds = early_stopping, verbose = False)
        cv_score = my_metrics(y_test.values.ravel(), model.predict(X_test))
        # Each fold contributes its share of the mean over folds and seeds.
        score += (cv_score / folds) / len(seed_list)
        joblib.dump(model, f'target 1 seed_{seed}_fold_{count}_cv_score_{round(cv_score, 3)}.pkl') # save model
        run_time = round(time.time() - start_time)
        print ("fold", count, "took", run_time , "seconds to run")
        # After fold `count` has finished, `folds - count` folds are left. The
        # previous version incremented the counter first, so the estimate was one
        # fold short and became negative after the last fold.
        print ("The estimated remaining training time in the current seed", seed, "are",
               round(((folds - count) * run_time) / 60, 3), "minutes")
        print("Validation score", cv_score)
# The metric computed above is RMSLE (my_metrics), not RMSPE.
print("Mean RMSLE validation score of", folds, "folds", score)

In [None]:
def objective(trial , X = X , y = y_saved.iloc[:, 2]):
    """Optuna objective for the third target column: cross-validated RMSE.

    Unlike the objectives for the first two targets, this one evaluates and
    scores with plain RMSE instead of RMSLE.

    Parameters
    ----------
    trial : object exposing ``suggest_loguniform`` / ``suggest_uniform`` / ``suggest_int``
        Used to sample one LightGBM hyper-parameter set.
    X : DataFrame
        Feature matrix (defaults to the notebook-level ``X``).
    y : Series
        Target values (defaults to column 2 of ``y_saved``).

    Returns
    -------
    float
        Validation RMSE (``rmse``) averaged over every fold of every seed in
        ``seed_list``.
    """
    # The suggest_* calls are kept in this exact order.
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero, which is not set here —
    # confirm whether tuning `subsample` changes anything.
    params = {
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.04, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 3, 9),
        'num_leaves': trial.suggest_int('num_leaves', 40, 80),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.05),
        'max_depth': trial.suggest_int('max_depth', 30, 60),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.04, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 60),
        'metric': 'rmse',
        'device_type': 'gpu',
    }
    score = 0
    for seed in seed_list:
        splitter = KFold(n_splits=folds, random_state=seed, shuffle=True)
        for fit_idx, val_idx in splitter.split(X, y):
            X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
            y_fit = y.iloc[fit_idx].values.ravel()
            y_val = y.iloc[val_idx].values.ravel()
            model = lgb.LGBMRegressor(**params, random_state=seed, n_jobs=-1)
            model.fit(
                X_fit,
                y_fit,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                early_stopping_rounds=early_stopping,
                verbose=3000,
            )
            fold_rmse = rmse(y_val, model.predict(X_val))
            # Each fold contributes its share of the mean over folds and seeds.
            score += (fold_rmse / folds) / len(seed_list)
    # Drop the reference to the last fitted model before returning.
    del model
    return score
# Run the hyper-parameter search with the `objective` defined directly above
# (third target column). `optuna` is imported once in the notebook's import
# cell rather than re-imported in every tuning cell.
study = optuna.create_study(direction='minimize', study_name='lgbm')
study.optimize(objective, n_trials=33)

# Report the outcome of the search.
print('numbers of the finished trials:' , len(study.trials))
print('the best params:' , study.best_trial.params)
print('the best value:' , study.best_value)
print("done")

In [None]:
# LightGBM hyper-parameters used by the training cell for target column 2.
# the best value: 53.542532685728304   rmse
params3 = {
    'reg_alpha': 0.05950215618354974,
    'reg_lambda': 4.9053234921173186,
    'num_leaves': 53,
    'learning_rate': 0.013992638769108515,
    'max_depth': 41,
    'n_estimators': 4509,
    'min_child_weight': 0.08516375010639021,
    'subsample': 0.8804548923956849,
    'colsample_bytree': 0.6436210260217696,
    'min_child_samples': 31,
}

In [None]:
# Train one LightGBM model per (seed, fold) for target column 2, score it with
# RMSE on the held-out fold and save it to disk with joblib.
y = y_saved.iloc[:, 2]
score = 0
for seed in seed_list:
    kf = KFold(n_splits = folds, random_state= seed, shuffle=True)
    # `count` is the 1-based number of the fold currently being trained.
    for count, (idx_train, idx_test) in enumerate(kf.split(X, y), start=1):
        print("=" * 40)
        print("seed", seed)
        print("fold", count)
        print("=" * 30)
        start_time = time.time()
        X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
        y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
        model = lgb.LGBMRegressor(**params3, random_state = seed, n_jobs = -1, metric = 'rmse', device_type = 'gpu')
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
        # removed in LightGBM 4.0; this cell presumably targets LightGBM 3.x — confirm.
        model.fit(X_train, y_train, eval_set = [(X_test , y_test.values.ravel())], eval_metric = 'rmse',
                  early_stopping_rounds = early_stopping, verbose = False)
        cv_score = rmse(y_test.values.ravel(), model.predict(X_test))
        # Each fold contributes its share of the mean over folds and seeds.
        score += (cv_score / folds) / len(seed_list)
        joblib.dump(model, f'target 2 seed_{seed}_fold_{count}_cv_score_{round(cv_score, 3)}.pkl') # save model
        run_time = round(time.time() - start_time)
        print ("fold", count, "took", run_time , "seconds to run")
        # After fold `count` has finished, `folds - count` folds are left. The
        # previous version incremented the counter first, so the estimate was one
        # fold short and became negative after the last fold.
        print ("The estimated remaining training time in the current seed", seed, "are",
               round(((folds - count) * run_time) / 60, 3), "minutes")
        print("Validation score", cv_score)
# The metric computed above is RMSE (rmse), not RMSPE.
print("Mean RMSE validation score of", folds, "folds", score)

In [None]:
def _mean_prediction(pattern, features):
    """Average the predictions of every saved model whose path matches ``pattern``.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the ``.pkl`` model files of one target.
    features : DataFrame
        Rows to predict on.

    Returns
    -------
    array
        Element-wise mean of the per-model predictions.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern`` (previously a bare ZeroDivisionError).
    """
    fold_preds = []
    # NOTE: joblib.load unpickles arbitrary objects — only load model files
    # from a trusted source.
    for filepath in glob.iglob(pattern):
        model = joblib.load(filepath)
        fold_preds.append(model.predict(features, num_iteration = model.best_iteration_))
        del model  # release the model before loading the next one
    if not fold_preds:
        raise FileNotFoundError(f"no model files match {pattern!r}")
    return sum(fold_preds) / len(fold_preds)


# Each target gets its own, freshly collected list of predictions. The previous
# version did not reset the shared list before the third loop, so the target-2
# average also included every target-1 prediction.
# NOTE(review): the first directory is spelled 'models', the other two 'model' —
# confirm these match the names of the attached datasets.
p0 = _mean_prediction('../input/tps-july-models-target-0/*.pkl', to_test)
p1 = _mean_prediction('../input/tps-july-model-target-1/*.pkl', to_test)
p2 = _mean_prediction('../input/tps-july-model-target-2/*.pkl', to_test)

In [None]:
# Load the sample submission; its target columns are overwritten with the
# averaged predictions in the next cell.
output = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv")

In [None]:
# Write the averaged predictions into the matching submission columns.
label = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
for column, averaged in zip(label, (p0, p1, p2)):
    output[column] = averaged
# Overwrite the row with index label 0 with fixed values.
# NOTE(review): presumably a known value for that row — confirm where these numbers come from.
output.loc[0, label] = [1.4, 4.1, 186.5]

In [None]:
# Save the submission file without the DataFrame index column.
output.to_csv('submission.csv',index=False)