In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold , train_test_split
import optuna
import lightgbm

from sklearn.metrics import mean_squared_error


In [None]:
# Directory that holds the competition CSV files. It is defined once so the
# location can be changed in a single place instead of in every read_csv call.
INPUT_DIR = '../input/tabular-playground-series-jan-2021'

# train.csv supplies the rows used for fitting, test.csv the rows to predict.
train_data = pd.read_csv(INPUT_DIR + '/train.csv')
test_data = pd.read_csv(INPUT_DIR + '/test.csv')

In [None]:
# Model inputs: every training column whose name contains the substring 'cont'.
features = [
    column_name
    for column_name in train_data.columns
    if 'cont' in column_name
]
features  # bare expression so the notebook displays the selected column names

In [None]:
# Model inputs for fitting and for prediction: both frames are restricted to
# the same list of feature columns.
X_train, X_test = train_data[features], test_data[features]

# Regression target, taken from the 'target' column of the training frame.
y_train = train_data['target']

In [None]:
def objective(trial , data = X_train , target = y_train):
    """Optuna objective: train one LightGBM regressor and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyper-parameters.
    data : feature table to split; defaults to ``X_train``.
    target : values to predict, aligned with ``data``; defaults to ``y_train``.

    Returns
    -------
    float
        RMSE on the 20 % hold-out split. The split uses a fixed seed, so every
        trial is scored on the same rows; those rows also drive early stopping.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    params = {
        # presumably requires a GPU-enabled LightGBM build -- confirm for the
        # environment this notebook runs in
        'device_type': 'gpu',
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'n_estimators': trial.suggest_int('n_estimators', 1, 9999),
        # The lower bound is strictly positive: a learning rate of 0 is not a
        # valid LightGBM setting, and values next to 0 cannot make progress.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),

        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 100000, 300000),
        'min_child_weight': trial.suggest_int('min_child_weight', 200, 300),
        # NOTE(review): LightGBM appears to apply row subsampling only when
        # subsample_freq > 0, which is not set here, so this value may have no
        # effect -- confirm and add subsample_freq if bagging is intended.
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-3, 1.0, log=True),
        # NOTE(review): the seed is searched like a hyper-parameter.
        'random_state': trial.suggest_categorical('random_state', [0, 42, 2021, 2077]),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0, log=True),
    }

    model = lightgbm.LGBMRegressor(**params)
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    model.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        callbacks=[lightgbm.early_stopping(stopping_rounds=100, verbose=False)],
    )

    preds = model.predict(valid_x)
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn releases
    # no longer accept; arguments are in the documented (y_true, y_pred) order.
    return float(np.sqrt(mean_squared_error(valid_y, preds)))

In [None]:
# Hyper-parameter search: 50 trials minimising the value returned by `objective`.
# The sampler is created explicitly with a fixed seed so that the sequence of
# suggested parameters does not change from one kernel restart to the next
# (as long as the objective values themselves are reproducible).
study = optuna.create_study(
    direction='minimize',
    study_name='lgbm',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print('numbers of the finished trials' , len(study.trials))
print('the best params' , study.best_trial.params)

In [None]:
# Inactive alternative hyper-parameter set (has no 'num_leaves'); kept commented out for reference only.
# params = {
#     'max_depth': 10, 'n_estimators': 4863, 'learning_rate': 0.03030992669902076, \
#     'subsample_for_bin': 107883, 'min_child_weight': 300, 'subsample': 0.17069387486784737, \
#     'colsample_bytree': 0.3783631621474773, 'random_state': 2021, 'reg_lambda': 0.0033926293553395927, \
#     'reg_alpha': 0.1665707566403335
# }

In [None]:
# Inactive alternative hyper-parameter set; kept commented out for reference only.
# params = {'max_depth': 18, 'num_leaves': 28, 'n_estimators': 4293, \
#           'learning_rate': 0.025975762359679577, 'subsample_for_bin': 116813, \
#           'min_child_weight': 291, 'subsample': 0.4512216060271974, \
#           'colsample_bytree': 0.34973077020310595, 'random_state': 2021, \
#           'reg_lambda': 0.02864697146131836, 'reg_alpha': 0.10800305942657414}

In [None]:
# Fixed hyper-parameters handed to lightgbm.LGBMRegressor in the
# cross-validation cell (one entry per line for easier diffing).
# NOTE(review): LightGBM appears to apply 'subsample' only when subsample_freq > 0,
# which is not set here -- confirm whether row subsampling was intended.
params = {
    'max_depth': 15,
    'num_leaves': 39,
    'n_estimators': 9791,
    'learning_rate': 0.00937327623882181,
    'subsample_for_bin': 286933,
    'min_child_weight': 298,
    'subsample': 0.7651822433195676,
    'colsample_bytree': 0.44027774911713297,
    'random_state': 42,
    'reg_lambda': 0.005316986016660236,
    'reg_alpha': 0.0036773736709882756,
}

In [None]:
# Short aliases used by the cross-validation cell: the training inputs, the
# training target, and the complete test frame (all columns, not only features).
data, target, test = X_train, y_train, test_data

In [None]:
# 20-fold cross-validation with the fixed `params`: every fold trains on the
# training indices, is early-stopped and scored on its validation indices, and
# contributes an equal share to the averaged test prediction.
kf = KFold(n_splits=20, random_state=42, shuffle=True)

test_features = test[features]   # loop-invariant, so selected once instead of per fold
preds = np.zeros(test.shape[0])  # running average of the per-fold test predictions
rmse = []                        # validation RMSE of each fold, in fold order

for fold, (trn_idx, val_idx) in enumerate(kf.split(data, target), start=1):
    train_x, train_y = data.iloc[trn_idx], target.iloc[trn_idx]
    val_x, val_y = data.iloc[val_idx], target.iloc[val_idx]

    model = lightgbm.LGBMRegressor(**params)
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    model.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        callbacks=[lightgbm.early_stopping(stopping_rounds=100, verbose=False)],
    )

    # Dividing by the number of folds makes the accumulated sum the fold mean.
    preds += model.predict(test_features) / kf.n_splits

    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn releases
    # no longer accept; arguments are in the documented (y_true, y_pred) order.
    fold_rmse = np.sqrt(mean_squared_error(val_y, model.predict(val_x)))
    rmse.append(fold_rmse)
    print(fold, fold_rmse)

In [None]:
# Load the sample submission file; its 'target' column is overwritten with the
# model predictions before the file is written back out.
sub = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jan-2021/sample_submission.csv'
)

In [None]:
# Put the averaged fold predictions into the 'target' column, write the
# submission file without the index column, and preview the first ten rows.
sub = sub.assign(target=preds)
sub.to_csv(path_or_buf='lgbm3.csv', index=False)
sub.head(n=10)

In [None]:
# Second submission frame: the ids of the test rows paired with the constant
# prediction 8 for every row (presumably a constant-value baseline -- confirm).
output = pd.DataFrame({"id" : test["id"]})
output["target"] = 8
output.head()

In [None]:
# Write the constant-prediction frame to disk, leaving out the index column.
output.to_csv(path_or_buf='quanshi8.csv', index=False)