In [None]:
import numpy as np 
import pandas as pd 
import re
import sys, gc, os
from IPython.display import display


#import shap
#shap.initjs()
#import featuretools as ft

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, Normalizer, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer ,LabelEncoder,
                                   OneHotEncoder,OrdinalEncoder)

import lightgbm as lgb

from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error

import optuna

In [None]:
# Configuration: random seed shared by every stochastic step, and the dataset location.
SEED = 1991
DATA_DIR = '../input/tabular-playground-series-feb-2021'

# Raw competition files: labelled training rows, unlabelled test rows and the
# submission template whose 'target' column is filled in at the end of the notebook.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Rows whose target is at or below this value are removed from the training set.
TARGET_CUTOFF = 4

target = df_train['target']
to_drop = target[target <= TARGET_CUTOFF].index

# Build new objects instead of mutating in place, and reset the index after the
# row filter: the feature frame is later rebuilt from a NumPy array (fresh
# 0..n-1 index), so a gapped index on `target` would silently misalign any
# label-based operation between the two.
target = target.drop(to_drop).reset_index(drop=True)
df_train = (
    df_train
    .drop(columns=['id', 'target'])
    .drop(index=to_drop)
    .reset_index(drop=True)
)
df_test = df_test.drop(columns='id')

# Distribution of the remaining target values.
plt.figure(figsize=(10,5))
sns.histplot(target, color='slategray', stat='frequency');

In [None]:
# Column layout: the first ten columns are handled as categorical, the rest as numeric.
cols = df_train.columns.tolist()
CAT, NUM = cols[:10], cols[10:]

# Ordinal-encode the categorical columns and min-max scale the numeric ones.
# NOTE: the first step is labelled 'onehot' but it wraps an OrdinalEncoder.
ct = ColumnTransformer(
    [
        ('onehot', OrdinalEncoder(), CAT),
        # alternative kept for reference:
        # ('quantile', QuantileTransformer(random_state=SEED, n_quantiles=1500), NUM)
        ('minmax', MinMaxScaler(), NUM),
    ]
)

# Fit on the training frame only, then apply the fitted transform to the test frame.
# The transformer outputs the CAT block followed by the NUM block, i.e. the same
# order as `cols`, so the original column names can be re-attached directly.
train_data = pd.DataFrame(ct.fit_transform(df_train), columns=cols)
test_data = pd.DataFrame(ct.transform(df_test), columns=cols)

# Scale the ordinal category codes down by a factor of ten.
train_data[CAT] = train_data[CAT] / 10
test_data[CAT] = test_data[CAT] / 10

In [None]:
# Preview the first rows of the encoded and scaled training frame.
train_data.head()

In [None]:
def basic_fe(df):
    """Add pairwise-product features built from selected continuous columns.

    The columns 'cont001' .. 'cont004' are written into ``df`` itself (the
    frame is modified in place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cont0', 'cont5', 'cont8' and 'cont9'.

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying the four extra product columns.
    """
    # new feature name -> (left factor, right factor)
    products = {
        'cont001': ('cont8', 'cont0'),
        'cont002': ('cont9', 'cont0'),
        'cont003': ('cont9', 'cont5'),
        'cont004': ('cont8', 'cont5'),
    }

    for feature, (left, right) in products.items():
        df[feature] = df[left] * df[right]

    return df

In [None]:
# Add the engineered product features to both frames, then report the resulting shapes.
train_data, test_data = basic_fe(train_data), basic_fe(test_data)

print(train_data.shape, test_data.shape)

In [None]:
# Preview the training frame after the basic_fe product features were added.
train_data.head()

### Optuna

In [None]:
def objective(trial,data=train_data,target=target):
    """Optuna objective: hold-out RMSE of a LightGBM regressor for one trial.

    A fixed 80/20 split of ``data``/``target`` (seeded with ``SEED``) is made,
    a ``LGBMRegressor`` is trained on the 80% part with the hyper-parameters
    suggested by ``trial``, and the RMSE on the 20% part is returned. The same
    20% part also drives early stopping, so the score is somewhat optimistic.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix. The default is bound when this function is defined,
        so re-run this cell after changing ``train_data``.
    target : pandas.Series
        Regression target, row-aligned with ``data`` by position.

    Returns
    -------
    float
        Root mean squared error on the hold-out part (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=SEED)

    params = {
        'metric': 'rmse',
        'random_state': SEED,
        'n_estimators': 20000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.35,0.4,0.45,0.5,0.6,0.7,0.75,0.8,0.85]),
        # NOTE(review): LightGBM appears to apply row subsampling only when
        # `subsample_freq` > 0, which is not set here — confirm that this
        # parameter actually influences the model.
        'subsample': trial.suggest_categorical('subsample', [0.3,0.35,0.4,0.5,0.6,0.65,0.7,0.75,0.8,0.85]),
        'learning_rate': trial.suggest_categorical('learning_rate',
                                                   [0.001,0.002,0.003,0.004,0.005,0.006,0.008,0.01,0.015,0.02,0.03]),
        'max_depth': trial.suggest_categorical('max_depth', [-1,10,20]),
        # LightGBM requires num_leaves > 1; a sampled value of 1 makes the
        # trial fail, so the search range starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Recorded in the study under the name 'min_data_per_groups'; the cell
        # that assembles the final parameters renames it back to 'cat_smooth'.
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }

    model = lgb.LGBMRegressor(**params)

    # n_estimators is only an upper bound: training stops once the hold-out
    # RMSE has not improved for 300 rounds.
    model.fit(train_x, train_y, eval_set=[(test_x,test_y)], early_stopping_rounds=300, verbose=False)

    preds = model.predict(test_x)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and then remove.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (TPE is Optuna's default sampler; only the seed is new).
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=70)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Visualise Optuna's importance estimate for each hyper-parameter searched in the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Start from the best trial's hyper-parameters and add the settings that were
# held fixed during the search.
params = study.best_params
params.update({'random_state': SEED, 'n_estimators': 20000, 'metric': 'rmse'})

# The study stored this value under 'min_data_per_groups'; the model receives
# it under the key 'cat_smooth', as in the objective.
params['cat_smooth'] = params.pop('min_data_per_groups')

In [None]:
N_FOLDS = 7

# Unshuffled K-fold split: every training row is predicted exactly once by a
# model that did not see it (out-of-fold predictions).
kf = KFold(n_splits = N_FOLDS)
oof = np.zeros(len(target))

# Test-set prediction, accumulated as the mean over the fold models.
preds_folds_lgb = np.zeros(len(test_data))


# `kf.split` is a generator, so tqdm needs `total` to display a real progress bar.
for train_ind, test_ind in tqdm(kf.split(train_data), total=N_FOLDS):
    # Positional indexing keeps features and target aligned row by row.
    X_train = train_data.iloc[train_ind]
    X_val = train_data.iloc[test_ind]
    y_train = target.iloc[train_ind]
    y_val = target.iloc[test_ind]

    model = lgb.LGBMRegressor(**params)

    # The validation fold also drives early stopping, so the out-of-fold score
    # is slightly optimistic. eval_set is passed as a list of (X, y) pairs,
    # matching the call in the Optuna objective.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 300, verbose = 1000)
    p = model.predict(X_val)
    oof[test_ind] = p

    preds_folds_lgb += model.predict(test_data)/N_FOLDS

In [None]:
# RMSE of the out-of-fold predictions over all training rows. sqrt(MSE) replaces
# `squared=False`, which newer scikit-learn releases deprecate and then remove.
print(f'rmse on training data: {np.round(np.sqrt(mean_squared_error(target, oof)),5)}')

In [None]:
# Put the fold-averaged LightGBM test predictions into the submission template
# and write it to disk without the index column.
df_sub['target'] = preds_folds_lgb
df_sub.to_csv('submission_lgb.csv', index=False)

In [None]:
# Hand-written LightGBM parameter set; no cell visible above this point uses it.
# NOTE(review): 'min_data_in_leaf' (10) and 'min_child_samples' (285) look like
#   two names for the same LightGBM setting with conflicting values — confirm
#   which one is intended and drop the other.
# NOTE(review): 'max_depth' is -10, while the search above only tries -1, 10
#   and 20 — presumably a typo for one of those; confirm.
# NOTE(review): 'random_state' is commented out, so runs with this set are
#   presumably not seeded — confirm that is deliberate.
lgb_params_kfold = {
     'reg_alpha': 5.2,
     'reg_lambda': 1.2,
     'min_data_in_leaf': 10,
     'colsample_bytree': 0.35,
     'subsample': 0.75,
     'learning_rate': 0.001,
     'max_depth': -10,
     'num_leaves': 120,
     'min_child_samples': 285,
     #'random_state':SEED,
     'verbose':-1,
     'n_estimators': 50000,
     'metric': 'rmse',
     'cat_smooth': 23
}