In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgbm
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split,KFold
from sklearn.ensemble import StackingRegressor,GradientBoostingRegressor,RandomForestRegressor
import optuna

In [None]:
# Read the training set and discard its `id` column in one chained expression,
# so re-running this cell always rebuilds `data` from the file.
train_path = '../input/tabular-playground-series-jan-2021/train.csv'
data = pd.read_csv(train_path).drop(columns='id')
data.describe()

# Data Visualization and Cleaning

Let's take a look at the boxplot of each feature and target.

In [None]:
# Box-plot every column except the last one (presumably `target`, which is
# plotted on its own below) on a shared axis.
feature_names = data.columns[:-1]
fig, ax = plt.subplots(figsize=(14, 10))
ax.boxplot([data[name] for name in feature_names])
ax.set_xticklabels(feature_names)
ax.set_title('boxplot before cleaning')
# Row count before any filtering; compared with the post-cleaning count later on.
original_len = len(data)

There are some outliers in features 'cont7', 'cont9', 'cont10'

In [None]:
# The target gets its own, smaller figure.
target_values = data['target']
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(target_values)
ax.set_title('boxplot of target')

target=0 seems to be an extreme outlier.

In [None]:
# Keep only the rows whose target is not exactly zero.
data = data.loc[data['target'] != 0]

**Attention: this cleaning method leads to worse results.**

In [None]:
# Lower/upper fences at 1.5 * IQR for the three features flagged above.
# Every fence is computed before any row is removed, so filtering on one
# feature does not shift the fences of the next.
iqr_fences = {}
for cont in ['cont7', 'cont9', 'cont10']:
    q1 = data[cont].quantile(0.25)
    q3 = data[cont].quantile(0.75)
    spread = q3 - q1
    iqr_fences[cont] = (q1 - 1.5 * spread, q3 + 1.5 * spread)

# Keep only the rows that lie inside the fences (bounds included) of every feature.
for cont, (lower, upper) in iqr_fences.items():
    data = data.loc[data[cont].between(lower, upper)]

In [None]:
# Same box plot as before the cleaning step: every column except the last one
# (presumably `target`).
feature_names = data.columns[:-1]
fig, ax = plt.subplots(figsize=(14, 10))
ax.boxplot([data[name] for name in feature_names])
ax.set_xticklabels(feature_names)
ax.set_title('boxplot after cleaning')
# Row count after filtering; used to report how many rows were removed.
cleaned_len = len(data)

The data were cleaned with the 1.5 × interquartile range (IQR) rule.

In [None]:
# Difference between the row counts recorded by the two box-plot cells.
rows_dropped = original_len - cleaned_len
print('%s rows of data is dropped' % rows_dropped)

Let's also take a look at the box plot of target after cleaning.

In [None]:
# Target distribution once the rows filtered out above are gone.
target_values = data['target']
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(target_values)
ax.set_title('boxplot of target')

Check the correlation matrix.

In [None]:
# Pairwise correlations (pandas' default method) between all columns still in
# `data`, target included. `corr` is reused by the pruning cell below.
corr = data.corr()

# Fixed colour scale so that -1 and +1 always map to the ends of the palette.
heatmap_options = dict(vmin=-1, vmax=1, cmap='coolwarm', annot=True)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, ax=ax, **heatmap_options)
plt.xticks(rotation=90, fontsize=13)
plt.yticks(rotation=0, fontsize=13)
ax.set_title('correlation heatmap', fontsize=14)

Features 'cont1', 'cont6'~'cont13' have relatively high correlation with each other. 

In [None]:
# For every pair of columns whose absolute correlation reaches the threshold,
# drop the later column of the pair.
CORR_THRESHOLD = 0.9
n_cols = corr.shape[0]
keep_mask = np.ones(n_cols, dtype=bool)
for i in range(n_cols):
    for j in range(i + 1, n_cols):
        # `target` is the label, not a feature: it must never be pruned itself,
        # and a feature must never be pruned for being correlated with it.
        # Otherwise later `data['target']` lookups fail or a predictive feature is lost.
        if 'target' in (corr.columns[i], corr.columns[j]):
            continue
        if abs(corr.iloc[i, j]) >= CORR_THRESHOLD:
            keep_mask[j] = False
# Assumes `corr` has one row per column of `data` (i.e. every column is numeric).
sel_columns = data.columns[keep_mask]
data = data[sel_columns]
data.columns

No columns are dropped: no pair of columns has an absolute correlation of 0.9 or higher.

# Baseline Model

## XGBoost

In [None]:
def xgb_model_pipeline(data):
    """Cross-validate a baseline XGBoost regressor, then refit it on all rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``target`` column; every other column is used
        as a feature.

    Returns
    -------
    XGBRegressor
        Model fitted on the whole of ``data``. Its tree count is the average
        early-stopped tree count seen across the CV folds, so the returned
        model corresponds to what ``E_cv`` was measured on instead of always
        growing the full 1000-tree budget without any stopping criterion.

    Side effects: prints each fold's RMSE and their mean (``E_cv``) and draws
    a gain-based feature-importance plot.
    """
    X = data.drop(columns='target')
    y = data['target']
    base_params = dict(n_estimators=1000, learning_rate=0.1, max_depth=8,
                       random_state=42, verbosity=1)

    ## Calculate E_cv (cross-validation error)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_error = []
    fold_rounds = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        train_X, val_X = X.iloc[train_idx, :], X.iloc[val_idx, :]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        print('Fold %s:' % (fold))
        xgb_model = XGBRegressor(**base_params)
        # Only the validation fold is monitored: early stopping looks at the last
        # eval set anyway, so also scoring the training fold each round was wasted work.
        # NOTE(review): recent xgboost releases take `eval_metric` and
        # `early_stopping_rounds` in the constructor instead of fit() -- confirm
        # against the installed version.
        # NOTE(review): the same fold drives early stopping and scoring, so E_cv
        # is slightly optimistic.
        xgb_model.fit(train_X, train_y, eval_metric='rmse',
                      eval_set=[(val_X, val_y)], early_stopping_rounds=5, verbose=False)
        pred_y = xgb_model.predict(val_X)
        in_fold_rmse = np.sqrt(np.mean((val_y - pred_y) ** 2))
        print('Fold %s rmse: %s\n' % (fold, in_fold_rmse))
        fold_error.append(in_fold_rmse)

        # `best_iteration` is zero-based; fall back to the full budget if it is absent.
        best_iteration = getattr(xgb_model, 'best_iteration', None)
        if best_iteration is None:
            fold_rounds.append(base_params['n_estimators'])
        else:
            fold_rounds.append(best_iteration + 1)

    oof_rmse = np.mean(fold_error)
    print('E_cv: %s' % (oof_rmse))

    ## Train on the whole training data with the tree count found by early stopping
    final_rounds = max(1, int(round(np.mean(fold_rounds))))
    xgb_model = XGBRegressor(**dict(base_params, n_estimators=final_rounds))
    xgb_model.fit(X, y, verbose=False)

    ## Plot feature importance
    fig, ax = plt.subplots(figsize=(10, 8))
    xgb.plot_importance(xgb_model, ax=ax, importance_type='gain')
    plt.yticks(fontsize=14)

    return xgb_model

In [None]:
# Run the XGBoost baseline on the filtered training frame and keep the model
# that was refitted on all of its rows.
xgb_model = xgb_model_pipeline(data)

## LightGBM

In [None]:
def lightgbm_pipeline(data):
    """Cross-validate a baseline LightGBM regressor, then refit it on all rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``target`` column; every other column is used
        as a feature.

    Returns
    -------
    LGBMRegressor
        Model fitted on the whole of ``data``. Its tree count is the average
        early-stopped tree count seen across the CV folds, so the returned
        model corresponds to what ``E_cv`` was measured on instead of always
        growing the full 1000-tree budget without any stopping criterion.

    Side effects: prints each fold's RMSE and their mean (``E_cv``) and draws
    a gain-based feature-importance plot.
    """
    X = data.drop(columns='target')
    y = data['target']
    base_params = dict(n_estimators=1000, learning_rate=0.1, max_depth=8,
                       random_state=42, verbosity=-1)

    ## Calculate E_cv (cross-validation error)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_error = []
    fold_rounds = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        train_X, val_X = X.iloc[train_idx, :], X.iloc[val_idx, :]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        print('Fold %s:' % (fold))
        lgbm_model = LGBMRegressor(**base_params)
        # NOTE(review): newer LightGBM releases configure early stopping and
        # logging through callbacks instead of fit() keywords -- confirm against
        # the installed version.
        # NOTE(review): the same fold drives early stopping and scoring, so E_cv
        # is slightly optimistic.
        lgbm_model.fit(train_X, train_y, eval_metric='rmse',
                       eval_set=[(val_X, val_y)], early_stopping_rounds=5, verbose=0)
        pred_y = lgbm_model.predict(val_X)
        in_fold_rmse = np.sqrt(np.mean((val_y - pred_y) ** 2))
        print('Fold %s rmse: %s\n' % (fold, in_fold_rmse))
        fold_error.append(in_fold_rmse)

        # Fall back to the full budget when no usable best iteration was recorded.
        best_rounds = lgbm_model.best_iteration_
        if best_rounds is None or best_rounds < 1:
            best_rounds = base_params['n_estimators']
        fold_rounds.append(best_rounds)

    oof_rmse = np.mean(fold_error)
    print('E_cv: %s' % (oof_rmse))

    ## Train on the whole training data with the tree count found by early stopping
    final_rounds = max(1, int(round(np.mean(fold_rounds))))
    lgbm_model = LGBMRegressor(**dict(base_params, n_estimators=final_rounds))
    lgbm_model.fit(X, y, verbose=0)

    ## Plot feature importance
    fig, ax = plt.subplots(figsize=(10, 8))
    lgbm.plot_importance(lgbm_model, ax=ax, importance_type='gain')
    plt.yticks(fontsize=14)

    return lgbm_model

In [None]:
# Run the LightGBM baseline on the filtered training frame and keep the model
# that was refitted on all of its rows.
lgbm_model = lightgbm_pipeline(data)

The LightGBM baseline performs better than the XGBoost baseline on the public leaderboard (0.70140 vs 0.70587).

# Optuna

## Tune XGBoost

In [None]:
def xgb_oof(trial,data):
    """Score one Optuna trial of XGBoost hyper-parameters with 5-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Training frame with a ``target`` column; every other column is used
        as a feature.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values (lower is better).
    """
    # `suggest_float` replaces the deprecated `suggest_loguniform` /
    # `suggest_discrete_uniform`; the parameter names, ranges and step are
    # unchanged, so `study.best_params` keeps the same keys.
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [250, 300, 350, 400, 450]),
        "eta": trial.suggest_float("eta", 1e-2, 1e-1, log=True),
        "max_depth": trial.suggest_categorical("max_depth", [6, 8, 10, 12]),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1, step=0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 11),
        # Fixed, not sampled: it therefore does not appear in `study.best_params`.
        "random_state": 42
    }

    X = data.drop(columns='target')
    y = data['target']

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_error = []
    for train_idx, val_idx in kf.split(X):
        train_X, val_X = X.iloc[train_idx, :], X.iloc[val_idx, :]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        xgb_model = XGBRegressor(**params)
        xgb_model.fit(train_X, train_y)
        pred_y = xgb_model.predict(val_X)
        fold_error.append(np.sqrt(np.mean((val_y - pred_y) ** 2)))

    return np.mean(fold_error)

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: CV score of `trial` on the notebook-level `data`."""
    cv_rmse = xgb_oof(trial, data)
    return cv_rmse

In [None]:
# Seed the sampler so the sequence of sampled hyper-parameters is reproducible.
# TPE is Optuna's default sampler, so only the seed changes here.
# NOTE: `objective` is defined twice in this notebook; this cell must run
# right after the XGBoost definition.
study = optuna.create_study(direction='minimize', study_name='XGBoost optimization',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

In [None]:
# Sampled hyper-parameters of the best trial of the most recently created `study`.
study.best_params

In [None]:
# `study.best_params` only holds the sampled values; the fixed `random_state`
# used while tuning has to be passed again so the refit matches the
# configuration that was actually scored.
best_xgb = XGBRegressor(random_state=42, **study.best_params)
best_xgb.fit(data.drop(columns='target'), data['target'])

## Tune LightGBM

In [None]:
def lgbm_oof(trial,data):
    """Score one Optuna trial of LightGBM hyper-parameters with 5-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Training frame with a ``target`` column; every other column is used
        as a feature.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values (lower is better).
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),
        "n_estimators": trial.suggest_categorical("n_estimators", [250, 300, 350, 400, 450]),
        # Sampled under the estimator's own name: an `eta` alias loses to the
        # wrapper's explicit `learning_rate=0.1`, so tuning `eta` had no effect.
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True),
        "max_depth": trial.suggest_categorical("max_depth", [6, 8, 10, 12]),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1),
        # Row subsampling is only active when the bagging frequency is non-zero.
        # Registered through the trial (single choice) so that it is carried
        # along in `study.best_params` when the best model is refitted.
        "subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1, step=0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 11),
        # Was misspelled `min_child_sample`, which LightGBM does not recognise.
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 50),
        # Fixed, not sampled: it therefore does not appear in `study.best_params`.
        "random_state": 42
    }

    X = data.drop(columns='target')
    y = data['target']

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_error = []
    for train_idx, val_idx in kf.split(X):
        train_X, val_X = X.iloc[train_idx, :], X.iloc[val_idx, :]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        lgbm_model = LGBMRegressor(**params)
        lgbm_model.fit(train_X, train_y)
        pred_y = lgbm_model.predict(val_X)
        fold_error.append(np.sqrt(np.mean((val_y - pred_y) ** 2)))

    return np.mean(fold_error)

In [None]:
def objective(trail):
    """Optuna objective for LightGBM: CV score of the trial on the notebook-level `data`.

    Redefines the XGBoost `objective` from earlier in the notebook.
    """
    cv_rmse = lgbm_oof(trail, data)
    return cv_rmse

In [None]:
# Seed the sampler so the sequence of sampled hyper-parameters is reproducible.
# TPE is Optuna's default sampler, so only the seed changes here.
# NOTE: this rebinds `study`; the XGBoost study is no longer reachable afterwards.
study = optuna.create_study(direction='minimize', study_name='LGBM optimization',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)

In [None]:
# Sampled hyper-parameters of the best trial of the most recently created `study`.
study.best_params

In [None]:
# `study.best_params` only holds the sampled values; the fixed `random_state`
# used while tuning has to be passed again so the refit matches the
# configuration that was actually scored.
best_lgbm = LGBMRegressor(random_state=42, **study.best_params)
best_lgbm.fit(data.drop(columns='target'), data['target'])

# Ensemble two best models

In [None]:
# Hard-coded hyper-parameters for the two ensemble members (presumably pasted
# from an earlier `study.best_params` output -- the keys match the search spaces).
best_xgb_param = dict(
    n_estimators=450,
    eta=0.025241948026570656,
    max_depth=10,
    subsample=0.6,
    colsample_bytree=0.6,
    min_child_weight=7,
)

# NOTE(review): LightGBM appears to ignore two of these keys -- `eta` (the
# wrapper's own `learning_rate` default takes precedence over the alias) and
# `min_child_sample` (the estimator argument is spelled `min_child_samples`).
# Values are left untouched because they were selected with those keys inactive;
# confirm before renaming them.
best_lgbm_param = dict(
    num_leaves=42,
    n_estimators=400,
    eta=0.07349402647118564,
    max_depth=6,
    subsample=1.0,
    colsample_bytree=0.6,
    min_child_weight=9,
    min_child_sample=32,
)

In [None]:
def ensemble_pipeline(data,best_xgb,best_lgbm):
    """Cross-validate a stacked XGBoost + LightGBM ensemble, then refit it on all rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``target`` column; every other column is used
        as a feature.
    best_xgb, best_lgbm : estimator
        Unfitted base regressors stacked under a gradient-boosting final
        estimator. The same two estimators are used for the cross-validation
        folds and for the final fit, so ``E_cv`` describes the returned model.

    Returns
    -------
    StackingRegressor
        Ensemble fitted on the whole of ``data``.

    Side effects: prints each fold's RMSE and their mean (``E_cv``).
    """
    X = data.drop(columns='target')
    y = data['target']

    def build_stack():
        # StackingRegressor fits clones of the estimators it receives, so the
        # caller's `best_xgb` / `best_lgbm` can be shared by every stack built here.
        return StackingRegressor(
            estimators=[('best_xgb', best_xgb),
                        ('best_lgbm', best_lgbm)],
            final_estimator=GradientBoostingRegressor(n_estimators=200,
                                                      random_state=42))

    ## Calculate E_cv (cross-validation error)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_error = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        train_X, val_X = X.iloc[train_idx, :], X.iloc[val_idx, :]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        print('Fold %s:' % (fold))
        # Previously the folds were built from the globals `best_xgb_param` /
        # `best_lgbm_param`, silently ignoring the estimators passed in.
        ensemble_model = build_stack()
        ensemble_model.fit(train_X, train_y)
        pred_y = ensemble_model.predict(val_X)
        in_fold_rmse = np.sqrt(np.mean((val_y - pred_y) ** 2))
        print('Fold %s rmse: %s\n' % (fold, in_fold_rmse))
        fold_error.append(in_fold_rmse)

    oof_rmse = np.mean(fold_error)
    print('E_cv: %s' % (oof_rmse))

    ## Train on the whole training data
    ensemble_model = build_stack()
    ensemble_model.fit(X, y)

    return ensemble_model

In [None]:
# Build the two unfitted base models from the hard-coded parameter dicts
# (this rebinds `best_xgb` / `best_lgbm` from the tuning section) and stack them.
best_xgb, best_lgbm = XGBRegressor(**best_xgb_param), LGBMRegressor(**best_lgbm_param)
ensemble_model = ensemble_pipeline(data, best_xgb=best_xgb, best_lgbm=best_lgbm)

# Submit Result

In [None]:
test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')
# Give the model exactly the feature columns it was trained on, in the same
# order. A positional slice from 'cont1' onwards would stop matching as soon as
# the correlation-pruning cell removed a column from `data`.
feature_cols = data.columns.drop('target')
pred_res = ensemble_model.predict(test[feature_cols])
submission = pd.DataFrame({'id': test.id, 'target': pred_res})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to submission.csv.
submission.head()