# Generating various bimodal predictions

Sharing some of my predictions that display a bimodal distribution. Public scores are around 0.70. I tried some ensembles, but the public score isn't necessarily any better than that of "unimodal" predictions.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly_express as px
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Distribution of target in train set

In [1]:
# Load the competition training set; only its `target` column is used below.
train_csv_path = '../input/tabular-playground-series-jan-2021/train.csv'
train = pd.read_csv(train_csv_path)

In [1]:
# Histogram + KDE of the first 10,000 training targets.
train_target_sample = train.target.head(10000)
fig = ff.create_distplot([train_target_sample], group_labels=['kde'])

# Zoom the x-axis to the 5-10 window, the same window used by every plot in this notebook.
fig.update_xaxes(range=[5, 10])

fig

# Distribution of predictions for test set (ensemble, public score = 0.69685 )

In [1]:
# Ensemble submission (public score 0.69685); it is also the fallback
# prediction used by the mixes further down.
df_69685 = pd.read_csv('../input/dataset/0.69685-others-submission6.csv')

# Histogram + KDE of the first 10,000 predicted targets.
ensemble_target_sample = df_69685.target.head(10000)
fig = ff.create_distplot([ensemble_target_sample], group_labels=['kde'])

# Last expression of the cell: update_xaxes returns the figure, so it is displayed.
fig.update_xaxes(range=[5, 10])

# Simulating the shape of target in the train set

I trained 6 additional models: 

- XGBRegressor,  using train set where target is <9. 
- Same XGBRegressor model, except using train set where target >7
- LightGBM, using train set where target is <9. 
- Same LightGBM model, except using train set where target >7
- CatBoostRegressor, using train set where target is <9. 
- Same CatBoostRegressor model, except using train set where target >7

All 6 models were tuned using Optuna. Each model has an RMSE of between 0.3 and 0.5 for predictions within its specified region. Then I used each of the models to predict the **full** test set.

In [1]:
# Hyper-parameters found by Optuna for each model family.
# They are recorded here for reference; no cell visible in this notebook
# consumes them (the predictions are loaded from CSV files instead).

# LightGBM -- Optuna-tuned parameters.
# NOTE(review): LightGBM documents `min_child_samples` as an alias of
# `min_data_in_leaf`, yet the two are set to different values (61 vs 94).
# Confirm which value was actually intended/effective.
light_opt_best_random = {
    "n_estimators": 8540,
    "min_data_per_group": 45,
    "num_leaves": 126,
    "max_depth": 20,
    "learning_rate": 0.0032598944879946414,
    "subsample_for_bin": 32553,
    "lambda_l1": 0.11917413918151999,
    "lambda_l2": 6.857359561808505e-05,
    "bagging_fraction": 0.8910429482743759,
    "min_data_in_leaf": 94,
    "min_sum_hessian_in_leaf": 0.01,
    "bagging_freq": 2,
    "feature_fraction": 0.4699812049606955,
    "min_child_samples": 61,
}

# XGBoost -- Optuna-tuned parameters.
xgb_opt_best_random = {
    "learning_rate": 0.004138539806617361,
    "gamma": 0.020496820582462844,
    "max_depth": 19,
    "min_child_weight": 308,
    "max_delta_step": 9,
    "subsample": 0.6437442427644592,
    "colsample_bytree": 0.41845630929589844,
    "lambda": 0.0038484657676066394,
    "alpha": 0.09281553090596092,
    "n_estimators": 4767,
}

# CatBoost -- Optuna-tuned parameters.
catboost_opt_best_random = {
    "n_estimators": 9639,
    "learning_rate": 0.025621857270512527,
    "reg_lambda": 0.03261099593456338,
    "subsample": 0.6319711159148579,
    "depth": 7,
    "min_child_samples": 48,
    "colsample_bylevel": 0.14898612913306458,
    "langevin": False,
    "model_shrink_rate": 0.28621265987632455,
    "model_shrink_mode": "Decreasing",
    "model_size_reg": 2.373053070327802,
}

In [1]:
# Load the test-set predictions of the six region-specific models
# (LightGBM, XGBoost and CatBoost, each with an "-sm" and an "-lg" variant).
sub26a, sub26b, sub26c, sub26d, sub26e, sub26f = (
    pd.read_csv(submission_path)
    for submission_path in (
        '../input/dataset/submission26a-optuna-light-sm.csv',
        '../input/dataset/submission26b-optuna-light-lg.csv',
        '../input/dataset/submission26c-optuna-xgb-sm.csv',
        '../input/dataset/submission26d-optuna-xgb-lg.csv',
        '../input/dataset/submission26e-optuna-catboost-sm.csv',
        '../input/dataset/submission26f-optuna-catboost-lg.csv',
    )
)

# Generate some bi-modal predictions

## Mix 1 - public score = 0.7034

In [1]:
# Mix 1: blend the region-specific predictions into one bimodal prediction.
# Per row, the first rule that matches wins:
#   1. mean of the three "-lg" models (26b, 26d, 26f) if that mean is > threshold
#   2. mean of the three "-sm" models (26a, 26c, 26e) if that mean is < threshold
#   3. otherwise the 0.69685 ensemble prediction
# Change the threshold, or swap the order of rules 1 and 2, to get different shapes.
MIX1_THRESHOLD = 8.1

# Work on positional NumPy arrays instead of looping over rows in Python.
# This assumes every frame keeps the default RangeIndex produced by
# `pd.read_csv`, so positional access equals the label lookups it replaces.
n_rows = len(df_69685)
fallback = df_69685.target.to_numpy()

# Row-wise means of each model group, truncated to the ensemble's length.
# A submission shorter than the ensemble raises (shape mismatch) rather than
# being silently padded.
high_mean = np.mean(
    [sub.target.to_numpy()[:n_rows] for sub in (sub26b, sub26d, sub26f)], axis=0
)
low_mean = np.mean(
    [sub.target.to_numpy()[:n_rows] for sub in (sub26a, sub26c, sub26e)], axis=0
)

# Nested np.where mirrors the if / elif / else priority order described above.
# `ls` stays a plain list so later cells can keep slicing it.
ls = np.where(
    high_mean > MIX1_THRESHOLD,
    high_mean,
    np.where(low_mean < MIX1_THRESHOLD, low_mean, fallback),
).tolist()


In [1]:

# Histogram + KDE of the first 10,000 Mix 1 predictions.
# The sample size matches the other distribution plots in this notebook
# (it was 1,000 here), so the KDE shapes are directly comparable.
fig = ff.create_distplot(
    [ls[:10000]],
    group_labels=['kde']
)

# Last expression of the cell: update_xaxes returns the figure, so it is displayed.
fig.update_xaxes(range=[5, 10])

## Mix 2 - public score = 0.70714

In [1]:
# Mix 2: same idea as Mix 1, but the low-side rule is checked first.
# Per row, the first rule that matches wins:
#   1. mean of 26a, 26c, 26f if that mean is < threshold
#   2. mean of 26b, 26d, 26e if that mean is > threshold
#   3. otherwise the 0.69685 ensemble prediction
# Change the threshold, or swap the order of rules 1 and 2, to get different shapes.
# NOTE(review): the two CatBoost files sit in the opposite groups compared with
# Mix 1 (26f with the "-sm" files, 26e with the "-lg" files). This is kept
# as-is because the reported score was obtained this way -- confirm it is intended.
MIX2_THRESHOLD = 7.8

# Work on positional NumPy arrays instead of looping over rows in Python.
# This assumes every frame keeps the default RangeIndex produced by
# `pd.read_csv`, so positional access equals the label lookups it replaces.
n_rows = len(df_69685)
fallback = df_69685.target.to_numpy()

# Row-wise means of each model group, truncated to the ensemble's length.
# A submission shorter than the ensemble raises (shape mismatch) rather than
# being silently padded.
low_mean = np.mean(
    [sub.target.to_numpy()[:n_rows] for sub in (sub26a, sub26c, sub26f)], axis=0
)
high_mean = np.mean(
    [sub.target.to_numpy()[:n_rows] for sub in (sub26b, sub26d, sub26e)], axis=0
)

# Nested np.where mirrors the if / elif / else priority order described above.
# `ls` stays a plain list so later cells can keep slicing it.
ls = np.where(
    low_mean < MIX2_THRESHOLD,
    low_mean,
    np.where(high_mean > MIX2_THRESHOLD, high_mean, fallback),
).tolist()

In [1]:
# Histogram + KDE of the first 10,000 Mix 2 predictions.
mix_sample = ls[:10000]
fig = ff.create_distplot([mix_sample], group_labels=['kde'])

# Last expression of the cell: update_xaxes returns the figure, so it is displayed.
fig.update_xaxes(range=[5, 10])

So in theory you can generate predictions with bimodal distributions, but their scores do not seem to be as good as that of the best model I have, which only has 1 peak. Any thoughts?