In [None]:
import pandas  as pd
import numpy as np
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn import datasets
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import warnings
warnings.simplefilter("ignore")

In [None]:
# Read the train and test CSV files of the tabular-playground-series-feb-2021 input folder.
train_df, test_df = (
    pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv'),
)

In [None]:
# Keep a reference to the training frame's column index and print it to inspect the raw schema.
columns = train_df.columns
print(columns)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Names of the categorical columns: every training column whose name starts with 'cat'.
cat_features = [name for name in train_df.columns if name.startswith('cat')]

# Stack the categorical columns of train (first) and test (second) row-wise into one frame.
all_data = pd.concat((train_df[cat_features], test_df[cat_features]))
print(all_data.shape)

In [None]:
# One-hot encode the categorical columns of train and test.
#
# The dummies are computed on the stacked train+test frame (all_data) so both splits end up
# with the same dummy columns, then split back by position: the first len(train_df) rows
# belong to train, the remaining rows to test.
#
# All columns are encoded with a single get_dummies call and each frame is concatenated once,
# instead of re-copying both full frames for every categorical column. The resulting columns
# and their order are unchanged: the original non-categorical columns first, followed by the
# '<column>_<level>' dummies in cat_features order.
#
# Only columns still present in train_df are encoded, so re-running this cell is a no-op
# rather than a KeyError.
cols_to_encode = [column for column in cat_features if column in train_df.columns]
if cols_to_encode:
    n_train = train_df.shape[0]
    dummies = pd.get_dummies(all_data[cols_to_encode], columns=cols_to_encode, prefix_sep="_")

    train_df = pd.concat(
        [train_df.drop(columns=cols_to_encode), dummies.iloc[:n_train]], axis=1
    )
    test_df = pd.concat(
        [test_df.drop(columns=cols_to_encode), dummies.iloc[n_train:]], axis=1
    )

In [None]:
# Show the (rows, columns) shape of both frames after the categorical columns were replaced by dummy columns.
train_df.shape, test_df.shape

In [None]:
# Model input columns: every column whose name starts with 'c'. At this point that includes
# the one-hot 'cat*' dummy columns; presumably also the continuous 'cont*' columns - verify
# against the data. Columns such as 'id' and 'target' are excluded by the prefix test.
features = [name for name in train_df.columns if name.startswith('c')]

In [None]:
# Print the selected model input columns.
print(features)

In [None]:
def create_stratified_folds_for_regression(data_df, n_splits=5, random_state=None):
    """
    Assign every row a stratified fold id for a continuous target.

    The rows are shuffled, the 'target' column is cut into equal-width bins, and the bin ids
    are used as class labels for StratifiedKFold so the bins are distributed evenly over the
    folds.

    @param data_df: training data containing a 'target' column; the frame itself is not modified
    @param n_splits: number of splits
    @param random_state: optional seed for the row shuffle; None (the default) keeps the
        shuffle non-deterministic
    @return: a shuffled copy of the data with a fresh 0..n-1 index and a 'kfold' column holding,
        for each row, the id of the fold in which the row is used for validation
    @raise ValueError: if data_df has no rows
    """
    if len(data_df) == 0:
        raise ValueError("data_df must contain at least one row")

    # randomize the data; sample() returns a new frame, so the caller's frame stays untouched
    shuffled = data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled['kfold'] = -1

    # number of bins: floor(1 + log2(n)) (Sturges' formula).
    # The builtin int replaces np.int, an alias that has been removed from NumPy.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    print(f"Num bins: {num_bins}")

    # the bin id plays the role of the class label that StratifiedKFold balances across folds
    bins = pd.cut(pd.to_numeric(shuffled['target'], downcast="signed"), bins=num_bins, labels=False)
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # the split yields row positions, which equal the index labels because the index was reset
    for fold, (_, val_idx) in enumerate(kf.split(X=shuffled, y=bins.values)):
        shuffled.loc[val_idx, 'kfold'] = fold

    return shuffled

In [None]:
# Number of cross-validation folds.
n_splits = 5
# Replace train_df with the shuffled frame that carries the 'kfold' fold-id column.
train_df = create_stratified_folds_for_regression(data_df=train_df, n_splits=n_splits)

In [None]:
# Count how many rows were assigned to each fold id.
train_df.kfold.value_counts()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Density histogram with a KDE overlay of the target over the whole training frame.
# sns.distplot is deprecated (since seaborn 0.11) and slated for removal; histplot with
# kde=True and stat="density" is its replacement (requires seaborn >= 0.11). The
# kde_kws/alpha/edgecolor settings follow seaborn's migration guide to keep the old look.
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Distribution of target values")
sns.histplot(
    train_df['target'],
    color="darkblue",
    kde=True,
    stat="density",
    bins=120,
    kde_kws=dict(cut=3),
    alpha=.4,
    edgecolor=(1, 1, 1, .4),
    label='target',
    ax=ax,
)
ax.legend(); plt.show()

In [None]:
# Overlay one KDE curve of the target per fold id stored in the 'kfold' column.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False); the old `bins`
# argument only affected the (disabled) histogram, so it is dropped.
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Distribution of target values (StratifiedKFolds with bins)")
for k in range(n_splits):
    fold_target = train_df.loc[train_df.kfold == k, 'target']
    sns.kdeplot(fold_target, label=k, ax=ax)
ax.legend(); plt.show()

In [None]:
# For comparison: split the same frame with a plain KFold (no stratification requested)
# and overlay one KDE curve of the target per validation fold.
kf = model_selection.KFold(n_splits=n_splits)

fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Distribution of target values (KFold)")

# KFold.split yields row positions, hence the positional .iloc lookup.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
for fold, (_, val_pos) in enumerate(kf.split(X=train_df)):
    sns.kdeplot(train_df['target'].iloc[val_pos], label=fold, ax=ax)

ax.legend(); plt.show()

In [None]:
def kfold_splits(n_splits, train_df):
    """
    Returns a collection of (fold, train indexes, validation indexes)

    For every fold id in 0..n_splits-1, the validation indexes are the rows whose 'kfold'
    value equals the fold id and the train indexes are all remaining rows.

    @param n_splits: number of splits
    @param train_df: training data with a 'kfold' column holding each row's fold id
    @return: a list of (fold, train indexes, validation indexes) tuples. The indexes are
        index labels of train_df (pandas Index objects); they coincide with row positions
        only when train_df has a default 0..n-1 index.
    """
    kf_splits = []
    for fold in range(n_splits):
        # Mask the index directly instead of materialising two filtered copies of the frame.
        in_fold = (train_df.kfold == fold).values
        kf_splits.append((fold, train_df.index[~in_fold], train_df.index[in_fold]))
    return kf_splits

In [None]:
# Parameters passed to lgb.train for every fold.
params = {
    # task and reporting
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    # step size and regularisation
    'learning_rate': 0.0035,
    'lambda_l1': 18.42,
    'lambda_l2': 4.02,
    # tree shape
    'num_leaves': 128,
    'min_data_in_leaf': 81,
    # column / row subsampling (LightGBM aliases of feature_fraction, bagging_fraction, bagging_freq)
    'sub_feature': 0.5,
    'sub_row': 0.8,
    'subsample_freq': 10,
}

In [None]:
# Cross-validated LightGBM training.
# For every fold a model is trained on the other folds, its predictions for the held-out
# fold are stored in `oof`, and its test predictions are averaged into `predictions`.
y = train_df['target']
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
random_state = 42  # NOTE(review): defined here but not passed to LightGBM in this cell
num_round = 15000
X_test = test_df[features]  # loop-invariant, so selected once

# kfold_splits returns index labels; they are used positionally below (.iloc, oof[...]),
# which is valid because create_stratified_folds_for_regression resets the index to 0..n-1.
for fold, trn_idx, val_idx in kfold_splits(n_splits, train_df):
    print(f"fold: {fold}, train len: {len(trn_idx)}, val len: {len(val_idx)}")
    X_val = train_df.iloc[val_idx][features]
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X_val, label=y.iloc[val_idx])
    # Early stopping and periodic logging are passed as callbacks: the former
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were removed
    # in LightGBM 4 (lgb.log_evaluation requires LightGBM >= 3.3).
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=500),
            lgb.log_evaluation(period=1000),
        ],
    )
    oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / n_splits

# RMSE over all out-of-fold predictions; np.sqrt(MSE) avoids the `squared=False` argument,
# which is deprecated/removed in recent scikit-learn releases.
print(f'CV score: {np.round(np.sqrt(mean_squared_error(y, oof)), 5)}')

In [None]:
# Pair each test id with its fold-averaged prediction and write the result to submission.csv
# without the frame's index column.
submission = pd.DataFrame({"id": test_df["id"], "target": predictions})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()