In [None]:
%config Completer.use_jedi = False

In [None]:
import time
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow, imread

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.stats as stats

import lightgbm as lgb

In [None]:
# Seed for the stochastic steps of this notebook (e.g. the hold-out mask and the KFold shuffle).
R_SEED = 37

In [None]:
# True  -> train on every training row and write the submission CSV files.
# False -> hold out roughly 10% of the training rows and score the blends on them instead.
submit = True # for some testing

In [None]:
# Directory holding the competition files; change it here to run outside Kaggle.
INPUT_DIR = '/kaggle/input/tabular-playground-series-aug-2021'

# Sample submission (provides the `id` column for the output files), train and test sets.
submission_ex = pd.read_csv(INPUT_DIR + '/sample_submission.csv')
train_df = pd.read_csv(INPUT_DIR + '/train.csv')
test_df = pd.read_csv(INPUT_DIR + '/test.csv')

### Main idea
If we merge the train data with the test data and apply the same series of transformations to both, maybe we create an additional bond between them. Just an idea — let's see what happens.

In [None]:
# Keep the target aside, then strip the non-feature columns from both frames.
targets_df = train_df[['loss']].copy()
train_df = train_df.drop(columns=['id', 'loss'])
test_df = test_df.drop(columns=['id'])

In [None]:
# Stack the train rows on top of the test rows so every transformation below sees both.
# ignore_index=True gives the merged frame a unique 0..n-1 index; without it the
# train and test row labels overlap, which would misalign any label-based assignment.
all_df = pd.concat([train_df, test_df], ignore_index=True)
# 1-------------------vvv

In [None]:
# Globally silence Python warnings from here on (this stays in effect for later cells too).
warnings.filterwarnings("ignore")

# One 50-bin histogram per column of the merged frame, laid out on a 20 x 5 grid.
fig, ax = plt.subplots(figsize=(30, 60))
hist = all_df.hist(ax=ax, bins=50, layout=(20, 5), color='k', alpha=0.5)

#### Step by step
I know that KMeans is not very good for this, but it's fast enough for a first try.

In [None]:
# warnings.filterwarnings("ignore")
def plot_fea_hist(fea_name):
    """Draw a 150-bin histogram of column `fea_name` of `all_df` on a new 10 x 10 figure."""
    _, axis = plt.subplots(figsize=(10, 10))
    all_df[fea_name].hist(bins=150, ax=axis)

#### KMeans

In [None]:
def plot_kmeans(data, labels, no_of_cl, fea_name, ax):
    """Plot a feature's histogram with one single-bin overlay per cluster.

    Parameters
    ----------
    data : array indexable by a boolean mask; the feature values.
    labels : array of cluster ids, compared against 0..no_of_cl-1.
    no_of_cl : number of clusters to overlay.
    fea_name : title given to the axes.
    ax : axes-like object providing `hist` and `set_title`.
    """
    # Full distribution of the feature (100 bins, normalised).
    ax.hist(data, 100, density=True)

    # Each cluster is drawn as one semi-transparent bin covering its value range.
    member_masks = [labels == cluster_id for cluster_id in range(no_of_cl)]
    for mask in member_masks:
        ax.hist(data[mask], 1, density=True, alpha=0.5)

    ax.set_title(fea_name)

In [None]:
# guesswork: (feature name, number of clusters) pairs
for_kmeans = [('f2', 2), ('f3', 3), ('f7', 3), ('f11', 3), ('f12', 2), ('f14', 2), ('f18', 2), ('f19', 2), ('f20', 4), ('f24', 3), 
              ('f26', 2), ('f27', 2), ('f32', 3), ('f34', 2), ('f38', 2), ('f48', 2), ('f50', 3), ('f57', 3), ('f67', 3), ('f76', 3),
             ('f80', 2), ('f86', 4), ('f93', 2), ('f94', 2)]

# Grid size follows the list length (ceil division) so editing the list cannot
# run past the last subplot; squeeze=False keeps `axes` two-dimensional.
n_cols = 4
n_rows = -(-len(for_kmeans) // n_cols)
fig, axes = plt.subplots(nrows = n_rows, ncols = n_cols, figsize=(20, 30), squeeze = False)

for plot_idx, (f, n_clusters) in enumerate(for_kmeans):
    # Cluster the single feature column; the seed makes the centroids (and
    # therefore the order of the distance columns) reproducible between runs.
    data = all_df[[f]].values
    km = KMeans(n_clusters = n_clusters, n_init = 50, random_state = R_SEED)
    km.fit(data)
    k_clus = km.labels_

    ax = axes[plot_idx // n_cols, plot_idx % n_cols]
    plot_kmeans(data, k_clus, n_clusters, f, ax)

    # New features: distance of every row to each centroid of this feature.
    _dist = km.transform(data)
    for cl in range(n_clusters):
        all_df[f + '_dist_from_' + str(cl)] = _dist[:, cl]
plt.show()

I created new features for the specified original features: the distance of each value from every cluster centroid.

#### GaussianMixture

In [None]:
def plot_gmm(model, data, fea_name, ax):
    """Plot a feature's histogram with the weighted density of each mixture component.

    Parameters
    ----------
    model : fitted mixture exposing `weights_`, `means_` and `covariances_`.
        Only the first mean / variance entry of each component is used, i.e. the
        model is treated as fitted on a single feature.
    data : array of the feature values; sets the histogram and the x-range.
    fea_name : title given to the axes.
    ax : axes-like object providing `hist`, `plot` and `set_title`.
    """
    weights = model.weights_
    means = model.means_
    covars = model.covariances_

    ax.hist(data, 100, density = True, alpha = 0.2, color = 'k')

    # Exactly 100 evenly spaced points in [min, max). Unlike np.arange with a
    # float step, linspace never yields an extra point from rounding and does
    # not fail when the feature is constant (zero-width range).
    x = np.linspace(np.min(data), np.max(data), 100, endpoint = False)
    for i in range(len(weights)):
        # ravel()[0] extracts the scalar whatever the nesting of the stored arrays.
        mean = np.ravel(means[i])[0]
        std = np.sqrt(np.ravel(covars[i])[0])
        ax.plot(x, weights[i] * stats.norm.pdf(x, mean, std), alpha = 0.7, linewidth = 3)
    ax.set_title(fea_name)

In [None]:
# guesswork: (feature name, number of mixture components) pairs
for_gmm = [('f2', 2), ('f4', 3), ('f6', 2), ('f7', 4), ('f8', 2), ('f9', 2), ('f11', 3), ('f12', 2), ('f14', 3), ('f15', 3),
          ('f16', 5), ('f18', 3), ('f19', 3), ('f21', 2), ('f22', 3), ('f24', 3), ('f26', 2), ('f28', 3), ('f30', 3), ('f32', 3),
          ('f34', 2), ('f35', 3), ('f37', 3), ('f38', 3), ('f39', 3), ('f41', 3), ('f52', 3), ('f64', 3), ('f67', 3), ('f68', 2),
          ('f69', 2), ('f73', 3), ('f75', 3), ('f76', 3), ('f77', 2), ('f79', 2), ('f80', 2), ('f81', 3), ('f85', 3), ('f89', 3),
          ('f91', 3), ('f93', 3), ('f94', 3)]

# Grid size follows the list length (ceil division) so editing the list cannot
# run past the last subplot; squeeze=False keeps `axes` two-dimensional.
n_cols = 4
n_rows = -(-len(for_gmm) // n_cols)
fig, axes = plt.subplots(nrows = n_rows, ncols = n_cols, figsize=(20, 50), squeeze = False)

for plot_idx, (f, n_clusters) in enumerate(for_gmm):
    # Fit a 1-D Gaussian mixture on the feature; the seed makes the component
    # order (and therefore the meaning of each probability column) reproducible.
    data = all_df[[f]].values

    gm = GaussianMixture(n_components = n_clusters, n_init = 5, random_state = R_SEED)
    gm.fit(data)
    k_clus_1 = gm.predict(data)        # most likely component per row
    k_clus_2 = gm.predict_proba(data)  # per-component membership probability per row

    ax = axes[plot_idx // n_cols, plot_idx % n_cols]
    plot_gmm(gm, data, f + '_clus_gmm', ax)

    # New features: the hard component label plus one probability column per component.
    all_df[f + '_clus_gmm'] = k_clus_1
    for j in range(k_clus_2.shape[1]):
        all_df[f + '_clus_gmm_' + str(j)] = k_clus_2[:, j]

plt.show()

The GMM features represent the probability that a value of the original feature belongs to each fitted Gaussian component (plus the label of the most likely component).

In [None]:
# Preview the first rows of the merged frame after the feature-engineering cells above.
all_df.head()

In [None]:
# Standardising is not needed by LightGBM itself; it is done here so that as many
# transformations as possible happen before the frame is split back into train and test.
scaler = StandardScaler()
all_df_normalized = scaler.fit_transform(all_df)
all_df = pd.DataFrame(data=all_df_normalized, columns=all_df.columns)

In [None]:
# KMeans on whole dataset
n_clusters = 2 # ?

# Seeded so the two centroids (and the order of the distance columns) are reproducible.
data = all_df.values
km = KMeans(n_clusters = n_clusters, n_init = 50, random_state = R_SEED)
km.fit(data)
k_clus = km.labels_
# Cluster sizes; Series.value_counts replaces the deprecated top-level pd.value_counts.
print(pd.Series(k_clus).value_counts())

# New features: distance of every row to each of the global centroids.
_dist = km.transform(data)
for cl in range(n_clusters):
    all_df['all_dist_from_' + str(cl)] = _dist[:, cl]

In [None]:
# 1-------------------^^^
# Split the transformed frame back into its train part (first rows) and test part
# (remaining rows). Slicing both from n_train avoids the `iloc[-0:]` pitfall, which
# would return the whole frame if the test part were empty.
n_train, n_test = train_df.shape[0], test_df.shape[0]
assert len(all_df) == n_train + n_test, "all_df must contain exactly the train rows followed by the test rows"
train_df = all_df.iloc[:n_train, :].copy()
test_df = all_df.iloc[n_train:, :].copy()

In [None]:
if not submit:
    # Local evaluation: a seeded random mask keeps ~90% of the rows for training
    # (X, y) and sets the rest aside as a hold-out (my_X, my_y).
    np.random.seed(R_SEED)
    msk = np.random.rand(len(train_df)) < 0.9
    X, my_X = train_df[msk].copy(), train_df[~msk].copy()
    y, my_y = targets_df[msk].copy(), targets_df[~msk].copy()
else:
    # Submission run: train on every available row.
    X = train_df.copy()
    y = targets_df[['loss']].copy()

In [None]:
# qq_df = pd.concat([X, y], axis=1, join='inner')

# corr = qq_df.corr()

# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True

# fig = plt.figure(figsize = (30, 25))
# sns.heatmap(corr, cmap="flare", mask=mask)
# plt.show()

In [None]:
def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The global maximum of `x` is subtracted before exponentiating so that large
    inputs do not overflow; the result is unchanged by that shift.
    """
    shift = np.max(x)
    exponentials = np.exp(x - shift)
    normaliser = exponentials.sum(axis = 0)
    return exponentials / normaliser

In [None]:
def plot_fea_imp(model, model_name):
    """Draw a horizontal bar chart of `model.feature_importances_`.

    Bars are labelled with the columns of the global `train_df` and sorted by
    ascending importance (ties broken by descending column name). `model_name`
    is used in the plot title.
    """
    print('Plotting feature importances...')
    importances = pd.DataFrame({'imp': model.feature_importances_, 'col': train_df.columns})
    ordered = importances.sort_values(by=['imp', 'col'], ascending=[True, False])
    ordered.plot(kind='barh', x='col', y='imp', figsize=(20, 70), legend=None)
    plt.title('%s - Feature Importance' % (model_name))
    plt.ylabel('Features')
    plt.xlabel('Importance')

In [None]:
# Hyper-parameters of the LightGBM regressor that predicts `loss`.
params_loss = dict(
    n_estimators=150000,  # there is early_stopping
    learning_rate=0.002,
    min_child_samples=500,
    feature_fraction=0.35,
    bagging_fraction=0.8,
    bagging_freq=1,
)

# RMSE is both the training objective and the evaluation metric; use all cores.
lgbm_reg = lgb.LGBMRegressor(objective='rmse', metric='rmse', n_jobs=-1, **params_loss)

#### Interesting
prediction_2 is really weird (it weights each fold by the softmax of its validation RMSE, so worse folds get more weight), but in testing it gives the best blend score (next cell).

In [None]:
start_time = time.time()

_target = 'loss'


def _rmse(y_true, y_pred):
    """Root-mean-squared error.

    Computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`,
    because the `squared` argument is deprecated/removed in newer scikit-learn.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


pred = []      # one prediction vector (test set or hold-out) per fold
val_rmse = []  # validation RMSE of each fold

print(X.shape)

k = 5

kfolds = KFold(n_splits = k, shuffle = True, random_state = R_SEED)
for _k, (train_index, test_index) in enumerate(kfolds.split(X), start = 1):
    print('------------------------------------------------k: ', _k, 'of', k)
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were removed
    # in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.early_stopping(3000), lgb.log_evaluation(500)] instead.
    lgbm_reg.fit(
        X_train, 
        y_train,
        eval_set = [(X_val, y_val)],
        verbose = 500,
        early_stopping_rounds = 3000,
    )

    # Score the fold on its own validation part.
    f_pred = lgbm_reg.predict(X_val)
    curr_rmse = _rmse(y_val[_target].values, f_pred)
    print('curr_rmse: ', curr_rmse, 'fold: ', _k, 'of', k)
    val_rmse.append(curr_rmse)

    # Predict the rows the blends are built on: test set when submitting, hold-out otherwise.
    _p = lgbm_reg.predict(my_X if not submit else test_df)
    pred.append(_p)

    if not submit:
        test_rmse = _rmse(my_y, _p)
        print('test_rmse: ', test_rmse)

    print("Execution time: ", time.time() - start_time, "secs")

# Mean validation RMSE over the folds.
print('all_rmse: ', sum(val_rmse) / len(val_rmse))

# Three ways of blending the per-fold predictions:
#   pred1 - plain average of the folds
#   pred2 - weights = softmax(val_rmse): folds with HIGHER validation RMSE weigh more
#   pred3 - weights = softmax(100 - val_rmse): folds with LOWER validation RMSE weigh more
fold_preds = np.asarray(pred)        # shape (n_folds, n_rows)
fold_rmse = np.asarray(val_rmse)     # shape (n_folds,)

pred1 = np.sum(pred, axis = 0) / len(pred)
pred2 = np.matmul(fold_preds.T, softmax(fold_rmse))
pred3 = np.matmul(fold_preds.T, softmax(100 - fold_rmse))

if not submit:
    # Score every blend on the hold-out rows.
    for tag, blended in (('1', pred1), ('2', pred2), ('3', pred3)):
        print('end_rmse' + tag + ': ', _rmse(my_y, blended))
else:
    # Write one submission file per blend.
    for file_name, blended in (('submission_1.csv', pred1),
                               ('submission_2_p.csv', pred2),
                               ('submission_3.csv', pred3)):
        submission = submission_ex[['id']].copy()
        submission[_target] = blended
        submission.to_csv(file_name, index=False)