Mimicking the [work](https://www.kaggle.com/ivankontic/003-2-kmeans-and-gmm-feature-extraction-tps-aug) of last month's winner, @ivankontic, on the September dataset.

In [None]:
# IPython magic: switch off Jedi-based completion in the interactive completer.
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow, imread

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.stats as stats

import lightgbm as lgb
import warnings

In [None]:
# Seed constant (going by its name, meant for seeding the stochastic steps so
# that runs are repeatable — TODO confirm against the cells that consume it).
R_SEED = 2017

In [None]:
# Author's switch for test runs; none of the cells shown here reads it.
submit = False # for some testing

In [None]:
# Load the three competition CSV files from the Kaggle input directory:
# the sample submission, the training table and the test table.
submission_ex = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv')
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/train.csv')
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/test.csv')

### Main idea
If we merge the train data with the test data and apply the same series of transformations to both, maybe we create an additional bond between them. Just an idea; let's see what happens.

In [None]:
# Keep the 'claim' target in its own one-column frame, then remove the
# identifier (and, for train, the target) so only feature columns remain.
targets_df = train_df.loc[:, ['claim']].copy()
train_df = train_df.drop(columns=['id', 'claim'])
test_df = test_df.drop(columns=['id'])

In [None]:
# Replace missing values in every column with that column's median.
# NOTE(review): the test frame is filled with its own medians rather than the
# training medians — confirm this is intended.
train_df = train_df.fillna(train_df.median())
test_df = test_df.fillna(test_df.median())

In [None]:
# Stack the train rows on top of the test rows so the transformations below are
# fitted on both at once. The train-first row order matters: the frame is cut
# back into train/test by position at the matching "^^^" marker.
all_df = pd.concat([train_df, test_df])
# 1-------------------vvv

In [None]:
# Suppress every warning from this point on (the filter is global to the
# interpreter session, not limited to this cell).
warnings.filterwarnings("ignore")

# Overview plot: a 50-bin histogram for each column of the merged frame,
# arranged on a 24 x 5 grid inside one large figure.
fig = plt.figure(figsize = (30,60))
ax = fig.gca()
hist = all_df.hist(bins = 50, layout = (24,5), color='k', alpha=0.5,  ax = ax)

#### step by step
I know that KMeans is not very good for this, but it's fast enough to give it a try.

In [None]:
# warnings.filterwarnings("ignore")
def plot_fea_hist(fea_name):
    """Draw a 150-bin histogram of one column of the merged ``all_df`` frame.

    A new 10 x 10 inch figure is opened on every call and the histogram is
    drawn on that figure's current axes.

    Args:
        fea_name: Label of the ``all_df`` column to plot.
    """
    canvas = plt.figure(figsize=(10, 10))
    all_df[fea_name].hist(bins=150, ax=canvas.gca())

#### KMeans

In [None]:
def plot_kmeans(data, labels, no_of_cl, fea_name, ax):
    """Show the value range of each cluster on top of the feature histogram.

    The whole sample is drawn first as a 100-bin density histogram. Every
    cluster is then drawn over it as a single half-transparent bin, so the bar
    width marks the span of values assigned to that cluster.

    Args:
        data: Clustered values; must support boolean-mask indexing.
        labels: Cluster index for each element of ``data``.
        no_of_cl: Number of clusters; labels ``0 .. no_of_cl - 1`` are drawn.
        fea_name: Text used as the axes title.
        ax: Axes-like object providing ``hist`` and ``set_title``.
    """
    ax.hist(data, 100, density=True)
    # Lazily yield the members of each cluster in label order.
    per_cluster = (data[labels == cluster_id] for cluster_id in range(no_of_cl))
    for cluster_values in per_cluster:
        ax.hist(cluster_values, 1, density=True, alpha=0.5)
    ax.set_title(fea_name)

In [None]:
# (feature, number of clusters) pairs; the cluster counts are guesswork.
for_kmeans = [('f6', 2), ('f11', 2), ('f13', 2), ('f20', 3), ('f24', 2), ('f27', 3), ('f28', 3), ('f29', 3), ('f31', 3), 
              ('f38', 3), ('f40', 4), ('f42', 5), ('f47', 2), ('f49', 3), ('f54', 2), ('f56', 5), ('f61', 5), ('f65', 2), 
              ('f67', 5), ('f70', 2), ('f75', 4), ('f81', 8), ('f85', 4), ('f88', 4), ('f91', 3),  
              ('f99', 2), ('f100', 5), ('f109', 4)]

# One subplot per feature, four per row. The row count is derived from the list
# length so the grid always has enough axes when the list is edited.
n_cols = 4
n_rows = max(1, (len(for_kmeans) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 60), squeeze=False)

for idx, (f, n_clusters) in enumerate(for_kmeans):
    # Each feature is clustered on its own, as an (n_rows, 1) column vector.
    data = all_df[[f]].values

    # Seeded so the engineered features are identical from run to run.
    km = KMeans(n_clusters=n_clusters, n_init=50, random_state=R_SEED)
    km.fit(data)

    plot_kmeans(data, km.labels_, n_clusters, f, axes[idx // n_cols, idx % n_cols])

    # New features: distance of every row to each cluster centre. The arrays are
    # assigned positionally, so the frame's row order is what matters here.
    _dist = km.transform(data)
    for centre in range(n_clusters):
        all_df[f + '_dist_from_' + str(centre)] = _dist[:, centre]

# Hide any grid cells left over after the last feature.
for spare_ax in axes.flat[len(for_kmeans):]:
    spare_ax.set_visible(False)
plt.show()

I created new features: for each of the specified original features, the distance of every value from each cluster centroid.

#### GaussianMixture

In [None]:
def plot_gmm(model, data, fea_name, ax):
    """Plot the components of a 1-D Gaussian mixture over the data histogram.

    The data are drawn as a 100-bin density histogram. For every mixture
    component a curve ``weight * N(mean, variance)`` is then drawn on a grid
    of 100 points spanning the data range.

    Args:
        model: Fitted mixture exposing ``weights_``, ``means_`` and
            ``covariances_``, each indexable by component. The first entry of
            a component's mean / covariance is used as its scalar mean /
            variance, i.e. the data are treated as one-dimensional.
        data: The values the mixture was fitted on.
        fea_name: Text used as the axes title.
        ax: Axes-like object providing ``hist``, ``plot`` and ``set_title``.
    """
    weights = model.weights_
    means = model.means_
    covars = model.covariances_

    ax.hist(data, 100, density=True, alpha=0.2, color='k')

    # linspace yields exactly 100 points covering [min, max] inclusive. A
    # computed float step with arange stops short of the maximum and breaks
    # with a zero step when every value is identical.
    x = np.linspace(np.min(data), np.max(data), 100)

    for i, weight in enumerate(weights):
        mean = np.ravel(means[i])[0]
        std = np.sqrt(np.ravel(covars[i])[0])
        ax.plot(x, weight * stats.norm.pdf(x, mean, std), alpha=0.7, linewidth=3)
    ax.set_title(fea_name)

In [None]:
# (feature, number of mixture components) pairs; the counts are guesswork.
for_gmm = [('f11', 2), ('f13', 2), ('f23', 2), ('f24', 2), ('f38', 3), ('f47', 2), ('f49', 3),
           ('f54', 2), ('f55', 2), ('f58', 3), ('f59', 4), ('f60', 3), ('f65', 3), ('f66', 2), ('f94', 3), ('f99', 3), 
           ('f105', 2), ('f106', 5), ('f107', 4)]

# One subplot per feature, four per row. The row count is derived from the list
# length so the grid always has enough axes when the list is edited.
n_cols = 4
n_rows = max(1, (len(for_gmm) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 60), squeeze=False)

for idx, (f, n_clusters) in enumerate(for_gmm):
    # Each feature is modelled on its own, as an (n_rows, 1) column vector.
    data = all_df[[f]].values

    # Seeded so the engineered features are identical from run to run.
    gm = GaussianMixture(n_components=n_clusters, n_init=50, random_state=R_SEED)
    gm.fit(data)
    hard_labels = gm.predict(data)
    # One column per mixture component: membership probability of every row.
    proba = gm.predict_proba(data)

    plot_gmm(gm, data, f + '_clus_gmm', axes[idx // n_cols, idx % n_cols])

    # New features: most likely component, then one probability per component.
    all_df[f + '_clus_gmm'] = hard_labels
    for j in range(proba.shape[1]):
        all_df[f + '_clus_gmm_' + str(j)] = proba[:, j]

# Hide any grid cells left over after the last feature.
for spare_ax in axes.flat[len(for_gmm):]:
    spare_ax.set_visible(False)
plt.show()

The GMM features represent the probability that a value of the original feature belongs to each fitted Gaussian component (plus the label of the most likely component).

In [None]:
# Preview the first rows of the merged frame, including the added columns.
all_df.head()

In [None]:
# Show every column name currently in the merged frame.
all_df.columns.values

In [None]:
# 1-------------------^^^
# Undo the merge. The rows were stacked train-first, so the first n_train rows
# are the training set and everything after them is the test set. Slicing from
# n_train (instead of a negative start computed from the test length) stays
# correct when the test part is empty, where iloc[-0:] would return every row.
n_train = train_df.shape[0]
train_df, test_df = all_df.iloc[:n_train, :].copy(), all_df.iloc[n_train:, :].copy()

In [None]:
# Write the engineered train and test feature tables to CSV files in the
# working directory, leaving out the row index.
train_df.to_csv("trainfeat.csv", index=False)
test_df.to_csv("testfeat.csv", index=False)