In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

import os
# Lazily enumerate the full path of every file found beneath the Kaggle input
# directory, then print them one per line in os.walk() traversal order.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
%%time
# Read the three competition CSV files (train, test, sample submission) in order.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    ),
)

# Per-column memory footprint of the training frame, converted from bytes to
# MiB, and its total as a baseline figure.
BYTES_PER_MIB = 1024 ** 2
memory_usage = train.memory_usage(deep=True) / BYTES_PER_MIB
start_mem = memory_usage.sum()

In [None]:
# Pairwise correlation between all columns of the training frame
# (DataFrame.corr() with its default arguments).
cor_1 = train.corr()

# A notebook cell only renders its LAST expression, so a bare `cor_1.head()`
# followed by `cor_1.shape` silently discards the head. Print the shape
# explicitly and finish the cell on the head so both are shown.
print(cor_1.shape)
cor_1.head()

In [None]:
# Keep only the 'target' row of the correlation matrix (as a one-row frame).
cor_target = cor_1.loc[['target']]
# The correlation of 'id' and of 'target' with itself is not informative.
cor_2 = cor_target.drop(columns=['id', 'target'])
# Rank by strength of the relationship, regardless of sign.
cor_3 = cor_2.abs()
cor_4 = cor_3.sort_values(by='target', axis=1, ascending=False)

# Widen the column limit so the whole single-row ranking is visible.
# The previous global `display.max_rows = 1` was dropped: the frame shown here
# has one row anyway, and the option stayed in force for every later cell,
# truncating displays such as `train.head()` to a single row.
pd.set_option('display.max_columns', 285)
cor_4.head()

Columns whose absolute correlation with `target` is greater than 0.1 are used to generate new features with the KMeans method.

KMeans

In [None]:
# Columns the clustering is fitted on.
useful_features = ['f34','f55','f43','f71','f80','f91','f8','f27']

# Elbow-method sweep: fit one KMeans model per candidate cluster count (1..14)
# and record its inertia (within-cluster sum of squared distances).
distortions = [
    KMeans(
        n_clusters=cluster_count,
        init="k-means++",
        max_iter=500,
        random_state=42,
    ).fit(train[useful_features]).inertia_
    for cluster_count in range(1, 15)
]

In [None]:
import matplotlib.pyplot as plt
# Elbow plot: inertia of each fitted model against its number of clusters.
fig, ax = plt.subplots()
ax.plot(range(1,15), distortions, marker="o")
ax.set_xticks(range(1,15))
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Distortion")
plt.show()

In [None]:
%%time
# OneHotEncoder is needed by the `cd_feature = False` branch below. It was
# previously used without being imported, so that branch raised NameError.
from sklearn.preprocessing import OneHotEncoder

# Start the feature list from the columns of the test frame; the generated
# cluster columns are appended at the end of this cell.
# NOTE(review): this presumably includes the 'id' column - confirm that it is
# meant to be treated as a feature downstream.
feature_cols = test.columns.tolist()
useful_features = ['f34','f55','f43','f71','f80','f91','f8','f27']
n_clusters = 8
cd_feature = True # cluster distance instead of cluster number
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", max_iter=500, random_state=42)

if cd_feature:
    # One column per cluster holding the distance of each row to that
    # cluster's centre (KMeans.transform). Fit on train only, then apply the
    # same fitted centres to test.
    # train
    X_cd = kmeans.fit_transform(train[useful_features])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=train.index)
    train = train.join(X_cd)
    # test
    X_cd = kmeans.transform(test[useful_features])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=test.index)
    test = test.join(X_cd)

else:
    # Hard cluster assignment, fitted on train and applied to test.
    # train
    train["cluster"] = kmeans.fit_predict(train[useful_features])
    # test
    test["cluster"] = kmeans.predict(test[useful_features])

    # one-hot encode the cluster label (encoder fitted on train only)
    ohe = OneHotEncoder()
    X_ohe = ohe.fit_transform(train["cluster"].to_numpy().reshape(-1, 1)).toarray()
    T_ohe = ohe.transform(test["cluster"].to_numpy().reshape(-1, 1)).toarray()

    X_ohe = pd.DataFrame(X_ohe, columns=cluster_cols, index=train.index)
    T_ohe = pd.DataFrame(T_ohe, columns=cluster_cols, index=test.index)

    train = pd.concat([train, X_ohe], axis=1)
    test = pd.concat([test, T_ohe], axis=1)

feature_cols += cluster_cols
train.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(10, 5))

if cd_feature:
    # Density of each cluster-distance column.
    sns.kdeplot(data=train[cluster_cols], ax=ax)
else:
    # Row count per assigned cluster, split by target value, with the bar
    # height written inside each bar.
    sns.countplot(data=train, x='cluster', hue="target", ax=ax)
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.annotate(
            f'\n{bar_height}',
            (bar.get_x()+0.2, bar_height),
            ha='center',
            va='top',
            color='white',
            size=5,
        )

plt.show()

In [None]:
def add_feature(df, n_clusters=8):
    """Add pairwise ratio features between the cluster columns, in place.

    For every pair of columns ``cluster{i}`` / ``cluster{j}`` with ``i < j`` a
    new column ``new_f{k}`` holding ``df["cluster{i}"] / df["cluster{j}"]`` is
    written to ``df``. Pairs are numbered in the same order as the original
    hand-written version: numerator ascending (1, 2, ...), and for each
    numerator the denominator descending (n_clusters, n_clusters - 1, ...).
    With the default of 8 clusters this yields ``new_f1`` .. ``new_f28``
    (e.g. ``new_f1 = cluster1 / cluster8``, ``new_f12 = cluster2 / cluster4``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``cluster1`` .. ``cluster{n_clusters}``.
        It is modified in place.
    n_clusters : int, default 8
        Number of cluster columns to combine.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ratio columns added.

    Raises
    ------
    KeyError
        If one of the expected ``cluster{i}`` columns is missing.
    """
    feature_number = 1
    for numerator in range(1, n_clusters):
        # Denominators run from the last cluster down to numerator + 1 to keep
        # the historical new_f numbering.
        for denominator in range(n_clusters, numerator, -1):
            df[f"new_f{feature_number}"] = (
                df[f"cluster{numerator}"] / df[f"cluster{denominator}"]
            )
            feature_number += 1
    return df

# Names of the 28 ratio columns created by add_feature: new_f1 .. new_f28.
new_features = [f"new_f{number}" for number in range(1, 29)]

# Add the ratio columns to both frames and register them as model features.
train = add_feature(train)
test = add_feature(test)
feature_cols += new_features
train.head()

In [None]:
%%time
from sklearn.feature_selection import mutual_info_regression
# Score only the first 5000 training rows to keep this cell fast.
x = train.iloc[:5000][feature_cols].copy()
y = train.iloc[:5000]['target'].copy()

# mutual_info_regression is stochastic, so fix the seed (same value as the
# KMeans cells) to make the ranking reproducible across runs.
# NOTE(review): 'target' looks categorical (it is used as a countplot hue
# elsewhere in this notebook); if so, mutual_info_classif may be the more
# appropriate estimator - confirm.
mi_scores = mutual_info_regression(x, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
import plotly.figure_factory as ff
import plotly.express as px
# Horizontal bar chart of the `top` highest mutual-information scores.
top = 20
top_scores = mi_scores.iloc[:top]
fig = px.bar(mi_scores, x=top_scores.values, y=top_scores.index)

layout_options = {
    "title": f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    "xaxis_title": "Relationship with Target",
    "yaxis_title": "Feature Columns",
    # Order the bars by score.
    "yaxis": {'categoryorder':'total ascending'},
    "colorway": ["blue"],
}
fig.update_layout(**layout_options)
fig.show()

'cluster8'\
'new_f12' = (cluster2)/(cluster4)\
'new_f1' = (cluster1)/(cluster8)\
 These are the top 3 features with the strongest relationship to the target.\
 Good luck!