In [None]:
import os, gc, logging, warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from scipy.spatial import ConvexHull
from scipy.spatial import Voronoi, voronoi_plot_2d

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans, OPTICS

import cudf

import tensorflow as tf
warnings.filterwarnings("ignore")


# Load Data

In [None]:
# Read the competition CSV files, using the `row_id` column as the index of each frame.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col="row_id",
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col="row_id",
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Every column of the test frame is a model feature (the test frame carries no target column).
feature_cols = list(test.columns)

# Encode the string labels in `target` as integer class ids, stored in `targtt`.
# `le` is kept so predictions can be mapped back to labels later.
le = LabelEncoder()
train['targtt'] = le.fit_transform(train['target'])

# Cast all feature columns to float32 in both frames; other columns keep their dtype.
float_dtypes = dict.fromkeys(feature_cols, "float32")
train = train.astype(float_dtypes)
test = test.astype(float_dtypes)

In [None]:
# List the distinct raw labels present in the `target` column.
train['target'].unique()

In [None]:
## Handle duplicates
# Keep only the first occurrence of each fully repeated row, then renumber
# the index from 0 (the previous index values are discarded).
is_repeat = train.duplicated(keep='first')
train = train[~is_repeat].reset_index(drop=True)
print(train.shape)



In [None]:
# Preview the training frame after duplicate removal (index is now 0..n-1).
train.head()

# AutoEncoder for Dimensionality Reduction
An autoencoder is an unsupervised artificial neural network that learns to encode data by compressing it into a lower-dimensional representation (the bottleneck layer, or code) and then decoding that representation to reconstruct the original input. The bottleneck layer therefore holds the compressed representation of the input data.

![Autoencoder architecture diagram](https://miro.medium.com/max/882/0*j9IZ2cJa2hS3TlOd.png)

In [None]:
# Size of the bottleneck, i.e. the number of encoded features produced per row.
n_components = 2

# Seed TensorFlow and the weight initializer so the run is repeatable.
tf.random.set_seed(42)
KI = tf.keras.initializers.glorot_uniform(seed=42)

# Width of the input (and of the reconstructed output).
n_features = train[feature_cols].shape[1]

# Encoder: input -> 256 ReLU units -> `n_components` swish units (the code).
encoder_input = tf.keras.layers.Input(shape=(n_features,), name="Input")
encoder_x = tf.keras.layers.Dense(
    256, activation="relu", kernel_initializer=KI, name="Encoder"
)(encoder_input)
encoder_output = tf.keras.layers.Dense(
    n_components, activation="swish", kernel_initializer=KI, name="Bottleneck"
)(encoder_x)

# Decoder: code -> 256 ReLU units -> linear layer with one unit per input feature.
decoder_x = tf.keras.layers.Dense(
    256, activation="relu", kernel_initializer=KI, name="Decoder"
)(encoder_output)
decoder_output = tf.keras.layers.Dense(
    n_features, kernel_initializer=KI, name="Output"
)(decoder_x)

# `encoder` shares its layers with `autoencoder`, so fitting the latter trains both.
encoder = tf.keras.Model(encoder_input, encoder_output)
autoencoder = tf.keras.Model(encoder_input, decoder_output)

autoencoder.compile(optimizer="adam", loss="mse")

# The features are both input and reconstruction target.
history = autoencoder.fit(train[feature_cols], train[feature_cols], epochs=5)

In [None]:
# Draw the autoencoder's layer graph top-to-bottom, annotated with tensor shapes.
tf.keras.utils.plot_model(autoencoder,show_shapes=True,rankdir="TB")

In [None]:
# Project both frames through the trained encoder to obtain the bottleneck codes.
enc_x = encoder.predict(train[feature_cols])
enc_t = encoder.predict(test[feature_cols])

# Store each code dimension as a column ENC1..ENCn in both frames.
# Each column is assigned exactly once (the original wrote every column twice).
enc_cols = []
for i in range(n_components):
    col = f"ENC{i+1}"
    train[col] = enc_x[:, i]
    test[col] = enc_t[:, i]
    enc_cols.append(col)

# The models and raw code arrays are not needed any more; release them.
# NOTE: after this `del`, this cell and the autoencoder cells cannot be re-run
# without rebuilding the model first.
del autoencoder, encoder, history, enc_x, enc_t
gc.collect()


In [None]:
# Preview only the encoded (ENC*) columns that were just added to the training frame.
train[enc_cols].head()

In [None]:
# Preview the full training frame, which now also contains the encoded columns.
train.head()

In [None]:
# Working copies for plotting/clustering: the two encoded columns, with the
# training labels converted back from integer ids to their original names.
df_x = train[["ENC1", "ENC2", "targtt"]].assign(
    targtt=lambda frame: le.inverse_transform(frame["targtt"])
)
df_t = test[["ENC1", "ENC2"]].copy()

# Scatter of the 2-D code, coloured by target label.
enc_fig, enc_ax = plt.subplots(figsize=(25,10))
sns.scatterplot(
    data=df_x,
    x="ENC1",
    y="ENC2",
    hue="targtt",
    alpha=0.8,
    palette="bright",
    ax=enc_ax,
)
enc_ax.set_title("AutoEncoder")
plt.show()

In [None]:
# Preview the plotting frame (encoded columns plus decoded target label).
df_x.head()

In [None]:
# One scatter facet per target label (wrapping after 5 facets per row),
# so each label's region of the encoded space can be inspected separately.
facet_options = dict(x="ENC1", y="ENC2", col="targtt", hue="targtt", col_wrap=5)
sns.relplot(data=df_x, **facet_options)
plt.show()


# Clustering

In [None]:
gc.collect()

# Cluster the encoded training points into `n_clusters` groups.
n_clusters = 10
kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=0,
    max_iter=1000,
    n_init=100,
)

# Fit on the training codes and record each training row's cluster, then
# assign test rows to the nearest fitted centroid.
df_x["cluster"] = kmeans.fit(train[enc_cols]).labels_
df_t["cluster"] = kmeans.predict(test[enc_cols])

In [None]:
# Display the test-set encodings together with their assigned cluster.
df_t

In [None]:
# Figure layout: panels A and B share the top row, panel C spans the bottom row.
mosaic = """
AB
CC
"""
fig = plt.figure(constrained_layout=True, figsize=(25,20))
ax_dict = fig.subplot_mosaic(mosaic)

# Fitted cluster centres. NOTE(review): neither this variable nor panels B/C
# are used in the visible part of the notebook — confirm whether later cells rely on them.
centroids_list = kmeans.cluster_centers_

# Panel A: training codes coloured by their K-means cluster.
sns.scatterplot(
    data=df_x,
    x="ENC1",
    y="ENC2",
    hue="cluster",
    alpha=0.8,
    palette="bright",
    ax=ax_dict["A"],
)

In [None]:
# Preview the training frame before the PCA features are added.
train.head()

In [None]:
# Number of feature columns currently listed in `feature_cols`.
len(feature_cols)

# PCA

In [None]:
# Learn a 2-component PCA projection from the training features only.
pca = PCA(n_components=2, random_state=43)
x_pca = pca.fit_transform(train[feature_cols])

# Apply the projection fitted on train to the test features. The original
# re-fitted PCA on the test set, which puts train and test components in
# different bases and makes the PC columns incomparable for the downstream model.
t_pca = pca.transform(test[feature_cols])


In [None]:
# Column names PC1..PCk, one per principal component returned by the PCA step.
n_pcs = x_pca.shape[1]
pca_cols = [f"PC{k}" for k in range(1, n_pcs + 1)]

# Wrap the component arrays in frames aligned to the index of their source frame.
X_pca = pd.DataFrame(x_pca, index=train.index, columns=pca_cols)
T_pca = pd.DataFrame(t_pca, index=test.index, columns=pca_cols)
          

In [None]:
# Append the principal-component columns to each frame (aligned on the index).
train = pd.concat([train, X_pca], axis="columns")
test = pd.concat([test, T_pca], axis="columns")

# The PCA model and the temporary component frames are no longer needed.
del pca, X_pca, T_pca

train[pca_cols].head()

# Train Neural Network

In [None]:
# Add the engineered ENC* and PC* columns to the feature list.
# Names already present are skipped, so re-running this cell does not append
# duplicates (the original `+=` grew the list — and therefore the model's input
# width — on every execution). The list is still extended in place.
feature_cols.extend(
    col for col in enc_cols + pca_cols if col not in feature_cols
)

gc.collect()

# Hold out 20% of the training rows for validation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    train[feature_cols],
    train["targtt"],
    test_size=0.2,
    random_state=34,
)

In [None]:
# Preview the training frame with all engineered columns in place.
train.head()

In [None]:
# Seeded initializer reused by every dense layer of the classifier.
KI = tf.keras.initializers.glorot_uniform(seed=231)

inputs = tf.keras.layers.Input(shape=(train[feature_cols].shape[1],))

# Hidden stack of ReLU layers: widths expand to 512 and contract back to 128.
x = inputs
for units in (128, 256, 512, 256, 128):
    x = tf.keras.layers.Dense(units, activation="relu", kernel_initializer=KI)(x)

# Dropout on the last hidden layer's output.
x = tf.keras.layers.Dropout(0.2)(x)

# Softmax over the 10 output classes.
outputs = tf.keras.layers.Dense(10, activation="softmax", kernel_initializer=KI)(x)
model = tf.keras.Model(inputs, outputs)

model.summary()

In [None]:
# Callbacks, both watching the validation loss:
#  - early stopping after 4 epochs without improvement, restoring the best weights;
#  - halving the learning rate after 2 such epochs, but never below 1e-5.
cb_es = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=4,
    mode="min",
    restore_best_weights=True,
    verbose=1,
)
cb_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    mode="min",
    min_lr=1e-5,
    verbose=1,
)

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    callbacks=[cb_lr, cb_es],
)

# predictions

In [None]:
# Class probabilities for every test row; the predicted class id is the
# position of the largest probability in each row.
test_proba = model.predict(test[feature_cols])
pred = test_proba.argmax(axis=1)
pred

In [None]:
# Map the predicted integer class ids back to the original target labels using
# the encoder fitted on the training targets. `pred` is rebound from ids to
# labels here, so this cell is meant to run once per prediction cell run.
pred=le.inverse_transform(pred)


In [None]:
# Fill the sample submission's `target` column with the predicted labels.
# Assignment is positional; assumes the sample file lists rows in the same
# order as the test frame — TODO confirm.
sample_submission = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/sample_submission.csv"
).assign(target=pred)

sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()