## This notebook is derived from the following 4th place model created by Mohan. Many thanks!
https://www.kaggle.com/motchan/tps-oct-2021-4th-place-importantmodel-kmeans-nn

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
import datatable as dt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_regression

%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import tensorflow as tf
from tensorflow.keras import layers

import shap
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
import gc

In [None]:
%%time
# Load the competition's train, test and sample-submission CSVs (in that order).
train, test, sample_submission = map(
    pd.read_csv,
    (
        '/kaggle/input/tabular-playground-series-oct-2021/train.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/test.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv',
    ),
)
print(train.shape, test.shape)

In [None]:
# Per-column memory footprint of the training frame in MB (deep=True also
# counts the contents of object columns); the total is kept as the baseline.
memory_usage = train.memory_usage(deep=True).div(1024 ** 2)
start_mem = memory_usage.sum()
print(start_mem)

In [None]:
%%time
# Every column of the test frame is treated as a model feature.
# NOTE(review): if the test frame carries an identifier column it ends up in
# feature_cols as well — confirm that this is intended.
feature_cols = test.columns.tolist()
useful_features = ["f22","f179","f69","f156","f58","f136","f214"]
n_clusters_1 = 12
cd_feature = True  # cluster distance instead of cluster number
# One new column per cluster, named f285, f286, ...
cluster_cols = [f"f{285 + offset}" for offset in range(n_clusters_1)]
kmeans = KMeans(
    n_clusters=n_clusters_1,
    init="k-means++",
    max_iter=500,
    random_state=42,
)
print('before ',train.shape)

if not cd_feature:
    # Hard assignment: fit on train, label both frames with their cluster id.
    train["cluster"] = kmeans.fit_predict(train[useful_features])
    test["cluster"] = kmeans.predict(test[useful_features])

    # One-hot encode the cluster id; the encoder is fitted on train only.
    ohe = OneHotEncoder()
    X_ohe = ohe.fit_transform(np.array(train["cluster"]).reshape(-1,1)).toarray()
    T_ohe = ohe.transform(np.array(test["cluster"]).reshape(-1,1)).toarray()

    X_ohe = pd.DataFrame(X_ohe, columns=cluster_cols, index=train.index)
    T_ohe = pd.DataFrame(T_ohe, columns=cluster_cols, index=test.index)

    train = pd.concat([train, X_ohe],axis=1)
    test = pd.concat([test, T_ohe],axis=1)
else:
    # Soft assignment: one column per cluster, filled from KMeans.transform.
    # The model is fitted on train and only applied to test.
    X_cd = pd.DataFrame(
        kmeans.fit_transform(train[useful_features]),
        columns=cluster_cols,
        index=train.index,
    )
    train = train.join(X_cd)

    X_cd = pd.DataFrame(
        kmeans.transform(test[useful_features]),
        columns=cluster_cols,
        index=test.index,
    )
    test = test.join(X_cd)

feature_cols.extend(cluster_cols)
print('after ', train.shape)
train.head()

In [None]:
def add_feature(df):
    """Append seven ratio columns (f297-f303) computed from existing columns.

    Each new column is the element-wise quotient of two columns in the
    f285-f294 range. The frame is modified in place and also returned.

    Args:
        df: DataFrame that already holds columns f285, f287 and f289-f294.

    Returns:
        The same DataFrame object, now carrying f297 through f303.
    """
    # (new column, numerator column, denominator column), applied in order.
    ratio_specs = (
        ("f297", "f289", "f294"),
        ("f298", "f285", "f289"),
        ("f299", "f289", "f290"),
        ("f300", "f290", "f291"),
        ("f301", "f285", "f287"),
        ("f302", "f292", "f293"),
        ("f303", "f285", "f291"),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / df[denominator]
    return df
# Names of the ratio columns that add_feature creates.
new_features = ["f297","f298","f299","f300","f301","f302","f303"]
print('before ', train.shape)
# Apply the same ratio features to both frames, then register them as features.
train, test = add_feature(train), add_feature(test)
feature_cols.extend(new_features)
print('after ', train.shape)
train.head()

In [None]:
# Columns stored as float64 are treated as continuous; every other feature
# column is treated as categorical. Order follows feature_cols.
cnt_features = [name for name in feature_cols if train[name].dtype == 'float64']
_continuous = set(cnt_features)
cat_features = [name for name in feature_cols if name not in _continuous]

In [None]:
%%time
bins = 128
# Bin edges: open-ended below and above, with interior edges at
# 1/bins, 2/bins, ..., (bins-1)/bins. Anything above the last interior edge
# therefore lands in the final bin.
bins_list = [-np.inf, *(step / bins for step in range(1, bins)), np.inf]

# Integer label for each of the `bins` intervals.
labels = list(range(bins))
for col in cnt_features:
    # Replace each continuous column by its bin label, in both frames.
    for frame in (train, test):
        frame[col] = pd.cut(frame[col], bins=bins_list, labels=labels)

train.head()

In [None]:
# Store every feature column as an unsigned 8-bit integer in both frames.
for frame in (train, test):
    frame[feature_cols] = frame[feature_cols].astype('uint8')

In [None]:
# Re-measure the training frame after binning and the uint8 cast, then compare
# it with start_mem, the baseline recorded right after loading the data.
# (The earlier version formatted end_mem as both the "from" and the "to" value,
# so it always reported a 0.00% reduction.)
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()
print(
    "Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem
    )
)

# Now let's try to use Deep AutoViML to see how well we do in this competition

In [None]:
!pip install deep_autoviml

In [None]:
import tensorflow as tf
tf.__version__

In [None]:
from deep_autoviml import deep_autoviml as deepauto

In [None]:
# Settings passed to deep_autoviml's fit/predict calls below.
project_name = 'Oct_TPS21'
keras_model_type = 'fast1'
target = 'target'  # name of the label column in the training frame
keras_options = dict(early_stopping=True)
model_options = dict()  # no overrides

In [None]:
# Train on the engineered training frame. fit returns the model together with
# cat_vocab_dict, which is handed back to deepauto.predict later.
model, cat_vocab_dict = deepauto.fit(
    train,
    target,
    keras_model_type,
    project_name,
    keras_options=keras_options,
    model_options=model_options,
    save_model_flag=True,
    use_my_model='',
    model_use_case='',
    verbose=1,
)

In [None]:
# Score the engineered test frame with the fitted model, reusing the
# vocabulary dictionary produced by deepauto.fit.
predictions = deepauto.predict(
    model,
    project_name,
    test_dataset=test,
    keras_model_type=keras_model_type,
    cat_vocab_dict=cat_vocab_dict,
)

In [None]:
# Preview column index 1 of the first element returned by deepauto.predict
# (presumably the positive-class score — TODO confirm against deep_autoviml's output layout).
predictions[0][:,1]

In [None]:
# Keep column index 1 of the first prediction element as the submission scores
# (presumably the positive-class score — TODO confirm against deep_autoviml's output layout).
preds = predictions[0][:,1]

In [None]:
# Histogram (with a KDE overlay) of the predicted scores.
fig, ax = plt.subplots(figsize=(15, 8))
flat_preds = preds.reshape(-1)
sns.histplot(x=flat_preds, kde=True, color="blue", ax=ax)
ax.set_title("Predictions Distribution")
ax.set_xlabel("Prediction")
plt.show()

In [None]:
### np.squeeze drops any length-1 axes and the result is displayed here.
### NOTE(review): preds was taken with [:,1], so if predictions[0] is 2-D it is
### already one-dimensional and this call changes nothing — confirm.
np.squeeze(preds)

In [None]:
# Fill the submission template's target column with the predicted scores
# (length-1 axes removed) and write it out without the index column.
submission_scores = np.squeeze(preds)
sample_submission['target'] = submission_scores
sample_submission.to_csv("submission.csv", index=False)