In [None]:
import pandas as pd 
import os, time  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from typing import Tuple, List, Union, Dict 

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
import umap 
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, RandomTreesEmbedding
from sklearn.metrics import log_loss, accuracy_score

from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.models import Sequential
import warnings 

warnings.simplefilter("ignore")

# Procedure  
---  
  

1. KFold 
2. Stacking Layer1 
3. TSNE 
4. Clustering 
5. Stacking Layer2  
6. submit 

In [None]:
### ---> dataset location and size
# DEBUG_NROWS = None reads the full production dataset; set it to a small
# integer (e.g. 1000) to load only the first rows for a quick debug run.
DATA_DIR = "../input/tabular-playground-series-sep-2021"
DEBUG_NROWS = None

train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), nrows=DEBUG_NROWS)
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), nrows=DEBUG_NROWS)

# The `id` column is removed from both frames before any feature work.
train = train.drop(columns="id")
test = test.drop(columns="id")
train.head()

In [None]:
# Preview the first rows of the test frame (the `id` column has already been dropped).
test.head()

In [None]:
# Class balance of the target column.
# `startangle` is an angle in degrees and `counterclock` is a boolean; the
# labels are taken from value_counts() itself so each label is guaranteed to
# sit on the wedge whose size it describes.
claim_counts = train["claim"].value_counts()
plt.pie(x=claim_counts.values, labels=claim_counts.index,
        startangle=90, counterclock=False, autopct="%1.1f%%")
plt.show()

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Per-row count of missing values, compared between the two target classes.
is_nan = train.isna().sum(axis=1)
df_is_nan = train[["claim"]].assign(nan_cnt=is_nan)[["nan_cnt", "claim"]]
df_is_nan.groupby("claim").mean()

In [None]:
# Store the per-row missing-value count as a feature in both frames.
for frame in (train, test):
    frame["is_nan"] = frame.isnull().sum(axis=1)

# Fill values 

In [None]:
# Impute every feature column with its training-set mean.
# The means are computed once, before anything is filled, and the very same
# values are applied to both frames so train and test are imputed identically.
cols = train.drop(["claim"], axis=1).columns
col_means = train[cols].mean()
train[cols] = train[cols].fillna(col_means)
test[cols] = test[cols].fillna(col_means)

# PCA 

In [None]:
# Min-max scale the features (scaler fitted on train only), then project both
# frames onto the leading principal components.
n_components = 65
s = MinMaxScaler()
train_s = s.fit_transform(train.drop(columns="claim"))
test_s = s.transform(test)

pca = PCA(n_components=n_components, random_state=42)  # explained ratio over 80.0%
pca_tr = pca.fit_transform(train_s)
pca_te = pca.transform(test_s)

# Cumulative explained variance against the number of components kept.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(np.cumsum(pca.explained_variance_ratio_))
ax.set_xticks(list(range(n_components)))
ax.set_xlabel("n_components")
ax.set_ylabel("explained_variance_ratio_")
ax.grid()
plt.show()

In [None]:
# Replace the raw features with their PCA projections, keeping the target.
y = train["claim"].tolist()
component_names = [f"c{i}" for i in range(65)]
train = pd.DataFrame(pca_tr, columns=component_names, index=train.index)
train["claim"] = y
test = pd.DataFrame(pca_te, columns=component_names, index=test.index)

# The projected arrays now live inside the frames; release the originals.
del pca_tr, pca_te
train.head()

In [None]:
# Subsample: only the stratified 30% partition (the "test" side of the split)
# is kept and used as the working training frame from here on.
train = train_test_split(train, test_size=0.3, stratify=train["claim"], random_state=42)[1]
train.shape

In [None]:
# Renumber the rows 0..n-1 after subsampling and discard the old index.
train = train.reset_index(drop=True)

# Cross-validation split 

In [None]:
def k_split(train: pd.DataFrame, k=4) -> pd.DataFrame:
    """Assign a stratified cross-validation fold id to every row.

    Parameters
    ----------
    train : pd.DataFrame
        Frame containing a ``claim`` column, which is used as the
        stratification target.
    k : int, default 4
        Number of folds.

    Returns
    -------
    pd.DataFrame
        The same frame object (it is modified in place) with an added
        ``fold`` column of dtype ``uint8`` holding, for each row, the id of
        the fold in which that row is part of the validation split.
    """
    kf = StratifiedKFold(n_splits=k, random_state=42, shuffle=True)
    # split() yields *positional* indices. Filling a positional array (instead
    # of writing through .loc, which interprets them as index labels) keeps
    # the assignment correct for any index, not just a default 0..n-1 one.
    folds = np.zeros(len(train), dtype=np.uint8)
    for i, (_, va) in enumerate(kf.split(train, train.claim)):
        folds[va] = i
    train["fold"] = folds
    return train

# Add the `fold` column (default k=4 stratified folds) and preview the result.
train = k_split(train)
train.head()

In [None]:
# One pie chart per fold showing the target distribution inside that fold.
for fold_id in range(4):
    fold_counts = train.loc[train["fold"] == fold_id, "claim"].value_counts()
    plt.subplot(2, 2, fold_id + 1)
    plt.pie(x=fold_counts.values, labels=fold_counts.index,
            startangle=90, counterclock=False, autopct="%1.1f%%")
    plt.title(f"Fold{fold_id}")
plt.tight_layout()

# Stacking_Models

In [None]:
def Net(input_size=65):
    """Build and compile the dense binary classifier used as a base model.

    Parameters
    ----------
    input_size : int, default 65
        Number of input features. The default matches the number of PCA
        components used elsewhere in this notebook.

    Returns
    -------
    A compiled Keras ``Sequential`` model: two ReLU hidden layers (256 and
    128 units), each followed by 20% dropout, and a single sigmoid output
    unit, compiled with binary cross-entropy loss and the Adam optimizer.
    """
    model = Sequential([
        Dense(256, activation="relu", input_shape=(input_size, )),
        Dropout(0.2),
        Dense(128, activation="relu"),
        Dropout(0.2),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

def model_layers(seed):
    """Build the base (first-layer) models used for stacking.

    Parameters
    ----------
    seed : int
        Random state handed to every estimator that accepts one.

    Returns
    -------
    list of (str, Pipeline)
        One ``(name, pipeline)`` pair per base model; every pipeline holds a
        single estimator step.
    """
    estimators = [
        ("XGBClassifier", "XGB",
         XGBClassifier(n_jobs=-1, random_state=seed)),
        # SVC is currently disabled:
        # ("SVC", "SVC", SVC(random_state=seed)),
        ("DecisionTreeClassifier", "DecisionTreeClassifier",
         DecisionTreeClassifier(random_state=seed)),
        ("RandomForestClassifier", "RandomForestClassifier",
         RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=seed)),
        ("RidgeClassifier", "RidgeClassifier",
         RidgeClassifier(random_state=seed)),
        ("NN", "NN",
         Net()),
        ("ExtraTreesClassifier", "ExtraTreesClassifier",
         ExtraTreesClassifier(n_jobs=-1, random_state=seed)),
        # The bagging model now honours `seed` like every other estimator
        # (it previously used a fixed random_state of 42).
        # NOTE(review): the entry is named "BaggingRidgeClassifier" but no
        # Ridge base estimator is passed, so the library default is bagged -
        # confirm whether RidgeClassifier was intended as the base estimator.
        ("BaggingRidgeClassifier", "BaggingClassifier",
         BaggingClassifier(n_jobs=-1, random_state=seed)),
    ]
    return [(name, Pipeline([(step, estimator)])) for name, step, estimator in estimators]


'''
Layer1:

predict validation -> Next train dataset 
predict test -> mean test 
'''

class Layer1():
    """First stacking layer.

    Every base model is trained once per cross-validation fold. Its
    predictions on the held-out fold become one feature column of the next
    layer's training set, and its per-fold predictions on the test set are
    averaged into the matching test feature column.
    """

    def __init__(self, seed=42):
        # List of (name, Pipeline) pairs, one per base model.
        self.models = model_layers(seed)

    def train(self, train, test) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce the stacked feature frames for the next layer.

        Parameters
        ----------
        train : pd.DataFrame
            Feature columns plus ``claim`` (target) and ``fold`` (fold id).
            The index must be unique.
        test : pd.DataFrame
            The same feature columns as ``train``, without ``claim``/``fold``.

        Returns
        -------
        (df_train, df_test)
            ``df_train`` holds one ``<name>_feature`` column per model with
            its out-of-fold predictions in the row order of ``train``, plus
            the ``fold`` and ``claim`` columns. ``df_test`` holds one
            ``<name>_feature`` column per model with the fold-averaged test
            predictions.
        """
        df_train, df_test = pd.DataFrame(), pd.DataFrame()
        # Folds are read from the data instead of assuming exactly four.
        fold_ids = sorted(train["fold"].unique())
        non_features = ["claim", "fold"]
        for name, model in self.models:
            predict_val, predict_test, val_index = [], [], []
            for fold in fold_ids:
                in_val = train["fold"] == fold
                x_tr, x_va = train[~in_val], train[in_val]
                x_train, y_train = x_tr.drop(non_features, axis=1), x_tr[["claim"]]
                x_val, y_val = x_va.drop(non_features, axis=1), x_va[["claim"]]
                x_train, x_val, x_test = self._transform(x_train, x_val, test)

                pred_val, va_idx, pred_test = self._predict_cv(model,
                                                               x_train,
                                                               y_train,
                                                               x_val,
                                                               y_val,
                                                               x_test,
                                                               name)
                self._logs(pred_val, y_val, fold, name)
                predict_val.append(pred_val)
                predict_test.append(pred_test)
                val_index.append(va_idx)

            # Put every out-of-fold prediction back at the position of the row
            # it belongs to, so the column lines up with `fold` and `claim`.
            preds = np.concatenate(predict_val, axis=0)
            positions = train.index.get_indexer(np.concatenate(val_index))
            pred_train = np.empty(len(train), dtype=preds.dtype)
            pred_train[positions] = preds  # next-layer train feature
            pred_test = np.mean(predict_test, axis=0)  # next-layer test feature

            df_train[f"{name}_feature"] = pred_train
            df_test[f"{name}_feature"] = pred_test
        df_train["fold"] = train["fold"].to_list()
        df_train["claim"] = train["claim"].values.ravel()
        return df_train, df_test

    def _logs(self, pred_val, y_val, fold, name):
        """Print log-loss and accuracy of one model on one validation fold.

        ``pred_val`` holds hard 0/1 predictions, so the log-loss is computed
        on labels rather than on probabilities.
        """
        y_true = y_val.values.ravel()
        y_pred = np.asarray(pred_val).ravel()
        # scikit-learn metrics take the ground truth first, predictions second.
        loss = log_loss(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        print(
            f"Model: {name} | Fold: {fold} | Loss: {loss:.5f} | Accuracy: {accuracy:.5f}"
        )

    def _predict_cv(self, model, x_train, y_train, x_val, y_val, test, name):
        """Fit ``model`` on one fold and predict its validation and test rows.

        Returns
        -------
        (pred_val, va_idx, pred_test)
            1-D label predictions for the validation rows, the index labels
            of those rows, and 1-D label predictions for the test rows.
            For the model named ``"NN"`` the raw outputs are thresholded at
            0.5 to obtain labels.
        """
        # NOTE(review): the same estimator object is refit for every fold.
        # Presumably the scikit-learn/XGBoost models restart from scratch on
        # fit(), but the Keras network may continue from the weights of the
        # previous fold - confirm and rebuild it per fold if so.
        model.fit(x_train, np.asarray(y_train).ravel())
        # Flatten so that (n, 1) network outputs and (n,) classifier outputs
        # are handled the same way downstream.
        pred_val = np.asarray(model.predict(x_val)).ravel()
        pred_test = np.asarray(model.predict(test)).ravel()
        if name == "NN":
            pred_val = np.where(pred_val >= 0.5, 1, 0)
            pred_test = np.where(pred_test >= 0.5, 1, 0)
        return pred_val, x_val.index, pred_test

    def _transform(self, train, val, test) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        '''Standardize train/val/test with a StandardScaler fitted on train only.

        Column names and the index of every frame are preserved.
        '''
        col = train.columns
        scaler = StandardScaler()
        scaled_train = pd.DataFrame(scaler.fit_transform(train), columns=col, index=train.index)
        scaled_val = pd.DataFrame(scaler.transform(val), columns=col, index=val.index)
        scaled_test = pd.DataFrame(scaler.transform(test), columns=col, index=test.index)
        return scaled_train, scaled_val, scaled_test
    
'''
Layer2:
parameter tuning 
'''    

class Layer2():
    """Second stacking layer: a grid-searched XGBoost meta-model per fold."""

    def __init__(self):
        # Hyper-parameter grid explored for every fold.
        # colsample_bytree is a column-subsampling fraction and eta a learning
        # rate; the XGBoost parameter reference bounds both above by 1, so the
        # grid stays inside that range.
        self.param = {
            "max_depth": [1, 3, 5, 8, 10],
            "colsample_bytree": [0.2, 0.4, 0.6, 0.8, 1.0],
            "eta": [0.001, 0.01, 0.1, 0.3, 1.0]
        }

    def fit(self, train):
        """Tune, train and checkpoint one XGBoost model per fold.

        Parameters
        ----------
        train : pd.DataFrame
            Stacked features plus the ``fold`` and ``claim`` columns.

        Returns
        -------
        np.ndarray
            Out-of-fold probability of the positive class for every row,
            in the row order of ``train``.
        """
        predict_proba = np.zeros(len(train))
        # Folds are read from the data instead of assuming exactly four.
        for fold in sorted(train["fold"].unique()):
            start = time.time()
            in_val = (train["fold"] == fold).to_numpy()
            tr, va = train[~in_val], train[in_val]
            x_train, x_val = tr.drop(["fold", "claim"], axis=1), va.drop(["fold", "claim"], axis=1)
            y_train, y_val = tr["claim"].to_numpy(), va["claim"].to_numpy()

            grid = GridSearchCV(XGBClassifier(random_state=42, verbosity=0),
                                param_grid=self.param,
                                cv=2).fit(x_train, y_train)
            model = self._fit_best(grid.best_params_, x_train, y_train, x_val, y_val)
            pred_val = model.predict(x_val)
            proba = model.predict_proba(x_val)[:, 1]
            # Write each fold's probabilities at the positions of its own
            # rows so the result lines up with `train`.
            predict_proba[in_val] = proba
            self._logs(pred_val, y_val, fold, start, proba)
            self._save(model, fold)
        return predict_proba

    def _fit_best(self, params, x_train, y_train, x_val, y_val, rounds=30):
        """Train the final model of a fold with early stopping on its validation rows.

        ``eval_set`` is a fit-time argument. ``early_stopping_rounds`` is
        accepted by ``fit`` in some xgboost releases and by the constructor
        in others, so the installed ``fit`` signature decides where it goes.
        """
        import inspect  # local import: only needed for this signature probe

        eval_set = [(x_train, y_train), (x_val, y_val)]
        if "early_stopping_rounds" in inspect.signature(XGBClassifier.fit).parameters:
            model = XGBClassifier(random_state=42, verbosity=0, **params)
            return model.fit(x_train, y_train, eval_set=eval_set,
                             early_stopping_rounds=rounds, verbose=False)
        model = XGBClassifier(random_state=42, verbosity=0,
                              early_stopping_rounds=rounds, **params)
        return model.fit(x_train, y_train, eval_set=eval_set, verbose=False)

    def _logs(self, pred_val, y_val, fold, start, proba=None):
        """Print log-loss, accuracy and elapsed time for one fold.

        The log-loss uses ``proba`` when it is given and falls back to the
        hard labels in ``pred_val`` otherwise; ``start`` is the
        ``time.time()`` stamp taken when the fold began.
        """
        y_true = np.asarray(y_val).ravel()
        y_hat = np.asarray(pred_val).ravel()
        scores = y_hat if proba is None else np.asarray(proba).ravel()
        # scikit-learn metrics take the ground truth first, predictions second.
        loss = log_loss(y_true, scores)
        accuracy = accuracy_score(y_true, y_hat)
        elapsed = time.time() - start
        print(
            f"Fold: {fold} | Loss: {loss:.5f} | Accuracy: {accuracy:.5f} | Time: {elapsed:.1f}s"
        )

    def _save(self, model, fold):
        """Checkpoint the fold's model as ``models/xgb<fold>.model``."""
        os.makedirs("models", exist_ok=True)
        model.save_model(f"models/xgb{str(fold)}.model")
        print("successfully checkpoint model")
        
    

In [None]:
# Instantiate both stacking layers; Layer1 builds its base models with the default seed of 42.
layer_first = Layer1()
layer_second = Layer2()

# Training Layer1

In [None]:
# Run every base model over the folds; the results are the stacked
# train/test feature frames consumed by the next steps.
df_train, df_test = layer_first.train(train, test)

In [None]:
# Give the test meta-features the same columns, in the same order, as the
# training meta-features (everything except the fold id and the target).
tr_col = df_train.columns[~df_train.columns.isin(["fold", "claim"])]
df_test = df_test[tr_col]
df_train.head()

# Clustering

In [None]:
# t-SNE offers only fit_transform(): two separate fits give two unrelated
# embeddings, so clusters learned on one cannot be applied to the other.
# Train and test are therefore embedded together in a single fit and the
# result is split back into its two parts.
features_tr = df_train.drop(["claim", "fold"], axis=1)
features_te = df_test[features_tr.columns]
tsne = TSNE(n_components=2, random_state=42)
embedding = tsne.fit_transform(pd.concat([features_tr, features_te], ignore_index=True))
pca_tr, pca_te = embedding[:len(features_tr)], embedding[len(features_tr):]

# Training rows in the shared embedding, coloured by target class.
y = df_train["claim"].to_numpy()
plt.scatter(x=pca_tr[y == 0, 0], y=pca_tr[y == 0, 1], c="b")
plt.scatter(x=pca_tr[y == 1, 0], y=pca_tr[y == 1, 1], c="r")
plt.title("TSNE vs claim")
plt.grid()
plt.legend(["0", "1"])
plt.show()

In [None]:
# Two-cluster KMeans fitted on the training embedding, then applied to test.
km = KMeans(n_clusters=2, random_state=42)
tr_cluster = km.fit_predict(pca_tr)
te_cluster = km.predict(pca_te)

# One panel per cluster: its members in colour, with all training points
# overlaid faintly in black for context.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax = axes.ravel()
for i in range(2):
    members = tr_cluster == i
    ax[i].scatter(pca_tr[members, 0], pca_tr[members, 1], c="b" if i == 0 else "r", alpha=0.8)
    # `color=` states unambiguously that this is one colour for all points,
    # not a sequence of three values to be colour-mapped.
    ax[i].scatter(pca_tr[:, 0], pca_tr[:, 1], color="k", alpha=0.1)
    ax[i].set_title(f"cluster{i+1}")

In [None]:
# Attach the KMeans cluster label of every row as an extra feature column.
df_train["cluster"] = tr_cluster 
df_test["cluster"] = te_cluster 

In [None]:
# Correlation between the cluster label and the target.
df_train[["cluster", "claim"]].corr()

In [None]:
# Preview the stacked training frame, now including the `cluster` column.
df_train.head()

# Training Layer2

In [None]:
# Fit the second layer on the stacked features; `predict` receives the
# positive-class probabilities that Layer2.fit collects on the validation folds.
predict = layer_second.fit(df_train) # return predict proba 

In [None]:
# Distribution of the validation-fold probabilities from the second layer.
sns.histplot(predict).set_title("Train")
plt.show()

# Submit 

In [None]:
def submittion(test, n_folds=4, model_dir="models"):
    """Average the per-fold model predictions and write the submission file.

    Parameters
    ----------
    test : pd.DataFrame
        Test features, with the same columns the saved models were trained on.
    n_folds : int, default 4
        Number of saved fold models to load.
    model_dir : str, default "models"
        Directory holding the checkpoints named ``xgb<fold>.model``.

    Returns
    -------
    np.ndarray
        Mean positive-class probability over the fold models, one value per
        test row. It is returned even when the submission file could not be
        written; in that case the reason is printed.
    """
    model = XGBClassifier(random_state=42)
    fold_preds = []
    for fold in range(n_folds):
        # Address each checkpoint by name: a directory listing comes in no
        # guaranteed order and may contain unrelated files.
        model.load_model(os.path.join(model_dir, f"xgb{fold}.model"))
        fold_preds.append(model.predict_proba(test)[:, 1].ravel())
    predict = np.mean(np.array(fold_preds), axis=0)

    try:
        sub = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
        sub["claim"] = predict
        sub.to_csv("submit.csv", index=False)
    except (OSError, ValueError) as err:
        # Best effort: a missing sample file, or a row-count mismatch such as
        # on a truncated debug dataset, must not discard the predictions -
        # but it is reported instead of being silently swallowed.
        print(f"submission file was not written: {err}")
    return predict
    

In [None]:
# Predict the test set with the saved fold models and write the submission.
predict = submittion(df_test)

# Distribution of the averaged test probabilities.
sns.histplot(predict).set_title("Test")
plt.show()