In [None]:
import numpy as np
import pandas as pd
import plotly as py
from statistics import mean
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from umap import UMAP

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold

import optuna

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import ExtraTreesClassifier

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate

# Seed NumPy's and TensorFlow's global random generators with the same value.
np.random.seed(228)
tf.random.set_seed(228)

# Remove the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [None]:
# Load the competition files. `train` and `test` use the `row_id` column as
# their index; the sample submission is read with its default index.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col='row_id')
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col='row_id')
ss = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Handling per column dtype:

    * object / pandas string columns are converted to ``category``;
    * signed integer columns are cast to the smallest of int8/16/32/64 whose
      range contains the column's min and max (bounds inclusive);
    * unsigned integer columns are cast to the smallest of uint8/16/32/64;
    * float columns are cast to float16 or float32 when min and max fit,
      otherwise float64 (this is also the result when min/max are NaN or inf);
    * every other dtype (bool, datetime, category, nullable extension
      dtypes, ...) is left untouched.

    NOTE: float16 keeps only about three significant decimal digits, so
    downcast float columns can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Memory usage before and after, and the relative reduction, are printed.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('MEMORY USAGE OF DATAFRAME IS: {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Anything else
        # (bool, datetime, category, nullable dtypes) must not be pushed
        # through the numeric min/max logic below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing smaller fits; comparisons
            # against NaN are False, so all-NaN columns also end up here.
            new_type = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break
        else:
            # Integers keep their dtype when no candidate matches (e.g. the
            # column is empty and min/max are NaN).
            new_type = None
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break

        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('MEMORY USAGE AFTER OPTIMIZATION IS: {:.2f} MB'.format(end_mem))
    print('DECREASED BY: {:.1f} %'.format((start_mem - end_mem) / start_mem * 100))

    return df

# Downcast both frames; reduce_mem_usage prints the memory usage before and
# after and returns the frame it was given.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Every column of `train` except the last one is treated as a model feature.
features = list(train.columns[:-1])


def statistics(df):
    """Add row-wise 'std', 'min' and 'max' columns over `features` to `df` in place."""
    feature_block = df[features]
    df['std'] = feature_block.std(axis=1)
    df['min'] = feature_block.min(axis=1)
    df['max'] = feature_block.max(axis=1)


for frame in (train, test):
    statistics(frame)

# The aggregate columns become features only after both frames have them.
features.extend(['std', 'min', 'max'])

# Encode the target as integer class codes.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# Fit the scaler on train only, then apply the same transform to test.
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

X, y = train[features], train['target']

In [None]:
# Detect a TPU and choose the batch size. `strategy` is defined on both paths
# so later code can rely on it; `tpu_strategy` is kept as an alias on the TPU path.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    strategy = tpu_strategy
    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 64
    # master() is a method; printing `tpu.master` would show the bound method.
    print('RUNNING ON TPU:', tpu.master())
    print(f'BATCH SIZE: {BATCH_SIZE}')
except ValueError:
    # Fallback path: use the current default strategy with a fixed batch size.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 512
    print(f'RUNNING ON {strategy.num_replicas_in_sync} REPLICAS')
    print(f'BATCH SIZE: {BATCH_SIZE}')

In [None]:
def my_model(n_features=None, n_classes=10):
    """Build the (uncompiled) dense classifier used in cross-validation.

    Architecture: Dense(256) -> BatchNorm, then two parallel Dense(128) ->
    BatchNorm branches that both read the first block's output. The branches
    are concatenated, passed through Dropout(0.15), and followed by
    Dense(128), Dense(64), Dense(32) blocks (each with BatchNorm) and a
    softmax output layer.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. Defaults to ``X.shape[-1]`` of the global
        feature matrix, matching the original zero-argument call.
    n_classes : int, optional
        Number of softmax outputs. Defaults to 10.

    Returns
    -------
    tf.keras.Model
        Model named 'DNN_Model'; the caller is responsible for compiling it.
    """
    if n_features is None:
        n_features = X.shape[-1]

    # The shape must be a tuple: `(n)` is just an int, `(n,)` is a 1-tuple.
    x_input = Input(shape=(n_features,), name='input')
    x_1 = Dense(256, activation='selu')(x_input)
    b_1 = BatchNormalization()(x_1)

    # Two parallel branches, both fed from b_1.
    x_2 = Dense(128, activation='selu')(b_1)
    b_2 = BatchNormalization()(x_2)
    x_3 = Dense(128, activation='selu')(b_1)
    b_3 = BatchNormalization()(x_3)

    d_1 = Dropout(0.15)(Concatenate()([b_2, b_3]))
    x_4 = Dense(128, activation='relu')(d_1)
    b_4 = BatchNormalization()(x_4)
    x_5 = Dense(64, activation='selu')(b_4)
    b_5 = BatchNormalization()(x_5)
    x_6 = Dense(32, activation='selu')(b_5)
    b_6 = BatchNormalization()(x_6)

    output = Dense(n_classes, activation='softmax', name='output')(b_6)

    model = tf.keras.Model(x_input, output, name='DNN_Model')

    return model

model = my_model()

In [None]:
# Draw the network graph with layer names and tensor shapes, and write it to
# SUPER_MODEL.png; the call stays the cell's last expression.
plot_model(model, to_file='SUPER_MODEL.png', show_shapes=True, show_layer_names=True)

In [None]:
# Switch to plain NumPy arrays for positional fold indexing below.
# np.asarray is a no-op on arrays, so re-running this cell does not fail the
# way `.values` does once these names are already ndarrays.
X = np.asarray(X)
y = np.asarray(y)
test = np.asarray(test)

In [None]:
# 10-fold stratified cross-validation of the DNN.
# Each fold trains a fresh model, reloads the checkpoint with the lowest
# val_loss, scores it on the held-out fold and stores hard class predictions
# for the test set (one array per fold) in `predictions`.
VERBOSE = True
predictions, scores = [], []
skf = StratifiedKFold(n_splits=10, random_state=228, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]

    # Release Keras state held by models from earlier folds so memory does
    # not grow across the 10 folds.
    tf.keras.backend.clear_session()

    model = my_model()
    # `metrics` is passed as a list, the documented form for compile().
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    lr = ReduceLROnPlateau(monitor='val_loss', factor=0.6, patience=3, verbose=VERBOSE)
    es = EarlyStopping(monitor='val_loss', patience=7, verbose=VERBOSE, mode='min', restore_best_weights=True)

    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    chk_point = ModelCheckpoint(f'./TPS1_model_2022_{fold + 1}C.h5', options=save_locally,
                                monitor='val_loss', verbose=VERBOSE,
                                save_best_only=True,
                                mode='min')

    model.fit(X_train,
              y_train,
              validation_data=(X_val, y_val),
              epochs=300,
              verbose=VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks=[lr, chk_point, es])

    # Evaluate the best checkpoint rather than whatever the last epoch left.
    load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
    model = load_model(f'./TPS1_model_2022_{fold + 1}C.h5', options=load_locally)

    y_pred = model.predict(X_val, batch_size=BATCH_SIZE)
    score = accuracy_score(y_val, np.argmax(y_pred, axis=1))
    scores.append(score)
    predictions.append(np.argmax(model.predict(test, batch_size=BATCH_SIZE), axis=1))
    print(f'FOLD: {fold + 1} | OOF SCORE: {score}')

print(f'MEAN ACCURACY ON {skf.n_splits} folds - {mean(scores)}')

In [None]:
# Maps between original class labels and the integer codes the models predict.
# The codes come from the LabelEncoder fitted on the training target, so the
# maps are built from `le.classes_`. Ranking `train['target']` by frequency is
# wrong here because that column already holds encoded integers; it would map
# codes to other codes instead of back to the class names.
retarget = {label: code for code, label in enumerate(le.classes_)}
retarget_2 = {code: label for label, code in retarget.items()}

In [None]:
# Majority vote over the per-fold class predictions (one column per fold).
# ravel() normalises the result to 1-D, since some SciPy versions return the
# mode with shape (n, 1) for axis=1.
fold_votes = np.column_stack(predictions)
majority_codes = np.ravel(stats.mode(fold_votes, axis=1)[0]).astype(int)

# Decode with the LabelEncoder that produced the training codes so the
# submission contains the original class labels.
ss['target'] = le.inverse_transform(majority_codes)
ss.to_csv('submission_03.csv', index=False)

In [None]:
# 10-fold stratified cross-validation of an ExtraTrees classifier.
# `predictions` collects one class-probability matrix for the test set per fold.
predictions, scores = [], []
skf = StratifiedKFold(n_splits=10, random_state=228, shuffle=True)

for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X[trn_idx], y[trn_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = ExtraTreesClassifier(n_estimators=1111, n_jobs=-1)
    model.fit(X_train, y_train)

    # Accuracy on the held-out fold.
    val_pred = model.predict(X_val)
    val_score = accuracy_score(y_val, val_pred)
    scores.append(val_score)
    print(f'FOLD: {i + 1} ACCURACY SCORE: {round(val_score, 4)}')

    predictions.append(model.predict_proba(test))

print(f'\nMEAN ACCURACY: {round(mean(scores), 4)}')

In [None]:
# Fixed per-class offsets: +0.025 for class index 2 and +0.045 for class index 3.
class_offsets = np.array([0, 0, 0.025, 0.045, 0, 0, 0, 0, 0, 0])

# Average the fold probability matrices, shift them, then decode the arg-max
# class codes back to labels with the fitted LabelEncoder.
y_proba = sum(predictions) / len(predictions) + class_offsets
best_codes = np.argmax(y_proba, axis=1)
y_pred_tuned = le.inverse_transform(best_codes)
# pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100

In [None]:
# Replace the submission target with the offset-adjusted ExtraTrees labels
# and write it out without the DataFrame index.
ss['target'] = y_pred_tuned
ss.to_csv('submission_04.csv', index=False)

In [None]:
# Blend the current submission with three previously saved ones by majority vote.
sub_1 = pd.read_csv('../input/forest-of-extra-trees-0-9895-up-to-4th-place/submission__.csv')
sub_2 = pd.read_csv('../input/forest-of-extra-trees-0-9895-up-to-4th-place/submission__blend.csv')
sub_3 = pd.read_csv('../input/forest-of-extra-trees-0-9895-up-to-4th-place/submission__blend_2.csv')

blend_frames = [ss, sub_1, sub_2, sub_3]

# Use a dedicated encoder fitted on the labels of ALL submissions. Fitting on
# `ss` alone makes transform() raise when another file contains a class that
# `ss` never predicted, and refitting the shared `le` would discard the
# mapping learned from the training target.
blend_le = LabelEncoder()
blend_le.fit(np.concatenate([frame['target'].to_numpy() for frame in blend_frames]))

# Encode into new arrays so `ss` and the loaded submissions keep their labels.
blend_preds = [blend_le.transform(frame['target']) for frame in blend_frames]

# Row-wise majority vote; rows are matched by position across the frames.
# ravel() normalises SciPy versions that return the mode with shape (n, 1).
blend_votes = np.ravel(stats.mode(np.column_stack(blend_preds), axis=1)[0]).astype(int)

blend_ss = ss.copy()
blend_ss['target'] = blend_le.inverse_transform(blend_votes)

In [None]:
# Write the blended submission without the DataFrame index.
blend_ss.to_csv('submission_05.csv', index=False)