In [None]:
import numpy as np 
import pandas as pd
import plotly as py
from statistics import mean
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from umap import UMAP

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold

import optuna

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import ExtraTreesClassifier

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate

# Seed NumPy's global generator and TensorFlow with the same fixed value.
np.random.seed(228)
tf.random.set_seed(228)

# Passing None for display.max_columns lifts the column limit when frames are displayed.
pd.set_option('display.max_columns', None)

#########################################################

# Load the competition CSVs. train and test use the row_id column as their index;
# the sample submission is read with the default index.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col='row_id')
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col='row_id')
ss = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

#########################################################

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the smallest dtype that fits.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness whose range contains the column's min and max
    (bounds are inclusive, so a column whose max is 127 becomes int8).
    Floating-point columns are cast to float16/float32 when their range fits,
    otherwise to float64. Object columns become ``category``. Every other
    dtype (bool, datetime, pandas extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When False, float columns are never narrowed below float32. float16
        keeps only about three significant decimal digits, so pass False when
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes are downcast; extension dtypes (category,
        # nullable integers, strings, ...) are left alone.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits = signed_types, np.iinfo
        elif np.issubdtype(col_type, np.unsignedinteger):
            candidates, limits = unsigned_types, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime64, timedelta64, complex: nothing sensible to do.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No candidate fits (huge values, inf, or an all-NaN column whose
            # min/max compare False): floats fall back to float64, integers
            # keep their current dtype.
            if limits is np.finfo:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame that reports zero memory cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# Shrink both frames; train is processed first, then test.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

# EDA

**Basic information**

In [None]:
def info(data):
    """Print a short overview of ``data``.

    The report contains the number of rows, how many columns there are of
    each dtype, the number of missing cells (as a share of all cells), the
    number of duplicated rows (as a share of all rows) and, when a ``target``
    column exists, the number of distinct target values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    print(f'Length of data: {len(data)}')
    print('')

    # Name both columns explicitly so the table does not depend on how the
    # installed pandas version labels the output of value_counts().
    dtype_counts = data.dtypes.value_counts().rename_axis('type').reset_index(name='count')
    print(dtype_counts)
    print('')

    n_missing = data.isna().sum().sum()
    if n_missing > 0:
        # Missing cells relative to all cells; dividing by the row count could
        # report more than 100%.
        print(f'Missing values: {n_missing} ({round(n_missing / data.size * 100, 2)}%)')
    else:
        print(f'Missing values: False')
    print('')

    n_duplicated = data.duplicated().sum()
    if n_duplicated > 0:
        print(f'Duplicated values: {n_duplicated} ({round(n_duplicated / len(data) * 100, 2)}%)')
    else:
        print(f'Duplicated values: False')
    print('')

    # The test set has no target column; check for it instead of swallowing
    # every exception.
    if 'target' in data.columns:
        print(f'Unique target values: {data["target"].nunique()}')
    
# Print the same report for both datasets, with a rule between them.
for position, (heading, frame) in enumerate([('TRAINING DATASET INFORMATION', train),
                                             ('TEST DATASET INFORMATION', test)]):
    if position:
        print('--------------------------------------')
    print(heading)
    print('')
    info(frame)

There are a large number of duplicates in the data. They need to be dropped, but I will try to take into account the important idea of [AmbrosM](https://www.kaggle.com/ambrosm) and add weights to the duplicate observations.

In [None]:
# vc = train.value_counts()
# train = pd.DataFrame([list(tup) for tup in vc.index.values], columns=train.columns)
# train['sample_weight'] = vc.values

I will also try two other variants: without dropping duplicates, and with dropping duplicates but without sample weights.

In [None]:
#train.drop_duplicates(inplace=True)

**Features**

In [None]:
# Show every row of the summary table: one row per column, sorted by mean.
pd.set_option('display.max_rows', None)
(
    train.iloc[:, :-1]  # every column except the last one
    .describe()
    .T
    .sort_values('mean')
    [['mean', 'std', 'min', 'max']]
    .style.background_gradient(cmap='Blues')
)

**Target**

In [None]:
pd.options.display.max_rows = 60

# Build the count table with explicit column names. The default names produced
# by value_counts().reset_index() changed in pandas 2.0, so relying on a column
# called 'index' breaks on newer versions.
target_counts = train['target'].value_counts().rename_axis('species').reset_index(name='count')

plt.figure(figsize = (12, 4))
sns.set_style("white")
plt.title('Distribution of target without dropping duplicates', fontname = 'monospace', fontsize = 20, color = '#313233', x = 0.5, y = 1.1)
ax = sns.barplot(data = target_counts, x = 'count', y = 'species',
                 palette = (['#63a2eb', '#5ca0ed', '#5299eb', '#4a96ed', '#4091ed', '#378ced', '#2f88ed', '#2482ed', '#1a7ded', '#0e75eb'][::-1]),
                 linestyle = "-", linewidth = 1, edgecolor = "black")
plt.xticks([])
plt.yticks(fontname = 'monospace', size = 12, color = '#313233')
plt.xlabel('')
plt.ylabel('')

for side in ['right', 'top', 'left', 'bottom']:
    ax.spines[side].set_visible(False)

# Label each bar with its share of all training rows; the fixed offset of 800
# (in data units) places the text just to the right of the bar end.
for bar in ax.patches:
    width = bar.get_width()
    plt.text(800 + width, bar.get_y() + 0.55*bar.get_height(), f'{round((width / len(train)) * 100, 2)}%',
             ha = 'center', va = 'center', fontname = 'monospace', fontsize = 12, color = '#313233')

plt.show()

In [None]:
# The title promises the distribution after dropping duplicates, so compute it
# on a deduplicated copy; `train` itself is left unchanged.
train_unique = train.drop_duplicates()

# Explicit column names keep this independent of the pandas version (the
# default names produced by value_counts().reset_index() changed in 2.0).
unique_counts = train_unique['target'].value_counts().rename_axis('species').reset_index(name='count')

plt.figure(figsize = (12, 4))
sns.set_style("white")
plt.title('Distribution of target with dropping duplicates', fontname = 'monospace', fontsize = 20, color = '#313233', x = 0.5, y = 1.1)
ax = sns.barplot(data = unique_counts, x = 'count', y = 'species',
                 palette = (['#63a2eb', '#5ca0ed', '#5299eb', '#4a96ed', '#4091ed', '#378ced', '#2f88ed', '#2482ed', '#1a7ded', '#0e75eb'][::-1]),
                 linestyle = "-", linewidth = 1, edgecolor = "black")
plt.xticks([])
plt.yticks(fontname = 'monospace', size = 12, color = '#313233')
plt.xlabel('')
plt.ylabel('')

for side in ['right', 'top', 'left', 'bottom']:
    ax.spines[side].set_visible(False)

# Label each bar with its share of the deduplicated rows; the fixed offset of
# 550 (in data units) places the text just to the right of the bar end.
for bar in ax.patches:
    width = bar.get_width()
    plt.text(550 + width, bar.get_y() + 0.55*bar.get_height(), f'{round((width / len(train_unique)) * 100, 2)}%',
             ha = 'center', va = 'center', fontname = 'monospace', fontsize = 12, color = '#313233')

plt.show()

In [None]:
# Encode each target value as an integer code ordered by decreasing frequency
# (0 = most frequent). Reading the labels from the value_counts index avoids
# depending on the column name reset_index() assigns, which changed in pandas 2.0.
species_by_frequency = train['target'].value_counts().index
retarget = {species: code for code, species in enumerate(species_by_frequency)}
# Inverse mapping (code -> original label), used to decode predictions later.
retarget2 = {i: k for k, i in retarget.items()}
train['target'] = train['target'].map(retarget)

# Draw the 15000-row sample once and reuse it for the features, the labels and
# the plot hue, instead of re-sampling three times with the same seed.
umap_sample = train.sample(15000, random_state = 228)
umap = UMAP(n_components = 2, n_neighbors = 10, min_dist = 0.99).fit_transform(umap_sample.drop('target', axis = 1), umap_sample['target'])

plt.figure(figsize=(15, 12))
plt.title('Target UMAP', size = 25, y = 1.03, fontname = 'monospace')
scu = sns.scatterplot(x = umap[:, 0], y = umap[:, 1], hue = umap_sample['target'], s = 5, edgecolor = 'none', alpha = 0.8,
                      palette = ['#d15858', '#6b1e1e', '#7f2d7d', '#805ead', '#406eb3', '#31a2c4', '#3da69a', '#3da65b', '#4c822b', '#a89d38'])
plt.xticks([])
plt.yticks([])
for side in ['right', 'left', 'top']:
    scu.spines[side].set_visible(False)
scu.text(12, -23, '''n_components = 2
n_neighbors = 10
min_dist = 0.99''', fontname = 'monospace', fontsize = 12)
# A single legend call: retarget's keys are in code order, so they line up with
# the hue levels 0..9.
plt.legend(labels = list(retarget.keys()), title = "Bacteria species", ncol = 2, borderpad = 1, frameon = False, fontsize = 12, bbox_to_anchor = (0.5, 0))
plt.show()

# Preprocessing

In [None]:
# Select the raw feature columns by name rather than by position, so that
# re-running this cell (after 'std', 'min' and 'max' have been appended) does
# not pull the target or the derived columns into the feature list.
derived_features = ['std', 'min', 'max']
features = [col for col in train.columns if col not in ['target'] + derived_features]
# sample_weight = train['sample_weight']

# Row-wise summary statistics over the raw features only.
train['std'] = train[features].std(axis = 1)
test['std'] = test[features].std(axis = 1)

train['min'] = train[features].min(axis = 1)
test['min'] = test[features].min(axis = 1)

train['max'] = train[features].max(axis = 1)
test['max'] = test[features].max(axis = 1)

features += derived_features

le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# Fit the scaler on train only and apply the same transform to test.
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

X = train[features]
y = train['target']

# XGB

I don't think that the results of LGBM or CB would be much different, so I decided to try only XGB. The parameters were, of course, found with Optuna. I used the same parameters for all trials.

1. First trial without dropping duplicates. LB result - **0.95025**.

2. Second trial only with dropping duplicates. LB result - **0.94678**

3. Third trial with dropping duplicates and sample weights. LB result - **0.94101**.

In the following models I will not try dropping duplicates and using sample weights. 

In [None]:
paramsXGB = {
    # Tree shape and learning rate
    "max_depth": 8,
    "min_child_weight": 8,
    "learning_rate": 0.2478225904887278,
    "n_estimators": 3000,
    # Regularisation
    "gamma": 0.018329940112279165,
    "alpha": 0.00019394894279195157,
    "lambda": 0.06161761858777205,
    # Row / column subsampling
    "subsample": 0.6155733760919804,
    "colsample_bytree": 0.6721122683333417,
    # Task definition
    "objective": "multi:softmax",
    "eval_metric": "mlogloss",
    "use_label_encoder": False,
    # Backend and reproducibility
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "random_state": 228,
}

In [None]:
# Per-fold test predictions and validation accuracies.
scores = []
predictions = []

k = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 228)
for i, (trn_idx, val_idx) in enumerate(k.split(X, y)):
    # Positional split into the fitting part and the held-out part of this fold.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(**paramsXGB)
    # The held-out fold doubles as the early-stopping evaluation set.
    model.fit(
        X_train,
        y_train,
        eval_set = [(X_val, y_val)],
        early_stopping_rounds = 30,
        verbose = False,
    )

    val_pred = model.predict(X_val)
    val_score = accuracy_score(y_val, val_pred)
    scores.append(val_score)
    print(f'Fold {i+1} accuracy score: {round(val_score, 4)}')

    predictions.append(model.predict(test))

print('')
print(f'Mean accuracy - {round(mean(scores), 4)}')

In [None]:
# Majority vote across folds. Depending on the SciPy version, stats.mode
# returns the modes as shape (n, 1) or (n,), so flatten explicitly before
# assigning to the column.
fold_votes = np.column_stack(predictions)
ss['target'] = np.ravel(stats.mode(fold_votes, axis=1)[0])
# Decode the integer class codes back to the original labels.
ss['target'] = ss['target'].map(retarget2)
ss.to_csv('submission.csv', index=False)

# NN

1. First trial without dropping duplicates. In less than 50 epochs each fold of data has reached ~0.993 accuracy. Mean accuracy on 10 folds - **0.9961**. To avoid overfitting I tried low patience, but the LB result is very bad - **0.9124**
2. Second trial with dropping duplicates. In less than 30 epochs each fold of data has reached ~0.97 accuracy. Mean accuracy on 10 folds - **0.971**. LB result - **0.8944**

In [None]:
# Pick a distribution strategy: TPU when one can be resolved, otherwise the
# default strategy. `strategy` is defined on BOTH paths so later code can rely
# on it; `tpu_strategy` is kept as an alias for the TPU case.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
    tpu_strategy = strategy
    BATCH_SIZE = strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")

except ValueError:
    # TPUClusterResolver raises ValueError when no TPU can be found.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 512
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")

In [None]:
def my_model(n_features=None, n_classes=10):
    """Build the (uncompiled) dense classifier used in the NN experiments.

    Architecture: a 256-unit SELU layer feeds two parallel 128-unit SELU
    branches; their batch-normalised outputs are concatenated, passed through
    dropout, and followed by 128 (ReLU), 64 and 32 (SELU) unit layers and a
    softmax output. Every hidden Dense layer is followed by BatchNormalization.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to ``X.shape[-1]`` of the
        module-level feature matrix ``X``.
    n_classes : int, default 10
        Number of output classes (units of the softmax layer).

    Returns
    -------
    tf.keras.Model
        The uncompiled model named ``DNN_Model``.
    """
    if n_features is None:
        n_features = X.shape[-1]

    # `shape` must be a tuple; `(n)` without the trailing comma is just an int.
    x_input = Input(shape=(n_features,), name="input")
    x1 = Dense(256, activation='selu')(x_input)
    b1 = BatchNormalization()(x1)

    # Two parallel branches that both read from b1.
    x2 = Dense(128, activation='selu')(b1)
    b2 = BatchNormalization()(x2)
    x3 = Dense(128, activation='selu')(b1)
    b3 = BatchNormalization()(x3)

    d1 = Dropout(0.15)(Concatenate()([b2, b3]))
    x4 = Dense(128, activation='relu')(d1)
    b4 = BatchNormalization()(x4)
    x5 = Dense(64, activation='selu')(b4)
    b5 = BatchNormalization()(x5)
    x6 = Dense(32, activation='selu')(b5)
    b6 = BatchNormalization()(x6)
    output = Dense(n_classes, activation="softmax", name="output")(b6)

    model = tf.keras.Model(x_input, output, name='DNN_Model')
    return model

# Build one instance of the network with the default arguments.
model = my_model()

In [None]:
# Render the architecture diagram (with shapes and layer names) to a PNG file.
plot_model(model, show_layer_names=True, show_shapes=True, to_file='Super_Model.png')

In [None]:
# Convert to plain NumPy arrays. np.asarray is idempotent, so re-running this
# cell does not fail the way `.values` does on an object that is already an
# ndarray.
X = np.asarray(X)
y = np.asarray(y)
test = np.asarray(test)

In [None]:
VERBOSE = True
predictions, scores = [], []
k = StratifiedKFold(n_splits = 10, random_state = 228, shuffle = True)


for fold, (train_idx, val_idx) in enumerate(k.split(X, y)):
    # The held-out indices are this fold's validation rows (not the test set).
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = my_model()
    # `metrics` is documented as a list; newer Keras versions reject a bare string.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    # Multiply the learning rate by 0.6 after 3 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.6,
                           patience=3, verbose=VERBOSE)

    # Stop after 7 epochs without val_loss improvement, restoring the best weights.
    es = EarlyStopping(monitor="val_loss", patience=7,
                       verbose=VERBOSE, mode="min",
                       restore_best_weights=True)

    # Keep only the best checkpoint of this fold, written via the local host.
    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    chk_point = ModelCheckpoint(f'./TPS1_model_2022_{fold+1}C.h5', options=save_locally,
                                monitor='val_loss', verbose=VERBOSE,
                                save_best_only=True, mode='min')

    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=300,
              verbose=VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks=[lr, chk_point, es])

    # Reload the best checkpoint of this fold before scoring and predicting.
    load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
    model = load_model(f'./TPS1_model_2022_{fold+1}C.h5', options=load_locally)

    y_pred = model.predict(X_val, batch_size=BATCH_SIZE)
    score = accuracy_score(y_val, np.argmax(y_pred, axis=1))
    scores.append(score)
    predictions.append(np.argmax(model.predict(test, batch_size=BATCH_SIZE), axis=1))
    print(f"Fold-{fold+1} | OOF Score: {score}")

print(f'Mean accuracy on {k.n_splits} folds - {mean(scores)}')

In [None]:
# Majority vote across folds. Depending on the SciPy version, stats.mode
# returns the modes as shape (n, 1) or (n,), so flatten explicitly before
# assigning to the column.
fold_votes = np.column_stack(predictions)
ss['target'] = np.ravel(stats.mode(fold_votes, axis=1)[0])
# Decode the integer class codes back to the original labels.
ss['target'] = ss['target'].map(retarget2)
ss.to_csv('submission.csv', index=False)

# Extra trees

I knew that Extra Trees has a big advantage over the other models in this competition, but I wanted to try one of the gradient-boosted tree models and a simple NN first. So...

Trials without dropping duplicates:
1. First trial with 300 estimators and no limit in depth. Mean accuracy on 10 folds - **0.9962**. LB result - **0.97816**
2. Second trial with 1000 estimators and no limit in depth. Mean accuracy on 10 folds - **0.9966**. LB result - **0.97796**

Trials with dropping duplicates:

1. First trial with 300 estimators and no limit in depth. Mean accuracy on 10 folds - **0.9755**. LB result - **0.97700**
2. Second trial with 1000 estimators and no limit in depth. Mean accuracy on 10 folds - **0.9778**. LB result - **0.97771**

Let's try postprocessing on predictions without dropping duplicates:

LB result - **0.98890**

In [None]:
predictions, scores = [], []

# X, y and test may be DataFrames/Series or NumPy arrays at this point (an
# earlier cell converts them with `.values`), so index through NumPy views
# instead of `.iloc`, which does not exist on arrays.
X_et, y_et, test_et = np.asarray(X), np.asarray(y), np.asarray(test)

k = StratifiedKFold(n_splits = 10, random_state = 228, shuffle = True)
for i, (trn_idx, val_idx) in enumerate(k.split(X_et, y_et)):

    X_train, X_val = X_et[trn_idx], X_et[val_idx]
    y_train, y_val = y_et[trn_idx], y_et[val_idx]

    # Fixed random_state so the forest is reproducible across runs.
    model = ExtraTreesClassifier(n_estimators=1111, n_jobs=-1, random_state=228)
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_score = accuracy_score(y_val, val_pred)
    print(f'Fold {i+1} accuracy score: {round(val_score, 4)}')

    scores.append(val_score)
    # Keep class probabilities (not labels) so the folds can be averaged later.
    predictions.append(model.predict_proba(test_et))
print('')
print(f'Mean accuracy - {round(mean(scores), 4)}')

In [None]:
# Average the per-fold class probabilities.
y_proba = sum(predictions) / len(predictions)
# Add a small bias to the probability columns at positions 2 and 3.
# NOTE(review): the offsets are positional; confirm they target the intended
# classes given how the target was encoded earlier in the notebook.
y_proba += np.array([0, 0, 0.025, 0.045, 0, 0, 0, 0, 0, 0])

predicted_codes = le.inverse_transform(np.argmax(y_proba, axis=1))
# `le` was fitted on the target after it had been mapped to integer codes, so
# inverse_transform yields codes rather than species names. Decode them with
# retarget2 (as the XGB and NN submissions do); values that are not keys of
# retarget2 pass through unchanged.
y_pred_tuned = pd.Series(predicted_codes).map(lambda code: retarget2.get(code, code)).to_numpy()

# Share of each predicted class in percent. Built without `test.index`, which
# does not exist once `test` has been converted to a NumPy array.
pd.Series(y_pred_tuned).value_counts().sort_index() / len(y_pred_tuned) * 100

In [None]:
# Store the tuned predictions in the submission frame and write it to CSV
# without the index column.
ss['target'] = y_pred_tuned
ss.to_csv('submission__.csv', index=False)

# Forest of Extra Trees

Thanks for the results go to: [Marco](https://www.kaggle.com/marcobr95/predicting-bacteria-species/notebook?scriptVersionId=87489264), [Safak](https://www.kaggle.com/sfktrkl/tps-feb-2022/notebook?scriptVersionId=87059305) and [Alex](https://www.kaggle.com/alexryzhkov/tps-feb-22-lightautoml-pseudolabel)

LB result - **0.9895**

In [None]:
sub1 = pd.read_csv('../input/tps-feb-results/alex_submission 0.98865.csv')
sub2 = pd.read_csv('../input/tps-feb-results/safak_submission 0.98830.csv')
sub3 = pd.read_csv('../input/tps-feb-results/marco_submission 0.98790.csv')

blend_sources = [ss, sub1, sub2, sub3]

# Use a dedicated encoder fitted on the labels of ALL submissions. Fitting on
# `ss` alone raises in transform() if another file contains a label that `ss`
# never predicted. The global `le` and the source frames are left untouched,
# so this cell can be re-run safely.
blend_le = LabelEncoder()
blend_le.fit(pd.concat([source['target'] for source in blend_sources], ignore_index=True))

blend_preds = [blend_le.transform(source['target']) for source in blend_sources]

# Row-wise majority vote over the four submissions; ravel handles both the
# (n, 1) and (n,) result shapes of stats.mode across SciPy versions.
blend_votes = np.ravel(stats.mode(np.column_stack(blend_preds), axis=1)[0])

blend_ss = ss.copy()
blend_ss['target'] = blend_le.inverse_transform(blend_votes)

In [None]:
# Write the blended submission to CSV without the index column.
blend_ss.to_csv('submission__blend.csv', index=False)