## TPS-12 TensorFlow NN (model training on TPU or GPU - to speed up the learning process) and pseudolabeling

Special thanks to [AMBROSM](https://www.kaggle.com/ambrosm) for his great notebook [TPSDEC21-01-Keras Quickstart](https://www.kaggle.com/ambrosm/tpsdec21-01-keras-quickstart), which was the inspiration for this one.

This notebook contains some new ideas and improvements:
- pseudolabeling - I generated the pseudolabels separately; you can find them in my dataset [TPS-12 Pseudolabels](https://www.kaggle.com/remekkinas/tps12-pseudolabels) and use them for your own training - please vote on my dataset (it will be updated)
- TPU training (to speed up process)
- TensorFlow Keras functional API instead of Keras Sequential

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers

# 1. DATA PREPARATION

## 1.1 LOAD DATA AND PREPARE


In [None]:
# Load the competition data and the externally generated pseudolabels.
# 'Soil_Type7' and 'Soil_Type15' are dropped from every feature table so that
# all three frames share the same columns.
# NOTE(review): presumably these two columns carry no information - confirm.
train_df = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv").drop(columns=['Soil_Type7', 'Soil_Type15'])
test_df = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv").drop(columns=['Soil_Type7', 'Soil_Type15'])
pseudolabels_df = pd.read_csv("../input/tps12-pseudolabels/tps12-pseudolabels_v2.csv").drop(columns=['Soil_Type7', 'Soil_Type15'])

sample_submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

# Remove the rows labelled with cover types 4 and 5 from the original training data.
# NOTE(review): presumably because these classes are too rare to learn - confirm.
train_df = train_df[~train_df.Cover_Type.isin([4, 5])]

# Append the pseudolabelled rows. reset_index returns a new frame, so its
# result must be assigned; otherwise the concatenated frame keeps duplicate
# index labels from its two sources.
train_df = pd.concat([train_df, pseudolabels_df], axis=0).reset_index(drop=True)

# Encode the remaining cover types as consecutive integers 0..n_classes-1,
# as required by the sparse categorical cross-entropy loss used for training.
le = LabelEncoder()
target = le.fit_transform(train_df.Cover_Type)

## 1.2 DEFINE DATA PIPELINE PREPROCESSOR

In [None]:
# Model inputs: every column of the test set except the row identifier
# (and the target, should it ever be present).
features = [column for column in test_df.columns if column not in ('Id', 'Cover_Type')]

# A one-step pipeline that standardises its input columns.
data_pipe_transformer = make_pipeline(StandardScaler())

# Apply the scaling pipeline to all feature columns.
preprocessor = make_column_transformer((data_pipe_transformer, features))

# 2. TPU CONFIGURATION

In [None]:
# Set this to False if you want to train on GPU: the model is then built and
# compiled outside the distribution strategy scope.

TPU = True

In [None]:
# Try to connect to a TPU and build a TPU distribution strategy; if that is
# not possible, fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report why the
    # TPU could not be used instead of failing silently.
    print(f'TPU not available ({type(err).__name__}: {err}); using the default strategy.')
    strategy = tf.distribute.get_strategy()

print('Replicas:', strategy.num_replicas_in_sync)

# 3. MODEL CONFIGURATION

In [None]:
# I use a skip connection, which improves the score a little bit

def my_model(X):
    """Build the (uncompiled) dense classifier used for each fold.

    The network is a stack of Dense + BatchNormalization layers. The
    normalised output of the first dense layer is concatenated with the
    normalised output of the second one (a skip connection) before dropout.

    Args:
        X: array-like whose last dimension is the number of input features;
            only ``X.shape[-1]`` is read.

    Returns:
        A ``tf.keras.Model`` with one softmax output unit per class in the
        module-level label encoder ``le``.
    """
    # `shape` must be a tuple: `(n)` is just the int `n`, which newer Keras
    # versions reject, so the trailing comma is required.
    il = layers.Input(shape=(X.shape[-1],), name="input")
    x = layers.Dense(128, activation='selu')(il)
    x1 = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='selu')(x1)
    x = layers.BatchNormalization()(x)
    # Skip connection: feed both the first and the second block forward.
    x = layers.Dropout(0.2)(layers.Concatenate()([x, x1]))
    x = layers.Dense(units=64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='selu')(x)
    x = layers.BatchNormalization()(x)
    output = layers.Dense(len(le.classes_), activation="softmax", name="output")(x)

    model = tf.keras.Model([il], output)
    return model

# 4. TRAINING

In [None]:
# Training configuration.
EPOCHS = 90        # upper bound per fold; early stopping may end training sooner
VERBOSE = 2        # verbosity passed to fit() and to the callbacks
RUNS = 1           # number of repetitions of the whole cross-validation
BATCH_SIZE = 1024
FOLDS = 10

np.random.seed(2021)
tf.random.set_seed(2021)

# score_list:     validation accuracy of every fold
# test_pred_list: test-set class probabilities of every fold
# history_list:   Keras training history of every fold
# oof_list:       per run, out-of-fold class probabilities (-1.0 = not yet predicted)
score_list, test_pred_list, history_list = [], [], []
oof_list = [np.full((len(train_df), len(le.classes_)), -1.0, dtype='float32') for run in range(RUNS)]
for run in range(RUNS):
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, y=train_df.Cover_Type)):
        print(f"Fold {run}.{fold}")

        X_tr = train_df.iloc[train_idx]
        X_va = train_df.iloc[val_idx]
        y_tr = target[train_idx]
        y_va = target[val_idx]
        X_tr = X_tr[features]
        X_va = X_va[features]

        # Fit the scaler on the training part of the fold only, then apply
        # it to the validation part.
        X_tr = preprocessor.fit_transform(X_tr)
        X_va = preprocessor.transform(X_va)

        if TPU:
            # TPU model: must be created and compiled inside the strategy scope.
            with strategy.scope():
                model = my_model(X_tr)
                model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
        else:
            # GPU model
            model = my_model(X_tr)
            model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])

        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5,
                               patience=5, verbose=VERBOSE)

        # With metrics=["accuracy"] the validation metric is logged as
        # "val_accuracy". Monitoring "val_acc" would never find the metric,
        # so early stopping and best-weight restoration would never trigger.
        es = EarlyStopping(monitor="val_accuracy", patience=10,
                           verbose=VERBOSE, mode="max",
                           restore_best_weights=True)

        history = model.fit(X_tr, y_tr,
                            validation_data=(X_va, y_va),
                            epochs=EPOCHS,
                            verbose=VERBOSE,
                            batch_size=BATCH_SIZE,
                            validation_batch_size=len(X_va),
                            shuffle=True,
                            callbacks=[lr, es])
        history_list.append(history.history)

        # Store the out-of-fold probabilities, then turn them into labels
        # to score this fold.
        y_va_pred = model.predict(X_va, batch_size=len(X_va))
        oof_list[run][val_idx] = y_va_pred
        y_va_pred = le.inverse_transform(np.argmax(y_va_pred, axis=1))

        accuracy = accuracy_score(train_df.iloc[val_idx].Cover_Type, y_va_pred)
        score_list.append(accuracy)

        print(f"Fold {run}.{fold} | Epochs: {len(history_list[-1]['loss'])} | Accuracy: {accuracy:.5f}")

        # The test set has to be transformed with the scaler fitted on this fold.
        test_pred_list.append(model.predict(preprocessor.transform(test_df[features]), batch_size=BATCH_SIZE))

        # Free the fold's model before the next one is built.
        del model, y_va_pred
        gc.collect()

print(f"Average accuracy over {len(score_list)} folds: {np.mean(score_list):.5f}")

# 5. SUBMISSION

In [None]:
# Add up the class probabilities predicted by every fold model; the class
# with the largest total is the ensemble's prediction for each test row.
summed_probabilities = 0
for fold_probabilities in test_pred_list:
    summed_probabilities = summed_probabilities + fold_probabilities
predicted_class_ids = np.argmax(summed_probabilities, axis=1)

# Map the encoded class ids back to the original Cover_Type labels and
# write the submission file.
sub = test_df[['Id']].copy()
sub['Cover_Type'] = le.inverse_transform(predicted_class_ids)
sub.to_csv('tps12-pseudeo-submission.csv', index=False)