# A simple neural network applied to the Tabular Playground Series, December 2021
### Please let me know of any improvements, I'm here to learn

### Ideas for improvement
* Feature engineering: Cover_Type = 5 has only 1 sample, remove it? DONE
* Encode using sklearn labelencoder (need to use encoder.inverse_transform for test preds later) DONE
* Scale data using sklearn robustscaler DONE
* Plot model using tf.keras.utils plot_model
* Use some tool to do feature importance
* Can run on TPU, DONE
* Get lower TPU idle time. Does anyone have any idea how?
* Would be interesting to see if more folds give more accuracy
* Try not removing cover_type = 4

Used https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering as inspiration, please go give that notebook a thumbs up


In [None]:
import pandas as pd
import numpy as np
import datatable as dt

from sklearn.model_selection import train_test_split, StratifiedKFold 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, RobustScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import plot_model
import tensorflow as tf

# Verbosity switch passed to the Keras callbacks and to model.fit.
VERBOSE = False
# Display switch: True draws the model with plot_model, False prints model.summary().
plot = False

## Function to reduce memory of dataframes

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype covering their range.

    Integer columns are tried against int8, int16, int32 and int64 (smallest
    first); float columns against float16 and float32, falling back to
    float64. Range limits are inclusive, so a column whose extremes sit exactly
    on a dtype's limits (e.g. 127 for int8) still gets that dtype. Columns of
    any other dtype are left untouched.

    The float choice is based on the value range only: float16 and float32
    keep fewer significant digits than float64, so values may be rounded.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame object, with its numeric columns downcast.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Smallest integer type whose inclusive range holds the column.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

# Importing training and testing data
Reading using datatable and converting to pandas is often faster than reading directly using pandas

In [None]:
# Load both CSVs with datatable, convert to pandas and downcast numeric columns.
train_df = dt.fread("../input/tabular-playground-series-dec-2021/train.csv")
test_df = dt.fread("../input/tabular-playground-series-dec-2021/test.csv")
test_df = reduce_memory_usage(test_df.to_pandas())
train_df = reduce_memory_usage(train_df.to_pandas())

# A row identifier is not a predictive feature, so keep it out of the network input.
# NOTE(review): assumes the identifier column is named "Id" -- confirm against the
# CSV header. errors="ignore" makes this a no-op when the column is absent.
for frame in (train_df, test_df):
    frame.drop(columns=["Id"], errors="ignore", inplace=True)

INPUT_SHAPE = test_df.shape[1:] # Used to decide first layer of nn

# Remove sample with cover_type = 5
idx_to_drop5 = train_df[train_df["Cover_Type"] == 5].index
print(f"Nr of cover_type = 5: {len(idx_to_drop5)}")
train_df.drop(idx_to_drop5, axis=0, inplace=True)

# Very few samples have cover_type = 4 as well; dropping them is left disabled:
# idx_to_drop4 = train_df[train_df["Cover_Type"] == 4].index
# print(f"Nr of cover_type = 4: {len(idx_to_drop4)}")
# train_df.drop(idx_to_drop4, axis=0, inplace=True)

# Map the remaining labels to 0..k-1; encoder.inverse_transform restores the
# original Cover_Type values when writing predictions.
encoder = LabelEncoder()
train_df["Cover_Type"] = encoder.fit_transform(train_df["Cover_Type"])

# Count classes only after the rows above were removed, so the value matches the
# labels the network is actually trained on.
NUM_CLASSES = len(encoder.classes_) # For output layer of nn

# Force the area/soil indicator columns to int8 in both frames.
bool_features = [i for i in train_df.columns if "area" in i.lower() or "soil" in i.lower()]
test_df[bool_features] = test_df[bool_features].astype(np.int8)
train_df[bool_features] = train_df[bool_features].astype(np.int8)


### Scale unscaled data
Great article on interesting ways to select pandas columns:
https://towardsdatascience.com/interesting-ways-to-select-pandas-dataframe-columns-b29b82bbfb33

In [None]:
# Scale every feature column that takes a value above 7. The label column is
# excluded explicitly so it can never be scaled (it does not exist in test_df).
cols_to_scale = train_df.columns[
    [col != "Cover_Type" and bool((train_df[col] > 7).any()) for col in train_df.columns]
]
print(f"Scaled Columns: {cols_to_scale}\n\n  \
Number of scaled Columns: {len(cols_to_scale)}")

# Fit the scaler on the training data only and reuse those statistics for the
# test data; re-fitting on the test set would scale the two sets differently.
scaler = RobustScaler()
train_df[cols_to_scale] = scaler.fit_transform(train_df[cols_to_scale])
test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

# Split the label off; what remains in train_df is the feature matrix.
y = train_df.pop("Cover_Type").values
X = train_df.values

## Functions to use when training later
Reduce the learning rate when the validation loss plateaus, and stop early if the validation accuracy stops improving

In [None]:
# Halve the learning rate after 2 epochs without improvement in validation loss.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    verbose=VERBOSE
)
# Stop after 15 epochs without a better validation accuracy and roll back to the
# best weights. The model is compiled with a SparseCategoricalAccuracy() metric
# object, so Keras logs it as "val_sparse_categorical_accuracy"; a monitor of
# "val_accuracy" is never present in the logs and the callback would never fire.
early_stop = EarlyStopping(
    monitor="val_sparse_categorical_accuracy",
    mode="max", # Accuracy: higher is better
    patience=15,
    restore_best_weights=True,
    verbose=True # Always show on which fold it stopped early
)
callbacks = [reduce_lr, early_stop]

# Define the model and compile it

In [None]:
# Holds the (strategy, on_tpu) pair once it has been resolved.
_STRATEGY_CACHE = {}


def _get_strategy():
    """Return ``(strategy, on_tpu)``, resolving the distribution strategy once.

    The TPU is connected and initialised only on the first call, so repeated
    build_model() calls do not reinitialise the TPU system. If no TPU can be
    resolved (TPUClusterResolver raises ValueError), the current default
    strategy is used.
    """
    if "resolved" not in _STRATEGY_CACHE:
        try:
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            _STRATEGY_CACHE["resolved"] = (tf.distribute.TPUStrategy(tpu), True)
        except ValueError:
            _STRATEGY_CACHE["resolved"] = (tf.distribute.get_strategy(), False)
    return _STRATEGY_CACHE["resolved"]


def build_model(num_classes=6):
    """Build and compile the dense classifier.

    The network is four gelu Dense layers (300, 200, 100, 30 units), each
    followed by BatchNormalization, and a softmax output layer. It is compiled
    with Adam, sparse categorical cross-entropy and a
    SparseCategoricalAccuracy metric. On a TPU the model is created inside the
    TPU strategy scope and compiled with ``steps_per_execution=32``.

    Args:
        num_classes: Width of the softmax output layer. The default of 6 is
            the previously hard-coded value.

    Returns:
        The compiled ``Sequential`` model.
    """
    strategy, on_tpu = _get_strategy()
    # Just a random value, don't know what to use here
    compile_options = {"steps_per_execution": 32} if on_tpu else {}

    with strategy.scope():
        layers = []
        for position, units in enumerate((300, 200, 100, 30)):
            # Only the first layer declares the input shape.
            first_layer_options = {"input_shape": INPUT_SHAPE} if position == 0 else {}
            layers.append(
                Dense(
                    units=units,
                    kernel_initializer='random_normal',
                    activation='gelu',
                    **first_layer_options
                )
            )
            layers.append(BatchNormalization())
        layers.append(Dense(units=num_classes, activation="softmax"))

        model = Sequential(layers)
        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
            **compile_options
        )

    return model

# Inspect a freshly built model: text summary by default, diagram when `plot` is set.
if not plot:
    build_model().summary()
else:
    plot_model(
        build_model(),
        show_layer_names=True,
        show_shapes=True
    )

# Train the model
Trains a fresh model on each of the {FOLDS} folds and sums the test-set predictions, so that every fold's model affects the final result

In [None]:
print("Num GPUs available: ", len(tf.config.list_physical_devices('GPU')))

FOLDS = 5
EPOCHS = 200
BATCH_SIZE = 2048
STEPS_PER_EPOCH = 4*BATCH_SIZE # Not used, chosen if wanted faster epochs
test_preds = None # Sum of per-fold class probabilities for the test set
scores = []

# Convert the test frame once instead of on every fold.
X_test = test_df.to_numpy(dtype=np.float32)

cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # A fresh model per fold so no weights leak between folds.
    model = build_model()
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        # steps_per_epoch=STEPS_PER_EPOCH,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=callbacks,
        verbose=VERBOSE
    )

    # Predict with the training batch size; predict's default batch size is
    # far smaller and needlessly slow on large inputs.
    val_probs = model.predict(X_val, batch_size=BATCH_SIZE, verbose=int(VERBOSE))
    y_pred = np.argmax(val_probs, axis=1)

    score = accuracy_score(y_val, y_pred)
    print(f"Fold {fold}/{FOLDS} Validation Accuracy: {score}")
    scores.append(score)

    # Accumulate probabilities; the argmax of the sum is taken after all folds.
    fold_test_probs = model.predict(X_test, batch_size=BATCH_SIZE, verbose=int(VERBOSE))
    test_preds = fold_test_probs if test_preds is None else test_preds + fold_test_probs

print(f"\n\nMean accuracy over all folds: {np.mean(scores)}")

In [None]:
# Pick the class with the highest summed probability, then map the encoded
# labels back to the original Cover_Type values.
preds = encoder.inverse_transform(np.argmax(test_preds, axis=1))

# Fill the submission template and write it without the index column.
sample = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")
sample["Cover_Type"] = preds
sample.to_csv("Submission.csv", index=False)