# Keras Quickstart

This notebook shows
- how to use Keras for this competition
- how to correctly cross-validate the model
- how to set a decreasing learning rate and early stopping
- how to plot the training curves
- how to ensemble the models of all cross-validation folds by soft voting
- how to save the models and the oof predictions for later use

You can enable GPU acceleration for this notebook to get the results faster, but you don't need the GPU.

Release notes:
- V1: -> lb 0.94821
- V2: Other network architecture (added one layer), 60 epochs
- V3: Hidden layers \[128, 64, 16\]
- V4: Hidden layers \[128, 64, 64, 16\] -> lb 0.95468
  - no real difference compared to V3
- V5: Fixed the voting classifier which was missing in earlier versions, added L2 regularization, LabelEncoder, drop Cover_Type 5 -> lb 0.95598
  - Voting makes a big difference
  - L2 regularization doesn't matter
- V6: Hidden layers \[128, 64, 64\], selu activation, 3 runs -> lb 0.95619
  - It seems that the architecture is somewhat better than before, but the two additional runs don't improve the lb score.
- V7: BatchNormalization improves the cv but not the lb, and almost doubles the running time. -> lb 0.95598
- V8: 10 folds, 90 epochs -> lb 0.95626
- V9: drop Soil_Type1 -> lb 0.95635
- V10: add Aspect180
  - cv score became worse
- V11: \[256, 128, 64\], other callback parameters
- V12: =V9 + oversampling -> lb 0.95559
- V13: oversampling after train-test split (failed)
- V14: clip some features -> lb 0.95615
- V15: early stopping on loss -> lb 0.95635 (same score as V9)
- V16: lecun_normal -> lb 0.95615
- V17: lecun_normal without BatchNormalization -> lb 0.95650
- V18: let's see the effect of a different seed
- V19: again another seed

In [None]:
import pandas as pd
import numpy as np
import pickle
import itertools
import gc
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from datetime import datetime
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder, minmax_scale
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input, InputLayer, BatchNormalization 


In [None]:
# Load the competition data
train_df = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')

# Drop some columns, clip some features (identically for train and test)
for df in [train_df, test_df]:
    df.drop(columns=['Soil_Type7', 'Soil_Type15'], inplace=True) # These features are always zero
    df.drop(columns=['Soil_Type1'], inplace=True) # Feature is useless according to permutation feature importance
    # Adding df['Aspect180'] = (df['Aspect'] + 180) % 360 was tried, but it makes the cv score worse
    for hillshade in ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]:
        df[hillshade] = df[hillshade].clip(0, 255)
    df["Aspect"] = df["Aspect"].clip(0, 360)

# Every column of the test set except the row id is a feature
features = [f for f in test_df.columns if f != 'Id' and f != 'Cover_Type']

# Reduce memory size: features become float32, Id and Cover_Type become int32.
# The integer columns are cast directly rather than via float32, because
# float32 represents integers exactly only up to 2**24.
feature_dtypes = {f: np.float32 for f in features}
train_df = train_df.astype({**feature_dtypes, 'Id': np.int32, 'Cover_Type': np.int32})
test_df = test_df.astype({**feature_dtypes, 'Id': np.int32})

# Show the imbalanced class distribution
print("The imbalanced class distribution:")
print((train_df.groupby('Cover_Type').Id.count() / len(train_df)).apply(lambda p: f"{p:.3%}"))

# Drop Cover_Type 5 (the class with only one element can be ignored)
train_df = train_df[train_df.Cover_Type != 5]

# Prepare for multiclass classification
le = LabelEncoder()
target = le.fit_transform(train_df.Cover_Type) # renumbers the 6 classes from 0 to 5


In [None]:
# Plot training history
def plot_history(history, *, n_epochs=None, plot_lr=False, plot_acc=True, title=None, bottom=None, top=None):
    """Plot the training history, optionally restricted to the last n_epochs epochs.

    Parameters
    ----------
    history : dict of lists with one entry per epoch. 'loss' is required;
        'val_loss' and 'val_acc' are plotted only if present; 'lr' is required
        if plot_lr is True.
    n_epochs : number of trailing epochs to show (None shows all epochs; a
        value larger than the number of recorded epochs shows all epochs, too).
    plot_lr : add a second y axis with the learning rate.
    plot_acc : add a second y axis with the validation accuracy.
    title : optional title of the diagram.
    bottom, top : optional limits for the loss axis.
    """
    plt.figure(figsize=(15, 6))
    total_epochs = len(history['loss'])
    # Clamp to 0: a negative start index would make the x values longer than the sliced y values
    from_epoch = 0 if n_epochs is None else max(0, total_epochs - n_epochs)
    epochs = np.arange(from_epoch, total_epochs)

    # Plot training and validation losses
    plt.plot(epochs, history['loss'][from_epoch:], label='Training loss')
    if 'val_loss' in history:
        val_loss = np.array(history['val_loss'])
        plt.plot(epochs, val_loss[from_epoch:], label='Validation loss')
        best_epoch = np.argmin(val_loss)
        best_val_loss = val_loss[best_epoch]
        if best_epoch >= from_epoch:
            plt.scatter([best_epoch], [best_val_loss], c='r', label=f'Best val_loss = {best_val_loss:.5f}')
        if best_epoch > 0:
            # Lowest val_loss among the epochs before the best epoch
            almost_epoch = np.argmin(val_loss[:best_epoch])
            almost_val_loss = val_loss[almost_epoch]
            if almost_epoch >= from_epoch:
                plt.scatter([almost_epoch], [almost_val_loss], c='orange', label='Second best val_loss')
    if bottom is not None: plt.ylim(bottom=bottom)
    if top is not None: plt.ylim(top=top)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc='lower left')
    if title is not None: plt.title(title)

    # Plot validation metrics (skipped if no validation accuracy was recorded,
    # in the same way as a missing validation loss is skipped above)
    if plot_acc and 'val_acc' in history:
        val_acc = np.array(history['val_acc'])
        best_epoch = np.argmax(val_acc)
        best_val_acc = val_acc[best_epoch]
        ax2 = plt.gca().twinx()
        ax2.plot(epochs, val_acc[from_epoch:], color='r', label='Validation accuracy')
        if best_epoch >= from_epoch:
            ax2.scatter([best_epoch], [best_val_acc], c='r', label=f'Best val_acc = {best_val_acc:.5f}')
        ax2.set_ylabel('Accuracy')
        ax2.legend(loc='center right')

    # Plot learning rate
    if plot_lr:
        ax2 = plt.gca().twinx()
        ax2.plot(epochs, np.array(history['lr'][from_epoch:]), color='g', label='Learning rate')
        ax2.set_ylabel('Learning rate')
        ax2.legend(loc='upper right')

    plt.show()
    

# Training

In [None]:
#%%time

# --- Experiment scope ---
# Set to True for a quick experiment and to False for full cross-validation
SINGLE_FOLD = False
# Should be 1. Increase the number of runs only if you want see how the result depends on the random seed
RUNS = 1
# Number of cross-validation folds
FOLDS = 10

# --- Training parameters ---
# Increase the number of epochs if the training curve indicates that a better result is possible
EPOCHS = 90
# If you set this too high, the notebook will crash (out of memory)
BATCH_SIZE = 1024

# --- Logging ---
# Set to 0 for less output, or to 2 for more output
VERBOSE = 0

def my_model(X):
    """Return a compiled Keras model for multiclass classification.

    X is used only for its last dimension, which determines the number of
    input features. The output layer has one softmax unit per class of the
    global label encoder `le`.
    """
    model = Sequential()
    # input_shape must be a tuple: note the trailing comma, because
    # (X.shape[-1]) without the comma is just a parenthesized int
    model.add(InputLayer(input_shape=(X.shape[-1],)))

    # Add the hidden layers
    for size in [128, 64, 64]:
        model.add(Dense(size, kernel_initializer='lecun_normal', activation='selu'))
        #model.add(BatchNormalization())
        #model.add(LayerNormalization()) # LayerNormalization gives a similar score increase as BatchNormalization, but is slower
        #model.add(Dropout(rate=0.1)) # When I tried dropout, accuracy became worse.

    # Add the final layer with the correct activation function
    # Adding kernel_regularizer=tf.keras.regularizers.l2(l2=0.03) didn't make a difference
    model.add(Dense(len(le.classes_), activation='softmax'))

    # Compile the model; the labels are integer class indices, hence the sparse loss
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    return model

# Make the results reproducible by seeding numpy and tensorflow
np.random.seed(202100)
tf.random.set_seed(202100)

total_start_time = datetime.now()
# score_list: one (accuracy, duration) tuple per trained model
# test_pred_list: one array of predicted test probabilities per trained model
# history_list: one Keras history dict per trained model
score_list, test_pred_list, history_list = [], [], []
# One out-of-fold probability matrix per run; -1.0 marks the rows which have not received a prediction yet
oof_list = [np.full((len(train_df), len(le.classes_)), -1.0, dtype='float32') for run in range(RUNS)]
for run in range(RUNS):
    # random_state is fixed, i.e. every run uses the same splits
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, y=train_df.Cover_Type)):
        print(f"Fold {run}.{fold}")
        start_time = datetime.now()
        # train_idx and val_idx are positional indexes, hence iloc
        X_tr = train_df.iloc[train_idx]
        X_va = train_df.iloc[val_idx]
        y_tr = le.transform(X_tr.Cover_Type)
        y_va = le.transform(X_va.Cover_Type)
        X_tr = X_tr[features]
        X_va = X_va[features]

        # Scale: fit the scaler on the training part only and apply it to the validation part
        preproc = StandardScaler() # I tried QuantileTransformer, but StandardScaler seems to be better by 0.005
        X_tr = preproc.fit_transform(X_tr)
        X_va = preproc.transform(X_va)

        # Define two callbacks: ReduceLROnPlateau, EarlyStopping
        # Halve the learning rate whenever val_loss has not improved for 4 epochs
        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, 
                               patience=4, verbose=1)

        # Stop training when val_loss has not improved for 10 epochs
        es = EarlyStopping(monitor="val_loss", patience=10, 
                           verbose=VERBOSE, mode="min", 
                           restore_best_weights=True)

        # Train and save the model
        model = my_model(X_tr)
        history = model.fit(X_tr, y_tr, 
                            validation_data=(X_va, y_va), 
                            epochs=EPOCHS,
                            verbose=VERBOSE,
                            batch_size=BATCH_SIZE, 
                            validation_batch_size=len(X_va), # the whole validation set in one batch
                            shuffle=True,
                            callbacks=[lr, es])
        history_list.append(history.history)
        model.save(f"model{run}.{fold}")
        
        # Inference for validation with the final weights of the fold
        # (because of restore_best_weights=True these may be the weights of the best epoch rather than of the last epoch)
        y_va_pred = model.predict(X_va, batch_size=len(X_va))
        oof_list[run][val_idx] = y_va_pred # keep the probabilities
        y_va_pred = np.argmax(y_va_pred, axis=1) # convert the probabilities to class indices

        # Evaluation
        accuracy = accuracy_score(y_va, y_va_pred)
        score_list.append((accuracy, datetime.now() - start_time))
        # The slice [-12:-7] of the timedelta string keeps the minutes and seconds
        print(f"Fold {run}.{fold} | {str(datetime.now() - start_time)[-12:-7]} | Epochs: {len(history_list[-1]['loss'])} | Accuracy: {accuracy:.5f}")
        # Plot the training curves only for the first run
        if run == 0: plot_history(history_list[-1], title=f"Accuracy: {accuracy:.5f}")

        # Inference for test: keep the predicted probabilities
        # (the test set is scaled with the scaler of the current fold)
        test_pred_list.append(model.predict(preproc.transform(test_df[features]), batch_size=BATCH_SIZE))
        
        # Clean up the memory (it seems that Keras doesn't clean up everything at keyboard interrupts)
        del model, y_va_pred
        gc.collect()
        
        # For a quick experiment, train only the first fold of every run
        if SINGLE_FOLD: break

# Save all oof and test predictions to later determine ensemble weights
with open('oof_list.pickle', 'wb') as handle: pickle.dump(oof_list, handle)
with open('test_pred_list.pickle', 'wb') as handle: pickle.dump(test_pred_list, handle)
    
total_time = datetime.now() - total_start_time


# Evaluation

In [None]:
# Overall evaluation
# A negative entry in the oof matrix means that some rows never received a prediction
if oof_list[0].min() >= 0: # Can only evaluate if all folds have been done (set SINGLE_FOLD to False)

    # Evaluate the overall cv score (averaged over the runs)
    run_accuracies = [accuracy_score(train_df.Cover_Type, le.inverse_transform(np.argmax(oof, axis=1)))
                      for oof in oof_list]
    print(f"Single-model Accuracy: {sum(run_accuracies) / len(oof_list):.5f}")

    # Evaluate the number of epochs and the time taken
    epoch_counts = [len(h['loss']) for h in history_list]
    print(f"Average epochs: {sum(epoch_counts) / len(history_list):.0f}")
    print(f"Maximum epochs: {max(epoch_counts)}")
    early_stop_count = sum([n < EPOCHS for n in epoch_counts])
    print(f"Stopped early in {early_stop_count / len(history_list):.0%} of runs")
    print(f"Total elapsed time: {str(total_time)[-14:-7]} for {len(history_list)} trainings")
    print()

    # Show the confusion matrix
    def plot_confusion_matrix(cm, classes, cm_type='recall'):
        """Draw the confusion matrix cm into the current figure.

        cm_type selects the normalization: 'recall' (rows sum to 100 %),
        'precision' (columns sum to 100 %), 'accuracy' (matrix sums to 100 %)
        or 'count' (raw sample counts). Any other value raises a ValueError.
        """
        def darkened(matrix):
            # Min-max scale all cells together and make the low-to-medium cells darker
            scaled = minmax_scale(matrix.reshape(-1, 1))
            return scaled.reshape(matrix.shape[0], matrix.shape[1]) ** 0.3

        if cm_type == 'recall':
            cm = cm / cm.sum(axis=1).reshape(-1, 1)
            colors, cell_format = cm, '.0%'
            heading = 'Confusion matrix (sum of every row is 100 %, diagonal shows recall)'
        elif cm_type == 'precision':
            cm = cm / cm.sum(axis=0).reshape(1, -1)
            colors, cell_format = cm, '.0%'
            heading = 'Confusion matrix (sum of every column is 100 %, diagonal shows precision)'
        elif cm_type == 'accuracy':
            cm = cm / cm.sum()
            colors, cell_format = darkened(cm), '.2%'
            heading = 'Confusion matrix (sum of matrix is 100 %, sum of diagonal shows accuracy)'
        elif cm_type == 'count':
            colors, cell_format = darkened(cm), 'd'
            heading = 'Confusion matrix (sample counts)'
        else:
            raise ValueError(f'Illegal value for parameter cm_type: {cm_type}')

        plt.title(heading, fontweight='bold', pad=15)
        plt.imshow(colors, interpolation='nearest', cmap=plt.cm.Blues) # or cmap='hot'
        #plt.colorbar()
        tick_positions = np.arange(len(classes))
        plt.xticks(tick_positions, classes, rotation=0)
        plt.yticks(tick_positions, classes)

        # Write every value into its cell: white text on dark cells, black text on light cells
        thresh = colors.max() / 2.
        for (row, col), value in np.ndenumerate(cm):
            text_color = "white" if colors[row, col] > thresh else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

        plt.ylabel('True label', fontweight='bold')
        plt.xlabel('Predicted label', fontweight='bold')
        plt.tight_layout()

    # The confusion matrices and the report use the oof predictions of the first run
    oof_labels = le.inverse_transform(np.argmax(oof_list[0], axis=1))
    cm = confusion_matrix(train_df.Cover_Type, oof_labels)
    class_labels = le.inverse_transform(np.arange(len(le.classes_)))
    for cm_type in ('precision', 'recall', 'accuracy', 'count'):
        plt.figure(figsize=(11, 9))
        plot_confusion_matrix(cm, class_labels, cm_type=cm_type)
        plt.show()

    # Print the classification report
    print(classification_report(train_df.Cover_Type, oof_labels))

In [None]:
# Create the submission file
sub = test_df[['Id']].copy()
# Soft voting: add the predicted probabilities of all models in the ensemble,
# take the most probable class and convert it back to the original Cover_Type label
sub['Cover_Type'] = le.inverse_transform(np.argmax(sum(test_pred_list), axis=1))
sub.to_csv('submission.csv', index=False)

# Plot the distribution of the test predictions next to the distribution of the train labels
plt.figure(figsize=(10,3))
cover_type_bins = np.linspace(0.5, 7.5, 8) # one bin of width 1 for each Cover_Type from 1 to 7
plt.hist(train_df['Cover_Type'], bins=cover_type_bins, density=True, label='Train labels')
plt.hist(sub['Cover_Type'], bins=cover_type_bins, density=True, rwidth=0.7, label='Test predictions')
plt.xlabel('Cover_Type')
plt.ylabel('Frequency')
# With density=True and bins of width 1, the bar heights are fractions between 0 and 1.
# xmax=1 makes the formatter display 1.0 as 100 % (the default xmax=100 would display it as 1 %).
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.legend()
plt.show()

sub.head()


Now it's your turn: Change the model architecture, add features, ... and see what happens!