This is my continuation of our baseline model: https://www.kaggle.com/tqrahman/baseline-with-tf-and-feature-engineering

In this notebook, we will focus on the neural network architecture
* Version 4
    * implemented PCA to add first five components as additional features
* Version 6
    * increased the network size
    * added Batch Normalization
    * added Skip Connections
* Version 8
    * applied clustering
    * removed frequency encoding
    * feature engineered distance-to-hydrology

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix

import tensorflow as tf

In [None]:
# Load the train and test CSV files into DataFrames

train, test = (
    pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv'),
)

In [None]:
# Function to reduce memory

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Signed-integer columns are cast to the narrowest of int8 / int16 / int32
      whose range contains the column's min and max (bounds are inclusive, so
      a column spanning exactly -128..127 becomes int8).
    * Float columns wider than 32 bits are cast to float32 when their min and
      max lie strictly inside the float32 range (precision may be lost).
    * Object and pandas string columns are converted to ``category``.
    * Every other dtype (bool, unsigned ints, datetimes, categoricals,
      extension dtypes, ...) is left untouched.

    A one-line before/after memory summary is printed.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes are downcast; extension dtypes are skipped
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                # Never "downcast" to a type that is not actually narrower
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # float16 is deliberately not used (too little precision)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting size
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease))
    return df

In [None]:
# Shrink the column dtypes of both tables (train first, then test)

train, test = reduce_mem_usage(train), reduce_mem_usage(test)

In [None]:
# Viewing the number of observations per category in target variable.
# value_counts() lists every distinct Cover_Type with its row count,
# most frequent first.

train['Cover_Type'].value_counts()

In [None]:
# Collect the soil and wilderness column names in a single pass over the
# columns: names containing 'Soil_' go to soil_columns, names containing
# 'Wild' go to wild_columns
soil_columns, wild_columns = [], []
for name in train.columns:
    if 'Soil_' in name:
        soil_columns.append(name)
    if 'Wild' in name:
        wild_columns.append(name)

# Categorical columns: soil columns followed by wilderness columns
cat_cols = [*soil_columns, *wild_columns]

## Feature Engineering

There are many dummy variables for 'soil_type'. We should check to see if each observation has one type or multiple types of soil. 

In [None]:
# Checking if an observation has more than one soil_type:
# the row-wise sum of the soil dummy columns is the number of soil types
# flagged for that row, and value_counts() tallies how many rows have each sum.

train[soil_columns].sum(axis=1).value_counts()

Based on this value count, there are some observations that have multiple soil_types. This might be important for the model to know.

In [None]:
# Store, for every row of both tables, how many soil dummy columns are set

for frame in (train, test):
    frame['sum_soil_types'] = frame[soil_columns].sum(axis=1)

### Clustering data based on soil_types

In [None]:
from sklearn.cluster import KMeans

# Cluster the rows on their soil dummy columns and keep the cluster id as a feature.
# KMeans starts from random centroids, so the seed is fixed: without it the
# cluster ids (and therefore this feature) change from run to run.
kmeans = KMeans(n_clusters=7, random_state=0)
train['cluster'] = kmeans.fit_predict(train[soil_columns])
# The test rows are assigned to the centroids learned on train
test['cluster'] = kmeans.predict(test[soil_columns])

### Soil Variables
It will be a good idea to remove 'Soil_Type7' and 'Soil_Type15' because they are 0 for all observations. Therefore they are not informative and might add noise to the model.

In [None]:
# Drop 'Soil_Type7' and 'Soil_Type15' from both tables, in place

for frame in (train, test):
    frame.drop(columns=['Soil_Type7', 'Soil_Type15'], inplace=True)

### Hillshade

Hillshade is an "image" that ranges from 0-255. However, some of the hillshade values are less than 0 or greater than 255. We will make the assumption that those were data entry errors and will clip them. If a value is less than 0, we will set it to zero. If it is greater than 255, we will set it to 255.

Some additional thoughts:
* Set values under 0 to 0 and values greater than 255 to 255 for all Hillshade variables
* Is clipping the best way to proceed? Try just scaling instead of clipping
* Remove the hillshade data that are NOT within the range between 0, 255

In [None]:
# Limit every Hillshade column to the range 0..255 in both tables

hillshade_columns = [name for name in train.columns if 'Hillshade' in name]

# clip() works column-wise on the whole selection, so no per-column loop is needed
train[hillshade_columns] = train[hillshade_columns].clip(lower=0, upper=255)
test[hillshade_columns] = test[hillshade_columns].clip(lower=0, upper=255)

### Aspect
Aspect is in degrees. It seems that it should be between 0 and 360 degrees. However, some values fall below this range and some exceed it.

In [None]:
# Changing the range of Aspect to fall between 0 and 359.
# The modulo is applied to the whole column at once instead of calling a
# Python lambda once per row. Negative angles wrap to positive ones because
# the result of % takes the sign of the divisor.

train['Aspect'] = train['Aspect'] % 360
test['Aspect'] = test['Aspect'] % 360

In [None]:
# Name of the label column, and every other column except the row identifier

target = 'Cover_Type'
features = [name for name in train.columns if name != 'Id' and name != target]

In [None]:
# Replace the target values by consecutive integer codes (0, 1, 2, ...);
# the fitted encoder is kept so predictions can be mapped back later

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(train[target])
train[target] = le.transform(train[target])

In [None]:
# Drop the rows whose encoded target is 4 (tree type '5' before LabelEncoding)
# and renumber the index from 0 so later positional concatenation lines up

keep_mask = train['Cover_Type'].ne(4)
train = train[keep_mask].reset_index(drop=True)
# train = train.loc[train['Cover_Type'] != 3,].reset_index(drop=True)

In [None]:
# Recompute the column lists from the current columns of train
# (two soil columns have been dropped since the lists were first built)

# Soil columns: names containing the literal text 'Soil_'
soil_columns = train.columns[train.columns.str.contains('Soil_', regex=False)].tolist()

# Wilderness columns: names containing the literal text 'Wild'
wild_columns = train.columns[train.columns.str.contains('Wild', regex=False)].tolist()

# Categorical columns
cat_cols = [*soil_columns, *wild_columns]

## Data Processing using PCA 

In [None]:
# Numerical features: everything that is not categorical, the id or the target

_non_numeric = set(cat_cols) | {'Id', 'Cover_Type'}
num_columns = [name for name in train.columns if name not in _non_numeric]

In [None]:
# Standardise the numerical columns before PCA: the scaler statistics are
# learned on train only and then applied to both tables

scaler = StandardScaler().fit(train[num_columns])
pca_train = scaler.transform(train[num_columns])
pca_test = scaler.transform(test[num_columns])

In [None]:
# Fit a PCA with all components and print the share of variance each one explains

pca = PCA().fit(pca_train)
print(pca.explained_variance_ratio_)

In [None]:
# Scree plot: explained-variance ratio against the 1-based component number

PC_values = np.arange(1, pca.n_components_ + 1)
plt.plot(
    PC_values,
    pca.explained_variance_ratio_,
    marker='o',
    linestyle='-',
    linewidth=2,
    color='blue',
)
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.title('Scree Plot')
plt.show()

In [None]:
# Fitting a PCA that keeps only the first 4 principal components.
# The projection is learned from the scaled training data and the same
# projection is then applied to the scaled test data.

pca = PCA(n_components=4)
pca_train = pca.fit_transform(pca_train)
pca_test = pca.transform(pca_test)

In [None]:
# Adding the 4 PCA components to the data as extra columns.
# pd.concat(axis=1) aligns rows on the index, so each component frame is given
# the index of the table it is attached to; a default 0..n-1 index would
# silently produce NaN-padded rows if train/test were ever indexed differently.

pca_feature_names = ['comp1', 'comp2', 'comp3', 'comp4']

train = pd.concat(
    [train, pd.DataFrame(pca_train, columns=pca_feature_names, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test, pd.DataFrame(pca_test, columns=pca_feature_names, index=test.index)],
    axis=1,
)

## Data Processing for Neural Network

#### Frequency Encoding for 'Soil' columns
There are a lot of 'Soil_' types. They may add a lot of noise to the model. A possibility is to use frequency encoding instead of dummying the variable.

In [None]:
# # Getting the all the soil column names
# soil_columns = [col for col in train.columns if 'Soil_' in col]

# # Undummying the Soil_types
# train['soil_type'] = train[soil_columns].idxmax(axis=1)
# test['soil_type'] = test[soil_columns].idxmax(axis=1)

# # Calculating the fequency encoding
# soil_map = pd.Series(train['soil_type'].value_counts()/train.shape[0]).to_dict()

# # Applying the frequency encoding
# train['soil_type'] = train['soil_type'].map(soil_map)
# test['soil_type'] = test['soil_type'].map(soil_map)

# # Dropping all the 'Soil-Type' columns
# train = train.drop(soil_columns, axis=1)
# test = test.drop(soil_columns, axis=1)

### Wilderness Variables
#### Frequency Encoding the 'Wilderness' column too

In [None]:
# Checking if an observation has more than one wilderness type:
# the row-wise sum of the wilderness dummy columns is the number of wilderness
# areas flagged for that row, and value_counts() tallies how many rows have each sum.

train[wild_columns].sum(axis=1).value_counts()

In [None]:
# Cluster the rows on their wilderness dummy columns and keep the cluster id as a feature.
# The seed is fixed because KMeans starts from random centroids; without it
# the cluster ids change from run to run.
wild_means = KMeans(n_clusters=7, random_state=0)
train['wild_cluster'] = wild_means.fit_predict(train[wild_columns])
test['wild_cluster'] = wild_means.predict(test[wild_columns])

In [None]:
# Cluster the rows on all categorical (soil + wilderness) dummy columns.
# The seed is fixed because KMeans starts from random centroids; without it
# the cluster ids change from run to run.
total_means = KMeans(n_clusters=7, random_state=0)
train['total_cluster'] = total_means.fit_predict(train[cat_cols])
test['total_cluster'] = total_means.predict(test[cat_cols])

In [None]:
# Store, for every row, how many wilderness dummy columns are set

train['sum_wild_types'] = train.loc[:, wild_columns].sum(axis='columns')
test['sum_wild_types'] = test.loc[:, wild_columns].sum(axis='columns')

### Distance to Hydrology
There is a vertical and a horizontal distance. It might be good to combine the two components into a single Euclidean distance.

In [None]:
# Function finding the Euclidean distance

def combine_components(row):
    """Return the Euclidean distance to hydrology.

    Computes sqrt(horizontal**2 + vertical**2) from the
    'Horizontal_Distance_To_Hydrology' and 'Vertical_Distance_To_Hydrology'
    entries of ``row``.

    Args:
        row: Anything indexable by those two keys: a single row (Series or
            dict of scalars), or a whole DataFrame, in which case the result
            is computed element-wise for every row.

    Returns:
        The distance as float64 (a scalar for scalar entries, a Series for
        Series entries).
    """
    # The computation is forced to float64: squaring narrow integer inputs
    # (e.g. int16) in their own dtype would overflow and give wrong distances.
    return np.hypot(
        row['Horizontal_Distance_To_Hydrology'],
        row['Vertical_Distance_To_Hydrology'],
        dtype=np.float64,
    )

In [None]:
# Applying the function to whole columns at once instead of row by row
# (DataFrame.apply(axis=1) builds a Series for every single row, which is very
# slow on a large table). The two distance columns are cast to float64 first
# so that squaring cannot overflow a narrow integer dtype.

hydrology_columns = ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology']

train['distance_to_hydrology'] = combine_components(train[hydrology_columns].astype('float64'))
test['distance_to_hydrology'] = combine_components(test[hydrology_columns].astype('float64'))

In [None]:
# # Extracting winderness columns
# wild_columns = [col for col in train.columns if 'Wild' in col]

# # Undummying the wilderness_types
# train['wild_type'] = train[wild_columns].idxmax(axis=1)
# test['wild_type'] = test[wild_columns].idxmax(axis=1)

# # Calculating the fequency encoding
# wild_map = pd.Series(train['wild_type'].value_counts()/train.shape[0]).to_dict()

# # Applying the frequency encoding
# train['wild_type'] = train['wild_type'].map(wild_map)
# test['wild_type'] = test['wild_type'].map(wild_map)

# # Dropping all the 'Soil-Type' columns
# train = train.drop(wild_columns, axis=1)
# test = test.drop(wild_columns, axis=1)

In [None]:
# Shrink the dtypes again, now that new columns have been added to both tables
train = reduce_mem_usage(df=train)
test = reduce_mem_usage(df=test)

## Processing for the Model

In [None]:
# Columns that are NOT fed to the network: the raw soil / wilderness dummies,
# the id, the target and the two raw hydrology distances

columns = [
    *soil_columns,
    'Id',
    'Cover_Type',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    *wild_columns,
]

# Every remaining column, in its original order, is a model feature
features = train.columns[~train.columns.isin(columns)].tolist()
target = 'Cover_Type'

In [None]:
# # Splitting the data into train and test splot

# X_train, X_valid, y_train, y_valid = train_test_split(
#     train[features], 
#     train[target],
#     stratify=train[target],
#     test_size=0.1, 
#     random_state=0
# )
# print(f'Shape of X_train: {X_train.shape}')
# print(f'Shape of y_train: {y_train.shape}')
# print(f'Shape of X_valid: {X_valid.shape}')
# print(f'Shape of y_valid: {y_valid.shape}')

In [None]:
# # Scaling the data by fitting on X_train and scaling the rest

# scaler = StandardScaler()

# X_train = scaler.fit_transform(X_train)
# X_valid = scaler.transform(X_valid)
# # t = scaler.transform(test[features])

### Tensorflow Model

In [None]:
# Creating the model and compiling

def get_model(inputs):
    """Build and compile the dense network with concatenation skip connections.

    The network stacks three blocks of
    Dense(256, relu) -> BatchNorm -> Dense(128, relu) -> BatchNorm:

    * block 1 reads the raw input;
    * block 2 reads the raw input concatenated with the output of block 1;
    * block 3 reads the outputs of block 1 and block 2 concatenated.

    A 7-unit softmax layer on top of block 3 produces the class probabilities.

    Args:
        inputs: Number of input features (an int). A ready-made shape tuple or
            list is also accepted and used as given.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 1e-2), sparse
        categorical cross-entropy loss and the accuracy metric named 'acc'.
    """
    # Drop any graph state left over from a previously built model
    tf.keras.backend.clear_session()

    # ``(inputs)`` is just the int itself, not a tuple; build a real shape
    # tuple, since a bare int is not a valid shape in every Keras version.
    input_shape = tuple(inputs) if isinstance(inputs, (tuple, list)) else (inputs,)
    input_layer = tf.keras.Input(shape=input_shape)

    def dense_block(tensor):
        # Dense(256) -> BatchNorm -> Dense(128) -> BatchNorm
        tensor = tf.keras.layers.Dense(256, activation='relu')(tensor)
        tensor = tf.keras.layers.BatchNormalization()(tensor)
        tensor = tf.keras.layers.Dense(128, activation='relu')(tensor)
        return tf.keras.layers.BatchNormalization()(tensor)

    ## Block 1 (layers 1-2)
    x = dense_block(input_layer)

    ## Skip connection: raw input + block 1, feeding block 2 (layers 3-4)
    conn1 = tf.keras.layers.Concatenate()([input_layer, x])
    x2 = dense_block(conn1)

    ## Skip connection: block 1 + block 2, feeding block 3 (layers 5-6)
    conn2 = tf.keras.layers.Concatenate()([x, x2])
    x3 = dense_block(conn2)

    ## Output layer: one probability per class
    output = tf.keras.layers.Dense(7, activation='softmax')(x3)

    model = tf.keras.Model(inputs=input_layer, outputs=output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
        loss="sparse_categorical_crossentropy",
        metrics=['acc']
    )

    ## Returning the model
    return model

In [None]:
# # Function that creates a TF sequential model

# def get_model(inputs):
#     tf.keras.backend.clear_session()

#     ## Creating a Sequential Model
#     model = tf.keras.Sequential([
#         tf.keras.layers.Dense(512, input_shape=(None,inputs), activation='relu'),
#         tf.keras.layers.Dense(256, activation='relu'),
#         tf.keras.layers.Dense(128, activation='relu'),
#         tf.keras.layers.Dense(64, activation='relu'),
#         tf.keras.layers.Dense(7, activation = 'softmax')
#     ])
    
#     ## Compile 
#     model.compile(
#         optimizer="adam",
#         loss="sparse_categorical_crossentropy",
#         metrics=['acc']
#     )
    
#     return model

In [None]:
# Training hyper-parameters

EPOCHS = 50
BATCH_SIZE = 2048

# Settings shared by both callbacks: watch 'val_acc' (higher is better), no logging
_watch_val_acc = dict(monitor='val_acc', mode='max', verbose=0)

# Multiply the learning rate by 0.5 after 3 epochs without improvement
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=3,
    **_watch_val_acc,
)

# Stop after 6 epochs without an improvement of at least 1e-06 and
# restore the weights of the best epoch
early_stopping = tf.keras.callbacks.EarlyStopping(
    min_delta=1e-06,
    patience=6,
    baseline=None,
    restore_best_weights=True,
    **_watch_val_acc,
)

In [None]:
# # K-fold Cross Validation model evaluation

# X = X_train
# y = y_train.values

# FOLDS = 5
# cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

# test_preds = np.zeros((1, 1))
# scores = []
# for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    
#     ## Extracting the training and validation set from a fold
#     X_t, X_v = X[train_idx], X[val_idx]
#     y_t, y_v = y[train_idx], y[val_idx]
    
#     print('------------------------------------------------------------------------')
#     print(f'Training for fold {fold} ...')
    
#     ## Creating a model
#     model = get_model(X_train.shape[1])

#     ## Fit data to model
#     model.fit(
#         X_t,
#         y_t,
#         validation_data=(X_v, y_v),
#         epochs=EPOCHS,
#         batch_size=BATCH_SIZE,
#         verbose=2,
#         callbacks=[plateau, early_stopping]
#     )
    
#     ## Predicting using the model
#     y_pred = np.argmax(model.predict(X_v), axis=1)
#     score = accuracy_score(y_v, y_pred)
#     scores.append(score)
#     print(f'>Fold: {fold} --> Accuracy: {score}')

In [None]:
# # Printing the results from K-Fold

# print(f'Accuracy for each fold: {scores}')
# print(f'Mean of all the folds: {np.mean(scores):.4f}')
# print(f'Standard Deviation of the folds: {np.std(scores):.4f}')

After running K-Fold cross validation, the mean was .9485 with a 0.0002 standard deviation. It seems like the model is fitting well. We will rerun the model but with all the data instead of splitting it into train and validation set.

In [None]:
# # Creating a model

# model = get_model(train[features].shape[1])

# # Fit data to model
# model.fit(
#     X_train,
#     y_train,
#     validation_data=(X_valid, y_valid),
#     epochs=EPOCHS,
#     batch_size=BATCH_SIZE,
#     verbose=2,
#     callbacks=[plateau, early_stopping]
# )

## Model Analysis
### Confusion Matrix

In [None]:
# y_preds = model.predict(X_valid)
# y_preds = y_preds.argmax(axis=1)
# cm = confusion_matrix(y_preds, y_valid)

In [None]:
# ## Get Class Labels
# labels = le.classes_
# class_names = labels

# # Plot confusion matrix in a beautiful manner
# fig = plt.figure(figsize=(12, 12))
# ax= plt.subplot()
# sns.heatmap(cm, annot=True, ax = ax, fmt = 'g'); #annot=True to annotate cells
# # labels, title and ticks
# ax.set_xlabel('Predicted', fontsize=20)
# ax.xaxis.set_label_position('bottom')
# plt.xticks(rotation=90)
# ax.xaxis.set_ticklabels(class_names, fontsize = 10)
# ax.xaxis.tick_bottom()

# ax.set_ylabel('True', fontsize=20)
# ax.yaxis.set_ticklabels(class_names, fontsize = 10)
# plt.yticks(rotation=0)

# plt.title('Refined Confusion Matrix', fontsize=20)

## Predictions

In [None]:
# Standardise the model features: statistics are learned on train only,
# then the same transformation is applied to train and test

scaler = StandardScaler().fit(train[features])

train_scaled = scaler.transform(train[features])
test_scaled = scaler.transform(test[features])

In [None]:
# Creating a model

model = get_model(train[features].shape[1])

# This final fit uses all the data and passes no validation set, so callbacks
# that monitor 'val_acc' would never find their metric and would do nothing.
# Equivalent callbacks are therefore built here that monitor the training
# accuracy ('acc') instead.
final_callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='acc',
        factor=0.5,
        patience=3,
        verbose=0,
        mode='max',
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='acc',
        min_delta=1e-06,
        patience=6,
        verbose=0,
        mode='max',
        restore_best_weights=True,
    ),
]

# Fit data to model
model.fit(
    train_scaled,
    train[target],
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=2,
    callbacks=final_callbacks
)

In [None]:
# Predicting on the test set.
# The model ends in a 7-unit softmax layer, so every row of preds holds
# 7 class probabilities.

preds = model.predict(test_scaled)

In [None]:
# Take the most probable class of every row and map the encoded class
# back to the original Cover_Type value with the fitted label encoder

final_preds = le.inverse_transform(np.argmax(preds, axis=1))

In [None]:
# Build the two-column submission table (Id, Cover_Type) and write it
# to 'submission.csv' without the index column

submission = test[['Id']].assign(Cover_Type=final_preds)
submission.to_csv('submission.csv', index=False)