In [None]:
import numpy as np
import pandas as pd
from IPython.display import display


# Never truncate the column list when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Read the training table, the test table and the submission template, in that order.
train_df, test_df, submission_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-dec-2021/train.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
        "../input/tabular-playground-series-dec-2021/sample_submission.csv",
    )
]

In [None]:
# Preview the first rows of the training table.
display(train_df.head())

In [None]:
# Remove the "Id" column from the training frame in place.
train_df.drop(columns=["Id"], inplace=True)

In [None]:
# List the columns that hold a single distinct value, first for train, then for test.
for frame in (train_df, test_df):
    is_constant = frame.nunique() == 1
    print(frame.columns[is_constant])

In [None]:
# Drop the two soil-type indicator columns from both frames, in place.
for frame in (train_df, test_df):
    frame.drop(columns=["Soil_Type7", "Soil_Type15"], inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Fit the encoder on the target labels, then replace the column with its
# integer codes. `enc` is kept so the codes can be mapped back to labels later.
enc = LabelEncoder().fit(train_df["Cover_Type"])
train_df["Cover_Type"] = enc.transform(train_df["Cover_Type"])

In [None]:
# Shorter aliases for the four distance columns.
new_names = {
    "Horizontal_Distance_To_Hydrology": "x_dist_hydrlgy",
    "Vertical_Distance_To_Hydrology": "y_dist_hydrlgy",
    "Horizontal_Distance_To_Roadways": "x_dist_rdwys",
    "Horizontal_Distance_To_Fire_Points": "x_dist_firepts",
}

# Apply the same renaming to both frames, in place.
for frame in (train_df, test_df):
    frame.rename(columns=new_names, inplace=True)

Aspect: values in degrees, ranging from 0 to 359.

In [None]:
# Show the observed Aspect range in each frame.
# String aggregation names are used because passing the Python builtins
# min/max to pandas aggregation is deprecated.
print(train_df["Aspect"].aggregate(["min", "max"]))
print(test_df["Aspect"].aggregate(["min", "max"]))

In [None]:
# Wrap Aspect into [0, 359]. Assigning the whole column avoids chained
# indexing (`df[col][mask] += ...`), which pandas may apply to a temporary copy.
# For values in [-360, 719] the modulo gives the same result as adding 360 to
# negatives and subtracting 360 from values above 359; values further out are
# wrapped as well.
for frame in (train_df, test_df):
    frame["Aspect"] = frame["Aspect"] % 360

In [None]:
# Re-check the Aspect range in each frame.
# String aggregation names are used because passing the Python builtins
# min/max to pandas aggregation is deprecated.
print(train_df["Aspect"].aggregate(["min", "max"]))
print(test_df["Aspect"].aggregate(["min", "max"]))

In [None]:
# Combine the horizontal and vertical hydrology distances into two new columns per frame.
for frame in (train_df, test_df):
    dx = frame["x_dist_hydrlgy"]
    dy = frame["y_dist_hydrlgy"]

    # Manhattan distance: |dx| + |dy|
    frame["mnhttn_dist_hydrlgy"] = dx.abs() + dy.abs()

    # Euclidean distance: sqrt(dx^2 + dy^2)
    frame["ecldn_dist_hydrlgy"] = (dx**2 + dy**2)**0.5

Hillshading computes surface illumination as values from 0 to 255, based on a given compass direction to the sun (azimuth) and a given angle of the sun above the horizon (altitude).

In [None]:
# Suffixes of the three Hillshade columns.
times = ["9am", "Noon", "3pm"]

# Show the observed range of every Hillshade column in both frames.
# String aggregation names are used because passing the Python builtins
# min/max to pandas aggregation is deprecated.
for time in times:
    print(train_df[f"Hillshade_{time}"].aggregate(["min", "max"]))
    print(test_df[f"Hillshade_{time}"].aggregate(["min", "max"]))

In [None]:
# Clamp every Hillshade column to [0, 255]. `clip` on the whole column
# replaces chained-indexing assignment (`df[col][mask] = ...`), which pandas
# may apply to a temporary copy instead of the frame.
for time in times:
    column = f"Hillshade_{time}"
    train_df[column] = train_df[column].clip(lower=0, upper=255)
    test_df[column] = test_df[column].clip(lower=0, upper=255)

In [None]:
# Re-check the range of every Hillshade column in both frames.
# String aggregation names are used because passing the Python builtins
# min/max to pandas aggregation is deprecated.
for time in times:
    print(train_df[f"Hillshade_{time}"].aggregate(["min", "max"]))
    print(test_df[f"Hillshade_{time}"].aggregate(["min", "max"]))

In [None]:
# Column groups used when building the aggregate features.
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
# Remaining soil-type / wilderness-area columns, selected by name prefix.
soil_features = list(filter(lambda name: name.startswith("Soil_Type"), train_df.columns))
wilderness_features = list(filter(lambda name: name.startswith("Wilderness_Area"), train_df.columns))


def addFeature(df):
    """Add four row-wise aggregate columns to ``df`` in place.

    Reads the module-level column lists ``soil_features``,
    ``wilderness_features`` and ``features_Hillshade``.

    Adds:
        Soil_Count: row sum of the soil-type columns.
        Wilderness_Area_Count: row sum of the wilderness-area columns.
        Hillshade_mean: row mean of the three Hillshade columns.
        amp_Hillshade: row max minus row min of the three Hillshade columns.

    Args:
        df: DataFrame containing all of the columns listed above.

    Returns:
        None; ``df`` is modified in place.
    """
    # Vectorized row sums instead of applying Python's sum() to every row.
    # Like mean/max/min below, these skip missing values.
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Area_Count"] = df[wilderness_features].sum(axis=1)

    hillshade = df[features_Hillshade]
    df["Hillshade_mean"] = hillshade.mean(axis=1)
    df['amp_Hillshade'] = hillshade.max(axis=1) - hillshade.min(axis=1)

In [None]:
# Add the aggregate feature columns to both frames (addFeature modifies its argument in place).
addFeature(train_df)
addFeature(test_df)

In [None]:
# Columns passed to the scaler; columns not listed here are left unscaled.
cols_to_scale = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts",
    "Soil_Count",
    "Wilderness_Area_Count",
    "Hillshade_mean",
    "amp_Hillshade"
]

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training frame only, then apply the
# same fitted scaler to both frames.
scaler = RobustScaler().fit(train_df[cols_to_scale])
train_df[cols_to_scale] = scaler.transform(train_df[cols_to_scale])
test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Cast the numeric columns of ``df`` to narrower dtypes, in place.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (limits inclusive).
    Every other numeric column is cast to float32 when its minimum and
    maximum lie within the float32 range, otherwise to float64.
    Columns whose dtype is not one of the listed numeric dtypes are left
    untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: when true, print the resulting memory usage in MB and the
            percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Pick the narrowest integer type that holds the whole range.
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero starting size so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.2f}% reduction)'.format(
                end_mem, reduction
            )
        )

    return df

In [None]:
# Downcast numeric columns of both frames to smaller dtypes.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Self-Normalizing Neural Networks (SNNs) are neural networks that automatically keep their activations at zero mean and unit variance (per neuron). This is accomplished through the use of the SELU activation function, which requires LeCun normal kernel initialization.

In [None]:
import tensorflow as tf

In [None]:
# Number of model inputs. "Id" is an identifier rather than a feature and may
# still be present in test_df at this point, so it is excluded from the count.
INPUT_SHAPE = (len([col for col in test_df.columns if col != "Id"]),)
# Number of distinct target classes in the training data.
NUM_CLASSES = train_df["Cover_Type"].nunique()

In [None]:
class self_norm_NN(tf.keras.Model):
    """Dense classifier: four hidden Dense layers (300, 200, 100, 50 units),
    each followed by BatchNormalization, then a softmax output layer.

    Args:
        inp_shape: passed as ``input_shape`` to the first Dense layer.
        num_classes: number of units in the softmax output layer.
        kernel_init: kernel initializer of the four hidden Dense layers.
        act: activation of the four hidden Dense layers.
    """

    def __init__(self, inp_shape, num_classes, kernel_init = "lecun_normal", act = "selu"):
        super().__init__()

        def hidden_dense(units, **extra):
            # All hidden layers share the same initializer and activation.
            return tf.keras.layers.Dense(
                units = units,
                kernel_initializer = kernel_init,
                activation = act,
                **extra
            )

        self.d1 = hidden_dense(300, input_shape = inp_shape)
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.d2 = hidden_dense(200)
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.d3 = hidden_dense(100)
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.d4 = hidden_dense(50)
        self.bn4 = tf.keras.layers.BatchNormalization()
        self.classifier = tf.keras.layers.Dense(units = num_classes, activation = "softmax")

    def call(self, input_tensor):
        """Run the input through the four Dense + BatchNormalization pairs and
        return the softmax output."""
        stages = (
            (self.d1, self.bn1),
            (self.d2, self.bn2),
            (self.d3, self.bn3),
            (self.d4, self.bn4),
        )
        x = input_tensor
        for dense, norm in stages:
            x = norm(dense(x))
        return self.classifier(x)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


# Stop once validation accuracy has not improved for 20 epochs and roll the
# model back to its best weights.
early_stop = EarlyStopping(monitor="val_accuracy", patience=20, restore_best_weights=True)

# Halve the learning rate when validation loss has not improved for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=5, factor=0.5)

callbacks = [reduce_lr, early_stop]

In [None]:
# Remove the "Id" column from the test frame in place.
test_df.drop(columns=["Id"], inplace=True)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


# Feature matrix and target vector as NumPy arrays.
X = train_df.drop("Cover_Type", axis=1).values
y = train_df["Cover_Type"].values
# Convert the test features once, instead of handing the DataFrame to
# predict() on every fold.
X_test = test_df.values

# Release the training frame; only the arrays above are used from here on.
del train_df

FOLDS = 20
EPOCHS = 200
BATCH_SIZE = 2048

# Running sum of the per-class probabilities predicted for the test rows,
# allocated with its final shape rather than relying on broadcasting.
test_preds = np.zeros((X_test.shape[0], NUM_CLASSES))
scores = []

cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # A fresh model per fold, so no weights carry over between folds.
    model = self_norm_NN(INPUT_SHAPE, NUM_CLASSES)

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    model.fit(
        X_train,
        y_train,
        validation_data = (X_val, y_val),
        epochs = EPOCHS,
        batch_size = BATCH_SIZE,
        callbacks = callbacks,
        verbose = True
    )

    # Predict with the training batch size; predict() otherwise falls back to
    # a much smaller default batch.
    y_pred = np.argmax(model.predict(X_val, batch_size = BATCH_SIZE), axis = 1)
    score = accuracy_score(y_val, y_pred)
    scores.append(score)

    # Accumulate this fold's test probabilities; the argmax over the summed
    # probabilities is taken after all folds have run.
    test_preds += model.predict(X_test, batch_size = BATCH_SIZE)
    print(f"Fold {fold} Accuracy: {score}")

print()
print(f"Mean Accuracy: {np.mean(scores)}")

In [None]:
# Pick the class with the highest accumulated score for each test row and map
# the encoded class indices back to the original Cover_Type labels.
test_preds = enc.inverse_transform(test_preds.argmax(axis=1))

submission_df['Cover_Type'] = test_preds
display(submission_df.head())

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission.csv", index = False)