In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from scipy.stats import mode

import matplotlib.pyplot as plt
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
import keras
import tensorflow as tf
from tensorflow import keras
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler,MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import gc

from scipy.spatial import distance
# Keep automatic garbage collection switched on for the whole session.
gc.enable()


def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats with three decimals whenever pandas renders a frame or series.
pd.set_option('display.float_format', _three_decimals)

In [None]:
# Load the train, test and sample-submission CSV files (read in that order).
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-dec-2021/train.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
        "../input/tabular-playground-series-dec-2021/sample_submission.csv",
    )
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        inspected. Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Float columns may be stored as float16 when their min/max fit; that keeps
    the range but reduces precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Pick the first (smallest) integer type whose inclusive range covers
            # the column; a value equal to the type's limit still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Same idea for floats; anything that fits neither narrower type
            # (including an all-NaN or infinite range) stays/becomes float64.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio so a zero-byte starting footprint cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Downcast the numeric columns of the train frame first, then the test frame.
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train, df_test))

In [None]:
# Remove the rows labelled Cover_Type 5 (the label has only 1 example).
# .copy() makes the filtered result an independent frame, so the in-place
# .loc / column assignments performed on df_train afterwards do not raise
# SettingWithCopyWarning.
df_train = df_train[df_train.Cover_Type != 5].copy()

As per the [data](https://www.kaggle.com/c/forest-cover-type-prediction/data) definitions, the following 3 columns should lie between 0 and 255. The data contains values < 0 and > 255, so they are clipped to that range:
- Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
- Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
- Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice

In [None]:
# Force every hillshade index into the 0..255 range, on train and test alike.
for col_name in ['Hillshade_9am','Hillshade_Noon','Hillshade_3pm']:
    df_train[col_name] = df_train[col_name].clip(lower=0, upper=255)
    df_test[col_name] = df_test[col_name].clip(lower=0, upper=255)


In [None]:
# Removing Soil_Type7 and Soil_Type15 since all values are 0
df_train = df_train[[col for col in df_train.columns if col not in ('Soil_Type15','Soil_Type7')]]
df_test = df_test[[col for col in df_test.columns if col not in ('Soil_Type15','Soil_Type7')]]

There are 7 continuous variables :
- Elevation - Elevation in meters
- Aspect - Aspect in degrees azimuth
- Slope - Slope in degrees
- Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features
- Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features
- Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway
- Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points

- Aspect is in degrees, which should be between 0 and 360, but we can observe values < 0 and > 360. Adding 360 to values < 0 and subtracting 360 from values > 360 brings them back into range.
- Calculating two new distances to Hydrology: Manhattan and Euclidean
- Creating new columns based on Wilderness Area and Soil Type

In [None]:
def feature_transform(df):
    """Add engineered features to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Aspect``, ``Horizontal_Distance_To_Hydrology`` and
        ``Vertical_Distance_To_Hydrology``; columns whose names start with
        ``Wilderness_Area`` / ``Soil_Type`` are summed row-wise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Aspect`` wrapped by one full turn and
        these columns added: ``N, NE, E, SE, S, SW, W, NW`` (0/1 compass
        sector flags), ``manh_dist``, ``euc_dist``,
        ``total_wilderness_area`` and ``total_soil_type``.
    """
    # Adjusting Aspect: shift values outside [0, 360] back by one full turn.
    df.loc[df['Aspect'] < 0, 'Aspect'] += 360
    df.loc[df['Aspect'] > 360, 'Aspect'] -= 360

    # Creating direction variables based on Aspect.
    aspect = df['Aspect']
    # North straddles the 0/360 wrap-around, so it needs two intervals.
    df['N'] = np.where(
        ((aspect >= 0) & (aspect <= 22.5)) | ((aspect >= 337.5) & (aspect <= 360)),
        1, 0)
    # Remaining 45-degree sectors as (exclusive lower, inclusive upper) bounds.
    sector_bounds = {
        'NE': (22.5, 67.5),
        'E': (67.5, 112.5),
        'SE': (112.5, 157.5),
        'S': (157.5, 202.5),
        'SW': (202.5, 247.5),
        'W': (247.5, 292.5),
    }
    for sector, (low, high) in sector_bounds.items():
        df[sector] = np.where((aspect > low) & (aspect <= high), 1, 0)
    # 337.5 itself belongs to N, hence the strict upper bound here.
    df['NW'] = np.where((aspect > 292.5) & (aspect < 337.5), 1, 0)

    # Creating new distances to hydrology. Widen to int32 first so squaring
    # and adding cannot overflow a downcast integer column.
    horizontal = df['Horizontal_Distance_To_Hydrology'].astype(np.int32)
    vertical = df['Vertical_Distance_To_Hydrology'].astype(np.int32)
    # Manhattan distance is |h| + |v|; |h + v| would let a negative vertical
    # distance cancel part of the horizontal one.
    df['manh_dist'] = np.abs(horizontal) + np.abs(vertical)
    df['euc_dist'] = np.sqrt(horizontal**2 + vertical**2)

    # Creating new columns based on Wilderness Area and Soil Type: number of
    # flags set per row.
    df['total_wilderness_area'] = df[[c for c in df.columns if c.startswith("Wilderness_Area")]].sum(axis=1)
    df["total_soil_type"] = df[[c for c in df.columns if c.startswith("Soil_Type")]].sum(axis=1)

    return df

In [None]:
# Columns handled as continuous (clipped and scaled) rather than categorical.
cont_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'manh_dist',
    'euc_dist',
    'total_wilderness_area',
    'total_soil_type',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
]

> **All follow normal distribution**

In [None]:
# One KDE plot per continuous column, coloured by Cover_Type.
for x in cont_cols:
    try:
        sns.displot(data=df_train, x=x, hue="Cover_Type", kind="kde", fill=True)
    except Exception as exc:
        # Best effort: a column that cannot be plotted is skipped, but the
        # reason is reported instead of being silently swallowed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the cell.
        print(f"Skipping KDE plot for {x}: {exc!r}")

In [None]:
# Function to cap outliers in a numeric Series (values are clipped, rows are kept)
def remove_outliers(x,method):
    """Clip the values of ``x`` that fall outside limits derived from ``x`` itself.

    Parameters
    ----------
    x : pandas.Series
        Numeric values; the limits are computed from this same series.
    method : str
        ``'mean'``   - limits are mean -/+ 3 standard deviations.
        ``'median'`` - Tukey fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR.
        anything else - no capping.

    Returns
    -------
    numpy.ndarray
        For ``'mean'`` / ``'median'``: the values of ``x`` with everything
        above the upper limit replaced by the upper limit and everything
        below the lower limit replaced by the lower limit.
    pandas.Series
        For any other ``method``: ``x`` itself, unchanged.
    """
    if method == 'mean':
        center = x.mean()
        spread = x.std()
        upper_limit = center + (3*spread)
        lower_limit = center - (3*spread)
    elif method == 'median':
        # The fences are anchored on the quartiles and scaled by the
        # inter-quartile range, so they measure spread rather than the
        # magnitude of the quartile values themselves.
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        upper_limit = q3 + (1.5*iqr)
        lower_limit = q1 - (1.5*iqr)
    else:
        return x

    # Comparisons with NaN are False, so missing values pass through untouched.
    return np.where(x > upper_limit, upper_limit, np.where(x < lower_limit, lower_limit, x))

In [None]:
# Function to fit a scaler on a dataset
def data_scaler_fit(option,df):
    """Fit and return the scaler selected by ``option``.

    Parameters
    ----------
    option : int
        1 -> StandardScaler, 2 -> RobustScaler, 3 -> MinMaxScaler.
    df : array-like
        Data passed to the scaler's ``fit``.

    Returns
    -------
    The fitted scaler (the object returned by ``fit``).

    Raises
    ------
    ValueError
        If ``option`` is not 1, 2 or 3. Without this check the function
        would fail with an UnboundLocalError on the return statement.
    """
    if option == 1:
        scaler = StandardScaler()
    elif option == 2:
        scaler = RobustScaler()
    elif option == 3:
        scaler = MinMaxScaler()
    else:
        raise ValueError(
            f"Unknown scaler option {option!r}; expected 1 (StandardScaler), "
            "2 (RobustScaler) or 3 (MinMaxScaler)"
        )
    transformer = scaler.fit(df)
    return transformer

In [None]:
# Add the engineered features to the train frame, then to the test frame.
df_train, df_test = map(feature_transform, (df_train, df_test))

In [None]:
# Model inputs: every column except the row id, the target and a fold marker.
useful_features = [c for c in df_train.columns if c not in ("Id", "Cover_Type", "kfold")]

# Any input column not listed in cont_cols is treated as categorical.
cat_cols = [c for c in useful_features if c not in cont_cols]

# Option 2 of data_scaler_fit is RobustScaler; it is fitted on the
# continuous training columns.
transformer = data_scaler_fit(2,df_train[cont_cols])

# Test matrix: continuous columns capped at mean +/- 3 std (limits computed
# on the test data itself) and scaled, followed by the unscaled categorical
# columns. From here on df_test is a NumPy array, not a DataFrame.
df_test = np.hstack([
    transformer.transform(
        df_test[cont_cols].apply(lambda col: remove_outliers(col, 'mean'))
    ),
    df_test[cat_cols].to_numpy(),
])

In [None]:
# Renumber the rows 0..n-1, then separate the features from the target.
df_train = df_train.reset_index(drop=True)
X = df_train.drop(columns='Cover_Type')
y = df_train['Cover_Type'].copy()

In [None]:
# Per-fold test predictions (one array per fold) and out-of-fold predictions
# keyed by row Id.
final_test_predictions = []
final_valid_predictions = {}

scores = []


print("Training using Keras NN..")

batch_size = 1024
epochs = 100
verbose = 1

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# df_test is only read below (shape and predict), so one shared reference
# replaces a fresh copy per fold.
xtest = df_test


def _to_model_input(frame):
    """Cap the continuous columns of ``frame`` at mean +/- 3 std, scale them with
    the fitted ``transformer`` and append the unscaled categorical columns."""
    capped = frame[cont_cols].apply(lambda x: remove_outliers(x, 'mean'))
    return np.concatenate((transformer.transform(capped), frame[cat_cols].to_numpy()), axis=1)


for fold, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so both X and y are indexed by position.
    xtrain, ytrain = X.iloc[idx_train], y.iloc[idx_train]
    xvalid, yvalid = X.iloc[idx_valid], y.iloc[idx_valid]

    # Store IDs of validation Dataset (must happen before xvalid becomes an array)
    valid_ids = xvalid.Id.values.tolist()

    # Keep the original validation labels for scoring; yvalid is rebound to
    # the encoded labels below.
    true_valid = yvalid

    # Label encoding Y: map the class labels to 0..n_class-1 as required by
    # sparse_categorical_crossentropy.
    le = preprocessing.LabelEncoder().fit(ytrain)

    ytrain = le.transform(ytrain)
    yvalid = le.transform(yvalid)

    n_class = len(np.unique(ytrain))

    xtrain = _to_model_input(xtrain)
    xvalid = _to_model_input(xvalid)

    model = keras.models.Sequential([
            keras.layers.Flatten(input_shape=[xtest.shape[1],]),
            keras.layers.Dense(600, activation="selu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(400, activation="selu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(300, activation="selu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(100, activation="selu"),
            keras.layers.Dense(n_class, activation="softmax")
            ])

    # The model is compiled with metrics=['accuracy'], so the validation
    # metric is logged as 'val_accuracy'. Monitoring 'val_acc' would find no
    # such key and early stopping would never fire.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=20, verbose = verbose)

    reducelr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose = verbose)


    model.compile(loss="sparse_categorical_crossentropy",
                    optimizer="adam",
                    metrics=['accuracy'],
                 )

    # model.summary()

    model.fit(xtrain, ytrain, batch_size = batch_size, epochs=epochs,shuffle=True , validation_data=(xvalid, yvalid),callbacks=[callback,reducelr],verbose = verbose)

    # argmax over the softmax outputs gives the encoded class; decode it back
    # to the original Cover_Type labels.
    preds_valid = le.inverse_transform(np.argmax(model.predict(xvalid), axis = 1))

    test_preds = le.inverse_transform(np.argmax(model.predict(xtest), axis = 1))

    final_test_predictions.append(test_preds)

    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    acc_scr = accuracy_score(true_valid, preds_valid)


    print('_'*65)

    print(f"Fold {fold+1} || Accuracy : {acc_scr}")

    print('_'*65)

    print('\n')

    scores.append(acc_scr)

    gc.collect()

    keras.backend.clear_session()

# Summarise the cross-validation run.
print(f"Mean CV accuracy over {len(scores)} folds : {np.mean(scores)}")

In [None]:
# Majority vote across folds: one column per fold, most frequent label per row.
fold_votes = np.column_stack(final_test_predictions)
# Depending on the SciPy version, mode(...)[0] is shaped (n, 1) or (n,);
# ravel makes the assignment independent of that.
df_submission["Cover_Type"] = np.ravel(mode(fold_votes, axis=1)[0])
df_submission.columns = ["Id", "Cover_Type"]
df_submission.to_csv("submission.csv", index=False)