In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, InputLayer, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from tensorflow.keras import activations

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and TensorFlow global random number generators so repeated runs start from the same state.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive, so a
      column holding exactly -128..127 stays int8 instead of being widened).
    * Float columns are cast to float16, or else float32, when the column's
      min and max fit inside that type's finite range. Only the *range* is
      checked: float16 keeps roughly three significant decimal digits, so
      values may be rounded by the cast.
    * Every other dtype (bool, unsigned, category, datetime, pandas extension
      dtypes, ...) is left untouched, which also makes a second call on an
      already-reduced frame safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns have a meaningful
        # min/max range to test; anything else is skipped rather than being
        # pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        if col_type.kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        c_min = df[col].min()
        c_max = df[col].max()
        # An empty or all-NaN column yields NaN here; every comparison is then
        # False and the column keeps its dtype.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    return df

In [None]:
# Load the train and test CSVs of the tabular-playground-series-dec-2021 dataset from the Kaggle input directory.
df_train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
df_test =  pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")

In [None]:
df_train.info()

In [None]:
#df_train.describe()

In [None]:
# Keep the row identifiers of both splits; the 'Id' column itself is dropped
# from the frames in the next cell.
ID = df_train['Id']
TEST_ID = df_test['Id']

# Preview goes last: a notebook cell only renders its final expression.
df_train.head()

In [None]:
# Remove the 'Id' column from both frames so it is not used as a feature.
# NOTE: the drops are in place, so re-running this cell raises KeyError ('Id' is already gone).
df_train.drop(columns = 'Id', inplace=True)
df_test.drop(columns = 'Id', inplace=True)

This notebook helped me with the plotting: https://www.kaggle.com/vishwas21/tps-nov-21-eda-modeling-grn-vsn
Shout-out to @Gulshan Mishra for the Aspect and Hillshade feature engineering: https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering/notebook#Part-3:-Modelling-with-Neural-Network

In [None]:
def plot_columns(df, df_test=df_test, columns=None, cols = 1, plotting_type=None):
    """Draw one subplot per column on an 18x18 grid figure and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted (labelled 'Train data' in KDE mode).
    df_test : pandas.DataFrame
        Second frame overlaid in KDE mode. The default is the module-level
        ``df_test`` object as it existed when this function was defined.
    columns : iterable of column labels, optional
        Columns to plot. Defaults to every column of ``df``.
    cols : int
        Number of subplot columns in the grid.
    plotting_type : None or 'boxplot'
        ``None`` overlays KDE curves of ``df`` and ``df_test``;
        ``'boxplot'`` draws a box plot of ``df`` only. Any other value prints
        a message and returns without creating a figure.

    Returns
    -------
    None
    """
    # Validate before any figure is created so an invalid mode leaves no
    # half-built figure behind.
    if plotting_type is not None and plotting_type != 'boxplot':
        print("Stop sh*tting.")
        return

    if columns is None:
        columns = list(df.columns)

    n_cols = len(columns)
    n_rows = int(np.ceil(n_cols / float(cols)))
    fig = plt.figure(figsize=(18, 18), facecolor='#EAEAF2')

    for n, title in enumerate(columns):
        a = fig.add_subplot(n_rows, cols, n + 1)
        name = str(title)
        if plotting_type is None:
            sns.kdeplot(df[name], color='#58D68D', label='Train data')
            sns.kdeplot(df_test[name], color='#DE3163', label='Test data')
        else:
            sns.boxplot(y=df[name], color='#58D68D')
        a.set_ylabel('')
        a.set_xlabel(title, fontsize=8, fontweight='bold')

    plt.subplots_adjust(hspace=0.3, wspace=0)
    plt.show()
    
def useful_att(df):
    """Clean angle/shade columns and add engineered features to ``df`` in place.

    Steps, in order:

    1. ``Aspect`` values below 0 get 360 added, then values above 359 get 360
       subtracted (a single wrap in each direction).
    2. The three ``Hillshade_*`` columns are clamped to the range 0..255.
    3. New columns are added: Manhattan and Euclidean distance to hydrology,
       per-row sums of the soil and wilderness indicator columns, and the
       per-row mean and max-minus-min of the hillshade columns.

    Reads the module-level lists ``soil_features``, ``wilderness_features`` and
    ``features_Hillshade``; they must be defined before this is called. Expects
    the renamed columns ``x_dist_hydrlgy`` and ``y_dist_hydrlgy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    # Use .loc with a boolean mask: chained indexing (df[col][mask] += ...)
    # may write to a temporary copy and leave df unchanged.
    df.loc[df["Aspect"] < 0, "Aspect"] += 360
    df.loc[df["Aspect"] > 359, "Aspect"] -= 360

    for shade in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        df[shade] = df[shade].clip(lower=0, upper=255)

    # Manhattan distance to Hydrology
    df["mnhttn_dist_hydrlgy"] = np.abs(df["x_dist_hydrlgy"]) + np.abs(df["y_dist_hydrlgy"])
    # Euclidean distance to Hydrology
    df["ecldn_dist_hydrlgy"] = (df["x_dist_hydrlgy"]**2 + df["y_dist_hydrlgy"]**2)**0.5

    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Area_Count"] = df[wilderness_features].sum(axis=1)
    df["Hillshade_mean"] = df[features_Hillshade].mean(axis=1)
    df['amp_Hillshade'] = df[features_Hillshade].max(axis=1) - df[features_Hillshade].min(axis=1)
    
#     df['min_row'] = df.min(axis=1)
#     df['max_row'] = df.max(axis=1)
#     df['mean_row'] = df.mean(axis=1)
#     df['std_row'] = df.std(axis=1)

def isnull_values_sum(df):
    """Report whether ``df`` holds at least one missing value.

    Despite the name, the result is a boolean flag (total missing count > 0),
    not the count itself.
    """
    missing_mask = df.isna().to_numpy()
    missing_total = missing_mask.sum()
    return missing_total > 0

def num_unique(df):
    """Return the number of distinct non-null values in each column of ``df``.

    The result is a Series indexed by column label. ``DataFrame.nunique``
    computes this column by column, which avoids materialising a transposed
    copy of the whole frame just to count per row.
    """
    return df.nunique()

In [None]:
# Each flag is False when the corresponding frame contains no NaN (train first, then test).
print(isnull_values_sum(df_train), isnull_values_sum(df_test))
# Number of distinct values in every training column.
print(num_unique(df_train))

In [None]:
# Columns with fewer than two distinct values in the training data carry no
# signal; collect them once, then drop them with a single call per frame.
dropped_cols = [col for col in df_train.columns if df_train[col].nunique() < 2]
df_train.drop(columns=dropped_cols, inplace=True)

# drop() without inplace returns a new frame, so df_test itself stays intact
# and no separate copy is needed.
test = df_test.drop(columns=dropped_cols)

# Final expression so the notebook renders the remaining per-column cardinalities.
df_train.nunique()

In [None]:
LE = LabelEncoder()

# Build the modelling frame without the rows labelled Cover_Type == 5
# (presumably too rare to stratify across folds -- TODO confirm), then split
# off the encoded target.
is_kept = df_train['Cover_Type'] != 5
train = df_train.loc[is_kept].copy()
print(train['Cover_Type'].value_counts())

y = LE.fit_transform(train['Cover_Type'])
train = train.drop(columns='Cover_Type')

In [None]:
np.unique(y)

In [None]:
# Short aliases for the long distance column names; useful_att reads the
# hydrology columns under these new names, so the rename must happen first.
new_names = {
    "Horizontal_Distance_To_Hydrology": "x_dist_hydrlgy",
    "Vertical_Distance_To_Hydrology": "y_dist_hydrlgy",
    "Horizontal_Distance_To_Roadways": "x_dist_rdwys",
    "Horizontal_Distance_To_Fire_Points": "x_dist_firepts"
}

# Apply the same renaming to both frames so their columns stay aligned.
train.rename(new_names, axis=1, inplace=True)
test.rename(new_names, axis=1, inplace=True)

In [None]:
print(train.shape, test.shape)

In [None]:
# Column groups read as module-level globals by useful_att; they are selected by name prefix from the training frame.
soil_features = [x for x in train.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in train.columns if x.startswith("Wilderness_Area")]
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

In [None]:
# Clean Aspect/Hillshade and add the engineered columns to both frames (useful_att modifies its argument in place).
useful_att(train)
useful_att(test)

In [None]:
# Heuristic: a column with fewer than 10 distinct training values is treated as categorical.
cat_features = [col for col in train.columns if train[col].nunique() < 10]
cat_features

In [None]:
# Every remaining column is treated as numeric.
num_features = [col for col in train.columns if col not in cat_features]
num_features

In [None]:
# No NaNs were found above, but impute defensively anyway.
# Statistics come from the training frame only and are applied to both frames:
# categorical columns get the per-column mode, numeric columns the per-column mean.
modes = train[cat_features].mode().iloc[0]
means = train[num_features].mean(axis=0)
train[cat_features] = train[cat_features].fillna(modes)
test[cat_features] = test[cat_features].fillna(modes)
# `means` is indexed by the numeric columns, so fillna aligns on column label.
# (`modes` only has categorical labels and would fill nothing here.)
train[num_features] = train[num_features].fillna(means)
test[num_features] = test[num_features].fillna(means)

In [None]:
# Downcast column dtypes of both frames to shrink their memory footprint.
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

In [None]:
#plot_columns(df=train, df_test=test, columns=num_features, cols=4)

In [None]:
#plot_columns(df=train, df_test=test, columns=num_features, cols=4, plotting_type='boxplot')

In [None]:
# Build a NEW list: plain assignment would alias cat_features, so extending it
# would also grow cat_features and append duplicates on every re-run of the cell.
scaled_features = cat_features + ['Aspect', 'ecldn_dist_hydrlgy']
scaled_features

In [None]:
# Working copies that will receive the scaled values; train/test keep the unscaled data.
X = train.copy()
X_test = test.copy()

# Train and test rows of the scaled_features columns stacked together.
# NOTE: Z is only referenced by the commented-out StandardScaler fit in the next cell.
Z = pd.concat([train[scaled_features], test[scaled_features]],axis=0)
Z.head()

In [None]:
# The StandardScaler is instantiated but never fitted or applied: the three steps below are disabled.
SS = StandardScaler()
# SS.fit(Z)
# X[scaled_features] = SS.transform(train[scaled_features])
# X_test[scaled_features] = SS.transform(test[scaled_features])

In [None]:
# Columns of train that are not listed in scaled_features (displayed for inspection).
num_scaled_features = [col for col in train.columns if col not in scaled_features]
num_scaled_features

In [None]:
# Continuous columns (raw and engineered) that are passed to the RobustScaler in the next cell.
cols = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts",
    
    "Soil_Count","Wilderness_Area_Count","Hillshade_mean","amp_Hillshade"
]
# Train and test rows stacked, so the scaler statistics are computed over both splits.
U = pd.concat([train[cols], test[cols]],axis=0)
U.head()

In [None]:

# Fit the scaler on the combined train+test rows (U), then write the scaled
# values into the working copies X / X_test. The transform reads from the
# unscaled train/test frames, so re-running this cell does not scale twice.
RS = RobustScaler()
RS.fit(U)
X[cols] = RS.transform(train[cols])
X_test[cols] = RS.transform(test[cols])

In [None]:
X.shape, y.shape, X_test.shape

In [None]:
# Replace the DataFrames with plain NumPy arrays; column labels are lost from here on.
# NOTE: the names are rebound, so re-running this cell fails (ndarray has no to_numpy).
X = X.to_numpy()
X_test = X_test.to_numpy()

In [None]:
# Drop the names of the intermediate frames and statistics that are no longer needed; only X, y and X_test are used below.
del df_train, df_test, train, test, modes, means

# Modeling

In [None]:
# Training configuration shared by the cells below.
input_shape = X.shape[1:]  # per-sample feature shape; the model builders read X.shape[-1] directly instead
num_classes = len(LE.classes_)  # number of distinct labels the LabelEncoder was fitted on
epochs = 200  # maximum epochs per fit call
batch_size = 2048
folds = 20  # passed as n_splits to cross_validate_model
print(input_shape, num_classes)

In [None]:
def build_model2():
    """Build an uncompiled Sequential classifier with batch-normalised SELU blocks.

    Architecture: input -> four blocks of Dense(size, lecun_normal) ->
    BatchNormalization -> SELU with sizes 300, 200, 100, 50 -> softmax output.

    Reads the module-level ``X`` (for the feature count) and ``num_classes``.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The model, not yet compiled.
    """
    n_features = X.shape[-1]
    model = Sequential()
    # The shape must be a tuple; "(n)" without a trailing comma is just an int.
    model.add(InputLayer(input_shape=(n_features,)))
    for size in [300, 200, 100, 50]:
        model.add(Dense(size, kernel_initializer="lecun_normal"))
        model.add(BatchNormalization())
        model.add(Activation(activations.selu))

    model.add(Dense(num_classes, activation='softmax'))
    return model

In [None]:
def build_model():
    """Build an uncompiled Sequential classifier of SELU dense layers with dropout.

    Architecture: input -> Dense(1024) -> Dropout(0.3) -> Dense(512) ->
    Dropout(0.2) -> Dense(256) -> Dropout(0.1) -> Dense(128) -> softmax output.
    The three wide layers use the ``lecun_normal`` initializer; all hidden
    layers use the SELU activation.

    Reads the module-level ``X`` (for the feature count) and ``num_classes``.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The model, not yet compiled.
    """
    n_features = X.shape[-1]
    model = Sequential()
    # The shape must be a tuple; "(n)" without a trailing comma is just an int.
    model.add(InputLayer(input_shape=(n_features,)))
    for units, drop_perc in zip([1024, 512, 256], [0.3, 0.2, 0.1]):
        model.add(Dense(units, activation='selu', kernel_initializer="lecun_normal"))
        model.add(Dropout(drop_perc))
    model.add(Dense(128, activation='selu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [None]:
# Instantiate the dropout/SELU network defined by build_model.
model = build_model()

# Halve the learning rate when validation accuracy has not improved for 5 epochs.
RLR = ReduceLROnPlateau(
    monitor="val_accuracy",
    factor=0.5,
    patience=5
)
# Stop after 20 epochs without validation-accuracy improvement and roll back to the best weights.
ES = EarlyStopping(
    monitor="val_accuracy",
    patience=20,
    restore_best_weights=True
)

callbacks = [RLR, ES]

# Sparse variant: the targets in y are integer class indices from the LabelEncoder, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
opt = tf.keras.optimizers.Adam()

model.compile(
    optimizer=opt,
    loss = loss,
    metrics = ["accuracy"])
model.summary()

In [None]:
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
def cross_validate_model(n_splits=5):
    """Run stratified K-fold training and average the test-set probabilities.

    A fresh model is built and compiled for every fold. Fitting one shared
    model across folds would carry weights and optimizer state (including a
    reduced learning rate) from fold to fold, so later folds would be validated
    on rows the network had already trained on and the scores would be
    optimistically biased.

    Reads the module-level ``X``, ``y``, ``X_test``, ``num_classes``,
    ``batch_size``, ``epochs``, ``callbacks`` and ``build_model``. The
    module-level ``model`` is not trained by this function.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    scores : list of float
        Validation accuracy of each fold, in fold order.
    y_test : numpy.ndarray
        Array of shape ``(X_test.shape[0], num_classes)`` holding the mean of
        the per-fold predicted class probabilities for the test set.
    """
    SKFold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    y_test = np.zeros((X_test.shape[0], num_classes))

    for index, (train_index, val_index) in enumerate(SKFold.split(X, y)):
        print(f"Fold {index+1} out of {n_splits}")
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # New weights and a new optimizer per fold keep the folds independent.
        fold_model = build_model()
        fold_model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=["accuracy"],
        )

        fold_model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(X_val, y_val),
            verbose=False
        )

        preds = fold_model.predict(X_val)
        y_pred = np.argmax(preds, axis=1)
        score = accuracy_score(y_val, y_pred)
        print(f"Score: {score}")
        scores.append(score)

        # Accumulate probabilities; they are averaged over the folds on return.
        y_test += fold_model.predict(X_test)

    return scores, y_test/n_splits

In [None]:
# Run the cross-validation with `folds` splits; y_test holds the fold-averaged test probabilities.
scores, y_test = cross_validate_model(n_splits=folds)

In [None]:
print(np.mean(scores))

In [None]:
y_test

In [None]:
test_preds2 = y_test

In [None]:
# Pick the most probable class per test row and map the encoded index back to the original Cover_Type label.
test_preds = LE.inverse_transform(np.argmax(y_test, axis=1))
sample_submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv -- confirm (TEST_ID is not used here).
sample_submission['Cover_Type'] = test_preds
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv("submission.csv", index=False)