# TPC DEC 2021



*   [Tabular Playground Series - Dec 2021](https://www.kaggle.com/c/tabular-playground-series-dec-2021)
*   [Forest Cover Type Prediction](https://www.kaggle.com/c/forest-cover-type-prediction/data)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# removed the old names; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set(font_scale=2.5)
import missingno as msno

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from sklearn.preprocessing import RobustScaler

In [None]:
# Competition input directory; `path` is reused further down to read the sample submission.
path = "../input/tabular-playground-series-dec-2021/"
train, test = (
    pd.read_csv(os.path.join(path, file_name))
    for file_name in ("train.csv", "test.csv")
)

# Check Dataset


*   Elevation - Elevation in meters 고도
*   Aspect - Aspect in degrees azimuth (compass direction the slope faces, measured clockwise from north) 자신이 있는 곳은 수평으로 생각할때, 북극점으로부터 해당 지점까지 시계방향 각도
*   Slope - Slope in degrees 경사
*   Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features
*   Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features
*   Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway
*   Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
*   Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
*   Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice
*   Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points
*   Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation
*   Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation
*   Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation

In [None]:
print(train.shape)
train.head()

In [None]:
train.describe()

## Null Data Check

The training data contains no null values.

In [None]:
n_rows = train.shape[0]
# Count nulls for every column in a single pass instead of once per use.
null_counts = train.isnull().sum()

print(f"# of train data : {n_rows}")
print(f"# of train features : {train.shape[1] - 1}")
print("")
print('='*15, " >> Null Data << ", '='*15)

row_template = 'column: {:>35}\t {:>10d} of {:<10d} ( Percent of Null value: {:.2f}% )'
for col in train.columns:
    n_null = null_counts[col]
    print(row_template.format(col, n_null, n_rows, 100 * (n_null / n_rows)))

# Columns that contain at least one null value.
null_feature = [col for col in train.columns if null_counts[col] != 0]

if null_feature:
    print("")
    print('='*15, " >> Warning << ", '='*15)
    print("NULL Feature : ", null_feature)

## Unique Data Check

Two columns hold only a single unique value (i.e. they are constant): 'Soil_Type7', 'Soil_Type15'

dataset is already one-hot encoded: Wilderness_Area and Soil_Type

In [None]:
print('='*15, " >> Unique Data << ", '='*15)

# dropna=False counts NaN as a value, matching len(Series.unique()).
distinct_counts = {col: train[col].nunique(dropna=False) for col in train.columns}
for col, n_distinct in distinct_counts.items():
    print('column: {:>35}\t {:>10d}'.format(col, n_distinct))

# Columns with exactly one distinct value carry no information.
unique_col = [col for col, n_distinct in distinct_counts.items() if n_distinct == 1]

if unique_col:
    print("")
    print('='*15, " >> Warning << ", '='*15)
    print("Unique Feature : ", unique_col)

In [None]:
# move to Function "EngineerFeatures"
# print(f"train shape : Formerly, {train.shape}", end=" ")
# train = train.drop(unique_col, axis=1)
# print("-"*10, ">>", f" train shape : Now, {train.shape}", end=" ")

## Check Target Label

The target is heavily imbalanced, especially Cover_Type 4 and 5.

In [None]:
# Two views of the class balance of the target: share per class (pie) and raw counts (bars).
f, ax = plt.subplots(1, 2, figsize=(20, 12))

train['Cover_Type'].value_counts().plot.pie(autopct='%1.4f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Cover_Type', fontsize=16)
ax[0].set_ylabel('')
ax[0].tick_params(axis='both', labelsize=14)

ax[1].set_title('Count plot - Cover_Type', fontsize=16)
# Pass the column as the `x` keyword: recent seaborn releases no longer accept
# the plotted variable as a positional argument.
sns.countplot(x='Cover_Type', data=train, ax=ax[1])
ax[1].set_ylabel('count', fontsize = 14)
ax[1].set_xlabel('Cover_Type', fontsize = 14)
ax[1].tick_params(axis='both', labelsize=14)

plt.show()

In [None]:
# Print, for every target class in ascending order, its row count and share of the training set.
print('='*15, " >> Target Data << ", '='*15)
num_target = train['Cover_Type'].value_counts()
targets = np.sort(train['Cover_Type'].unique())
n_rows = train.shape[0]
for target in targets:
    n_target = num_target[target]
    # The percentage is the class share of all rows (it was mislabelled as a null-value percentage).
    msg = 'target: {:>3}\t {:>7d} of {:<10d} ( Percent of total: {:.2f}% )'.format(
        target, n_target, n_rows, 100 * (n_target / n_rows))
    print(msg)

# EDA

# Feature Engineering

[Sum of Soil_Type and Wilderness_Area](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/292823)

In [None]:
def EngineerFunction(df, is_train=True):
    """Clean a raw competition frame and add engineered features.

    Steps:
      * drop ``Id`` and the constant columns ``Soil_Type7`` / ``Soil_Type15``;
      * add ``Pythagorian_Distance_To_Hydrology``, the Euclidean distance built
        from the horizontal and vertical hydrology distances;
      * wrap ``Aspect`` (degrees) into ``[0, 360)``;
      * clip the three ``Hillshade_*`` columns to ``[0, 255]``;
      * for training data only, remove the rows whose ``Cover_Type`` is 5.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test frame. It is not modified.
    is_train : bool, default True
        When true, ``df`` must contain ``Cover_Type`` and class-5 rows are removed.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame. Passing an already-processed frame back in
        returns an equal frame, so the calling cell can be re-run safely.
    """
    # drop() returns a new frame, so the caller's data is never mutated.
    # errors="ignore" makes the call a no-op for columns that are already gone.
    # Soil_Type7 / Soil_Type15 are removed because they hold a single unique value.
    df = df.drop(columns=["Id", "Soil_Type7", "Soil_Type15"], errors="ignore")

    # Euclidean rather than Manhattan distance, because this is a physical (environmental) distance.
    df["Pythagorian_Distance_To_Hydrology"] = np.hypot(
        df["Horizontal_Distance_To_Hydrology"], df["Vertical_Distance_To_Hydrology"]
    )

    # Aspect is an angle, so -4 is the same direction as 356 and 364 the same as 4.
    # The modulo wraps any value into [0, 360) and replaces the chained-indexing
    # assignment, which warns on older pandas and does nothing under copy-on-write.
    df["Aspect"] = df["Aspect"] % 360

    # Hillshade is an index defined on 0..255; clamp out-of-range values.
    hillshade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    df[hillshade_cols] = df[hillshade_cols].clip(lower=0, upper=255)

    if is_train:
        # Cover_Type 5 has only a single sample, so those rows are dropped.
        df = df.loc[df["Cover_Type"] != 5]

    return df

# Same preprocessing for both frames; only the training call removes Cover_Type 5 rows.
train = EngineerFunction(train, is_train=True)
test = EngineerFunction(test, is_train=False)

TODO: how should the rare targets "Cover_Type_4" and "Cover_Type_5" be handled?

In [None]:
# target_cols = []
# for n in range(1, 8):
#   target_cols.append(f"Cover_Type_{n}")


# X_train = train.drop(target_cols, axis=1)
# y_train = train[target_cols]
# Separate the label column from the feature columns of the training frame.
y_train = train["Cover_Type"]
X_train = train.drop(columns=["Cover_Type"])

# The test features are the processed test frame itself (same object, not a copy).
X_test = test

In [None]:
X_train

In [None]:
X_train.describe()

In [None]:
y_train

In [None]:
X_test

In [None]:
# Robust Scaler: fitted on the training features only, then reused unchanged for the test features.
RS = RobustScaler()
X_train = RS.fit_transform(X_train)
X_test = RS.transform(X_test)

# Build Model

In [None]:
import tensorflow as tf

from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split #StratifiedKFold

In [None]:
# Width of the network input: number of columns in the training feature matrix.
feat_dim = X_train.shape[1]
# Number of units in the softmax output layer of build_model.
num_targets = 7 #y_train.shape[1]
# Dropout rate applied after every hidden Dense layer in build_model.
dropout_rate = 0.1

def build_model():
    """Assemble the uncompiled Keras classifier.

    The network has two stages (128 units, then 64 units). Each stage is two
    Dense(gelu) + Dropout pairs; the outputs of both pairs are added together
    and passed through LayerNormalization. A softmax layer with ``num_targets``
    units produces the class probabilities.

    Reads the module-level ``feat_dim``, ``num_targets`` and ``dropout_rate``.

    Returns
    -------
    tf.keras.Model
        A model mapping ``(batch, feat_dim)`` inputs to ``(batch, num_targets)`` outputs.
    """
    layers = tf.keras.layers

    def dense_dropout(tensor, units):
        # One hidden unit of the network: Dense with gelu, followed by Dropout.
        hidden = layers.Dense(units, activation='gelu')(tensor)
        return layers.Dropout(dropout_rate)(hidden)

    net_input = tf.keras.Input(shape=(feat_dim,))

    # Stage 1 (128 units): sum of both pairs, then layer norm.
    first = dense_dropout(net_input, 128)
    second = dense_dropout(first, 128)
    stage_out = layers.LayerNormalization()(first + second)

    # Stage 2 (64 units): same pattern on the narrower representation.
    third = dense_dropout(stage_out, 64)
    fourth = dense_dropout(third, 64)
    stage_out = layers.LayerNormalization()(third + fourth)

    class_probs = layers.Dense(num_targets, activation='softmax')(stage_out)

    return tf.keras.Model(inputs=net_input, outputs=class_probs)

In [None]:
# Build an instance only to print the layer / parameter summary;
# the training cell calls build_model() again before compiling.
model = build_model()
model.summary()

In [None]:
EPOCH = 50
BATCH_SIZE = 2**13

# Stratified 90/10 hold-out split with a fixed seed so the validation set is reproducible.
X, X_valid, y, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=21, shuffle=True, stratify=y_train)

# One-hot encode against the full label set 1..num_targets so that column k always
# means label k + 1. reindex() inserts an all-zero column for any label missing from
# the split (Cover_Type 5 rows are removed during feature engineering). The float32
# cast gives a single numeric dtype: get_dummies returns bool columns on pandas >= 2.0,
# and adding an int column by hand would leave a mixed-dtype frame.
class_labels = list(range(1, num_targets + 1))
y = pd.get_dummies(y).reindex(columns=class_labels, fill_value=0).astype("float32")
y_valid = pd.get_dummies(y_valid).reindex(columns=class_labels, fill_value=0).astype("float32")

model = build_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics = ['categorical_accuracy'])

# Resume from an earlier run when a checkpoint directory is already present.
save_path = './'
checkpoint_folderpath = save_path + "weights/"
checkpoint_filepath = save_path + "weights/weights"
if os.path.isdir(checkpoint_folderpath):
    print("Loading Weights")
    model.load_weights(checkpoint_filepath)

# Keep on disk only the weights of the epoch with the best validation accuracy.
sv = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True,
        save_weights_only=True, mode='max', save_freq='epoch', options=None)
# Stop after 5 epochs without improvement. restore_best_weights=True rolls the
# in-memory model back to its best epoch, so the prediction cells do not use the
# weights of the last (worse) epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_categorical_accuracy', mode='max',
                                              patience=5, restore_best_weights=True)

history = model.fit(X, y, verbose=1,
                          validation_data=(X_valid, y_valid),
                          epochs=EPOCH, batch_size=BATCH_SIZE, callbacks=[sv, early_stop])

# Prediction

In [None]:
# Template frame into which the predicted Cover_Type values are written below.
submission = pd.read_csv(os.path.join(path, 'sample_submission.csv'))
submission

In [None]:
# Softmax outputs for the scaled test features: one row per sample, one column per output unit.
predictions = model.predict(X_test, verbose=1, batch_size=BATCH_SIZE)

In [None]:
print(predictions.shape)
predictions

In [None]:
# Most probable class per row, computed in one vectorised call instead of a Python
# loop over every prediction. Output column k was trained as label k + 1, so the
# 0-based argmax is shifted back to the 1-based Cover_Type labels.
pred = np.argmax(predictions, axis=1) + 1

In [None]:
len(pred)

In [None]:
submission["Cover_Type"] = pred

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index=False)