In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",100)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.preprocessing import StandardScaler, QuantileTransformer, RobustScaler, PowerTransformer, MinMaxScaler

#plt.rcParams['figure.dpi'] = 600
sns.set(rc={'figure.figsize':(6,6)})
import gc

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization, AlphaDropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K
import keras_tuner as kt
from kerastuner import BayesianOptimization
import os
from kaggle_datasets import KaggleDatasets

In [None]:
import os
# Ask TensorFlow's native (C++) logger to stay quiet; '2' is the level string it receives.
# NOTE(review): tensorflow is already imported in an earlier cell. This variable is
# normally read when the TF runtime loads, so setting it here may have no effect —
# confirm, and consider moving it above `import tensorflow`.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
def set_all_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Value used for all generators.

    Returns
    -------
    int
        The same ``seed``, so the caller can keep it
        (``seed = set_all_seeds(42)``). Previously the function returned
        ``None``, which left ``seed`` unusable.
    """
    random.seed(seed)  # python
    np.random.seed(seed)  # numpy
    tf.random.set_seed(seed)  # tf global seed
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this only
    # influences child processes, not the hashing of the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Seed set to: {seed}")
    return seed


seed = set_all_seeds(42)

In [None]:
# Read the competition tables; the "Id" column becomes the index of each frame.
train = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/train.csv",
    index_col="Id",
)
test = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/test.csv",
    index_col="Id",
)
gc.collect()

In [None]:
# Share of each Cover_Type class in the training set, in percent.
train["Cover_Type"].value_counts() / len(train) * 100

In [None]:
# Keep every row except those labelled Cover_Type 4 or 5.
# reset_index() without drop=True turns the former "Id" index into a regular column.
train = train[~train["Cover_Type"].isin([4, 5])].reset_index()
gc.collect()

In [None]:
# Indicator columns: every column whose name begins with "W" or "Soil".
cat_cols = [col for col in train.columns if str(col).startswith(("W", "Soil"))]
# Remaining columns, excluding the target, are treated as numeric features.
num_cols = [
    col for col in train.columns
    if col != "Cover_Type" and col not in cat_cols
]
gc.collect()

In [None]:
# Add two combined distance-to-hydrology features to both frames.
for _frame in (train, test):
    _horiz = _frame["Horizontal_Distance_To_Hydrology"]
    _vert = _frame["Vertical_Distance_To_Hydrology"]

    # Manhattan distance: |horizontal| + |vertical|
    _frame["mnhttn_dist_hydrlgy"] = _horiz.abs() + _vert.abs()

    # Euclidean distance: sqrt(horizontal^2 + vertical^2)
    _frame["ecldn_dist_hydrlgy"] = (_horiz**2 + _vert**2) ** 0.5

In [None]:
# Force every Hillshade* column into the closed range [0, 255] in both frames.
hillshades = [col for col in train.columns if col.startswith('Hillshade')]
for _frame in (train, test):
    _frame[hillshades] = _frame[hillshades].clip(lower=0, upper=255)
gc.collect()

In [None]:
# Aspect is an angle in degrees: wrap it into [0, 360) so that, for example,
# -10 becomes 350 and 370 becomes 10.
# The previous `360 - Aspect` step sent -10 to 370 and then to 10, i.e. it
# mirrored negative angles instead of wrapping them. It also relied on chained
# assignment (`df[col][mask] = ...`), which is not guaranteed to write back to
# the frame. A whole-column modulo avoids both problems.
train["Aspect"] = train["Aspect"] % 360
test["Aspect"] = test["Aspect"] % 360
gc.collect()

In [None]:
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

# Row-wise mean and amplitude (max - min) over the three hillshade readings.
for _frame in (train, test):
    _shade = _frame[features_Hillshade]
    _frame["Hillshade_mean"] = _shade.mean(axis=1)
    _frame['amp_Hillshade'] = _shade.max(axis=1) - _shade.min(axis=1)

In [None]:
# Column groups, selected by name prefix on the training frame.
soil_features = [col for col in train.columns if col.startswith("Soil")]
wilderness_features = [col for col in train.columns if col.startswith("Wild")]

for _frame in (train, test):
    # Soil type count: row-wise sum of the Soil* columns
    _frame["soil_type_count"] = _frame[soil_features].sum(axis=1)
    # Wilderness area count: row-wise sum of the Wild* columns
    _frame["wilderness_area_count"] = _frame[wilderness_features].sum(axis=1)
gc.collect()

In [None]:
# Remove the "Id" column from the training frame in place.
del train["Id"]

In [None]:
# Register the engineered features as numeric columns and drop "Id" from the list.
num_cols.extend([
    "soil_type_count",
    "wilderness_area_count",
    "Hillshade_mean",
    "amp_Hillshade",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
])
num_cols.remove("Id")
gc.collect()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the Cover_Type labels as integers; `encoder` is kept so that
# predictions can be mapped back with inverse_transform later on.
encoder = LabelEncoder()
y = encoder.fit_transform(train["Cover_Type"].copy())
gc.collect()

In [None]:
# Drop Soil_Type7 and Soil_Type15 from both frames, in place.
for _frame in (train, test):
    _frame.drop(columns=["Soil_Type7", "Soil_Type15"], inplace=True)

In [None]:
# Feature matrix: every training column except the target.
X = train.drop(columns=["Cover_Type"])

In [None]:
# Shapes of the feature matrix, the encoded target and the test frame.
tuple(obj.shape for obj in (X, y, test))

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the returned count is displayed.
gc.collect()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to narrower dtypes to save memory.

    * ``int16``/``int32``/``int64`` columns are stored as the narrowest of
      ``int16`` or ``int32`` that holds their observed min and max. Bounds are
      inclusive, so a column containing exactly 32767 or -32768 still fits
      ``int16``.
    * ``float64`` columns whose observed range fits ``float32`` are stored as
      ``float32``.
    * A column is never widened: ``float16`` and ``float32`` columns, and
      integer columns that already use the narrowest fitting type, are left
      as they are. Other dtypes are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)

        if type_name in ('int16', 'int32', 'int64'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Try the candidates from narrowest to widest and stop at the first fit.
            for candidate in (np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    # Only cast when it actually shrinks the column.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif type_name == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # An all-NaN or empty column yields NaN bounds; the comparison is then
            # false and the column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero-sized frame so the percentage is always defined.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Shrink both frames; verbose defaults to True, so the saving is printed for each.
X = reduce_mem_usage(X)
test = reduce_mem_usage(test)
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified, shuffled 90/10 split with a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [None]:
# Fit the scaler on the training part only, then apply it to both parts.
scaler = RobustScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])
# The test frame is not scaled in this cell; the line below is left disabled.
#test[num_cols] = scaler.transform(test[num_cols])

In [None]:
# Reset the Keras backend session before a model is built, then collect garbage.
K.clear_session()
gc.collect()

In [None]:
# Halve the learning rate after 4 epochs without a val_accuracy improvement.
reduce_lr = ReduceLROnPlateau(
    monitor="val_accuracy",
    mode='max',
    factor=0.5,
    patience=4,
    verbose=0,
)
# Stop after 10 epochs without a val_accuracy improvement and restore the best weights.
es = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    restore_best_weights=True,
)

def my_model(X):
    """Build and compile the dense classifier used throughout the notebook.

    Architecture: Dense(256) -> BatchNorm -> Dropout(0.082208), followed by
    Dense/BatchNorm blocks of 128, 64 and 32 units (all ReLU), and a softmax
    output with one unit per class known to the module-level ``encoder``.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Only ``X.shape[1]`` (the number of features) is used, to size the input.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with Adam, sparse categorical cross-entropy and accuracy.
    """
    model = Sequential()

    # First block is the only one that carries dropout.
    model.add(Dense(256, activation="relu", input_dim=X.shape[1]))
    model.add(BatchNormalization())
    model.add(Dropout(0.082208))

    for units in (128, 64, 32):
        model.add(Dense(units, activation="relu"))
        model.add(BatchNormalization())

    model.add(Dense(len(encoder.classes_), activation="softmax"))

    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # optimizers no longer accept.
    model.compile(optimizer=Adam(learning_rate=0.026257),
                  loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])

    return model

In [None]:
gc.collect()
# Train on the hold-out split; the whole validation set is scored as one batch.
model = my_model(X_train)
history = model.fit(
    X_train,
    y_train,
    epochs=60,
    batch_size=2048,
    shuffle=True,
    validation_data=(X_valid, y_valid),
    validation_batch_size=len(X_valid),
    callbacks=[es, reduce_lr],
)

In [None]:
# Score the trained model on the validation split and report accuracy in percent.
loss, acc = model.evaluate(x=X_valid, y=y_valid)
print("Accuracy", acc * 100, "%")

In [None]:
# Collect the per-epoch loss/accuracy curves into one frame with train_*/val_* columns.
df_eval = (
    pd.DataFrame(history.history)[["loss", "val_loss", "accuracy", "val_accuracy"]]
    .rename(columns={"loss": "train_loss", "accuracy": "train_accuracy"})
)

# Training vs. validation loss per epoch.
plt.plot(df_eval.loc[:, ["train_loss", "val_loss"]], label=["Train", "Valid"])
plt.legend()
plt.title("Loss")

In [None]:
gc.collect()
# Training vs. validation accuracy per epoch.
plt.plot(df_eval.loc[:, ["train_accuracy", "val_accuracy"]], label=["Train", "Valid"])
plt.title("Accuracy")
plt.legend()
In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class index per validation row (argmax over the softmax outputs).
pred_valid = np.argmax(model.predict(X_valid, batch_size=1024), axis=1)

# Reuse `pred_valid` instead of running the network a second time, and pin the
# label set so the matrix always has one row/column per encoder class, even if
# a class is never observed or never predicted.
cm = confusion_matrix(y_valid, pred_valid, labels=np.arange(len(encoder.classes_)))

# Label rows (actual) and columns (predicted) with the original Cover_Type values.
cm = pd.DataFrame(cm, columns=encoder.classes_, index=encoder.classes_)
gc.collect()

In [None]:
gc.collect()
# Annotated heatmap of the confusion matrix (integer counts, no colour bar).
plt.figure(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt='.0f', cbar=False)
plt.gca().set(
    title='Confusion Matrix',
    ylabel='Actual Values',
    xlabel='Predicted Values',
)
plt.show()

In [None]:
# Reset the Keras backend session before the cross-validation run; the `del model`
# line is left disabled, so the `model` name itself stays bound.
#del model
K.clear_session()
gc.collect()

In [None]:
from sklearn.model_selection import StratifiedKFold

gc.collect()

# 10-fold stratified cross-validation. For every fold a fresh scaler and a fresh
# model are fitted on the training part, and the model's softmax outputs on the
# test set are stored so they can be ensembled afterwards.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores = {fold: None for fold in range(cv.n_splits)}  # fold -> Keras history dict
test_preds = []                                       # one (n_test, n_classes) array per fold

for fold, (idx_train, idx_valid) in enumerate(cv.split(X, y)):

    # Copy the slices so the scaled values are written into independent frames
    # rather than into objects derived from X.
    X_train, X_valid = X.iloc[idx_train].copy(), X.iloc[idx_valid].copy()
    y_train, y_valid = y[idx_train], y[idx_valid]

    # Fit the scaler on this fold's training part only, then apply it everywhere.
    scaler = RobustScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    test_sc = test.copy()
    test_sc[num_cols] = scaler.transform(test_sc[num_cols])

    model = my_model(X_train)
    history = model.fit(X_train, y_train,
                        validation_data=(X_valid, y_valid),
                        validation_batch_size=len(X_valid),
                        epochs=60, batch_size=2048,
                        callbacks=[es, reduce_lr],
                        shuffle=True,
                        verbose=0)

    scores[fold] = history.history

    # The tracked metric is accuracy (see `metrics=['accuracy']`), not AUC.
    print(f"Fold {fold} -- Max Training Accuracy: {np.max(scores[fold]['accuracy']):.5f} -- Max Validation Accuracy: {np.max(scores[fold]['val_accuracy']):.5f}")

    test_preds.append(model.predict(test_sc, batch_size=2048))

    # Release this fold's model and backend state before building the next one,
    # so memory does not grow with the number of folds.
    del model, test_sc
    K.clear_session()
    gc.collect()

print('**'*20)

overall_train_acc = [np.max(scores[fold]['accuracy']) for fold in range(cv.n_splits)]
overall_valid_acc = [np.max(scores[fold]['val_accuracy']) for fold in range(cv.n_splits)]
print(f"Overall Mean Train Accuracy: {np.mean(overall_train_acc)} -- Overall Mean Validation Accuracy: {np.mean(overall_valid_acc)}")

gc.collect()

In [None]:
gc.collect()

# One figure per fold: loss curves on the left, accuracy curves on the right.
# Iterating over `scores` keeps the fold count in sync with the CV configuration
# instead of repeating a hard-coded 10.
for fold, fold_history in scores.items():
    df_eval = pd.DataFrame({'train_loss': fold_history['loss'],
                            'val_loss': fold_history['val_loss'],
                            'train_accuracy': fold_history['accuracy'],
                            'val_accuracy': fold_history['val_accuracy']})

    # Draw on the axes returned by subplots() rather than re-selecting them
    # through the pyplot state machine.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, tight_layout=True, figsize=(10, 4))
    fig.suptitle('Fold : ' + str(fold), fontsize=14)

    ax_loss.plot(df_eval[["train_loss", "val_loss"]], label=["Train", "Valid"])
    ax_loss.legend()
    ax_loss.set_title("Loss")

    ax_acc.plot(df_eval[["train_accuracy", "val_accuracy"]], label=["Train", "Valid"])
    ax_acc.set_title("Accuracy")
    ax_acc.legend()

In [None]:
gc.collect()
sample_submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

# Ensemble: add up the per-fold softmax outputs, take the highest-scoring class
# per row, and translate the encoded class back to the original Cover_Type value.
_summed_preds = sum(test_preds)
_best_class = np.argmax(_summed_preds, axis=1)
sample_submission['Cover_Type'] = encoder.inverse_transform(_best_class)

sample_submission.to_csv('./nn_model_not_normalized_robust_10fold.csv', index=False)

In [None]:
gc.collect()
# Bar chart of how often each Cover_Type is predicted, with the count on every bar.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x=sample_submission["Cover_Type"])
ax.set_title("Predictions")
ax.set_xlabel("Cover Type")
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
gc.collect()
# Preview the first five rows of the submission (head() defaults to 5).
sample_submission.head()