### Notes
* There was leakage in the data.
    - This is fixed. The leakage was in the cross-validation part; fixing it really boosted the model's performance.
* Hyperparameter optimization is needed, also n_estimators is too small. 
* Maybe add bayesian search for hyperparameters, especially max_depth.
* Make the modelling part a function, and use it to make a baseline before all the feature engineering. Then use this baseline to assess each of the newly added features. Also use it to compare different models such as lightgbm, catboost or nn.
* The number of folds was too low. It was 5; I raised it to 10. Raising it to 20 may make sense, but there isn't enough RAM, so maybe ask someone else to run the code. Going from 10 to 20 folds means each fold trains on about 95% of the data instead of 90% (roughly 5% more), which should make the model less prone to overfitting.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#for modelling
from xgboost import XGBClassifier

#for preprocessing and model selection
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import mode

In [None]:
from matplotlib import ticker
import time
import warnings
# Show every row and column when a frame is displayed, and render floats in
# fixed-point notation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = '{:f}'.format
# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Load the competition data and the sample submission template.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")
submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

# The row identifier is removed from both frames.
train = train.drop(columns=["Id"])
test = test.drop(columns=["Id"])

TARGET = "Cover_Type"
RANDOM_STATE = 69

# Every remaining train column except the target (and "Id") is a feature.
_non_features = {'Id', TARGET}
FEATURES = [col for col in train.columns if col not in _non_features]

In [None]:
# Quick sanity summary of the training frame: shape and total missing cells.
n_rows, n_cols = train.shape
print(f'Number of rows in train data: {n_rows}')
print(f'Number of columns in train data: {n_cols}')
print(f'No of missing values in train data: {train.isna().sum().sum()}')

In [None]:
# Split features by cardinality: fewer than 25 distinct values counts as
# categorical, anything else as continuous.
unique_counts = train[FEATURES].nunique()
cat_features = [col for col in FEATURES if unique_counts[col] < 25]
cont_features = [col for col in FEATURES if unique_counts[col] >= 25]

print(f'Total Number of Features :{len(FEATURES)}')
print(f'Number of Categorical Features :{len(cat_features)}')
print(f'Number of Continuos Features :{len(cont_features)}')


In [None]:
# Remove every sample whose Cover_Type is 5. A boolean mask works for zero,
# one or many matching rows and does not rely on positions matching index labels.
# NOTE(review): presumably class 5 is too rare to stratify on — confirm.
train = train.loc[train["Cover_Type"] != 5].copy()

# Soil_Type7 and Soil_Type15 are removed from both frames.
# NOTE(review): presumably these columns carry no information — confirm.
train = train.drop(columns=["Soil_Type7", "Soil_Type15"])
test = test.drop(columns=["Soil_Type7", "Soil_Type15"])


In [None]:
# Keep the feature list in sync with the soil columns dropped from the frames.
for dropped_column in ('Soil_Type7', 'Soil_Type15'):
    FEATURES.remove(dropped_column)

In [None]:
# Shift out-of-range Aspect values by 360 so negatives move up and values
# above 359 move down (a single shift in each direction; presumably Aspect is
# an angle in degrees — confirm).
# .loc is used so the write lands on the frame itself; chained indexing
# (df[col][mask] += ...) is not guaranteed to modify the original frame.
for frame in (train, test):
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

In [None]:
# Short aliases for the long distance column names.
new_names = dict(
    Horizontal_Distance_To_Hydrology="x_dist_hydrlgy",
    Vertical_Distance_To_Hydrology="y_dist_hydrlgy",
    Horizontal_Distance_To_Roadways="x_dist_rdwys",
    Horizontal_Distance_To_Fire_Points="x_dist_firepts",
)

# Apply the same renaming to both frames.
train = train.rename(columns=new_names)
test = test.rename(columns=new_names)

In [None]:
# Replace the long distance names in FEATURES with their short aliases
# (the aliases are appended at the end of the list).
for old_name in (
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
):
    FEATURES.remove(old_name)

FEATURES += ["x_dist_hydrlgy", "y_dist_hydrlgy", "x_dist_rdwys", "x_dist_firepts"]

In [None]:
# Combine the horizontal and vertical hydrology distances into a Manhattan
# (|x| + |y|) and a Euclidean (sqrt(x^2 + y^2)) distance for both frames.
for frame in (train, test):
    dx = frame["x_dist_hydrlgy"]
    dy = frame["y_dist_hydrlgy"]
    frame["mnhttn_dist_hydrlgy"] = dx.abs() + dy.abs()
    frame["ecldn_dist_hydrlgy"] = (dx**2 + dy**2)**0.5

In [None]:
# Clamp the three Hillshade columns to the [0, 255] interval in both frames:
# negative values become 0, values above 255 become 255.
for frame in (train, test):
    for shade_col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame.loc[frame[shade_col] < 0, shade_col] = 0
        frame.loc[frame[shade_col] > 255, shade_col] = 255

In [None]:
# Column groups used by the aggregate features: the three hillshade readings
# plus every train column whose name starts with the given prefix.
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
soil_features = list(filter(lambda name: name.startswith("Soil_Type"), train.columns))
wilderness_features = list(filter(lambda name: name.startswith("Wilderness_Area"), train.columns))

def addFeature(X, soil_cols=None, wilderness_cols=None, hillshade_cols=None):
    """Add aggregate soil, wilderness and hillshade columns to ``X`` in place.

    Adds four columns:
      * ``Soil_Count`` - row-wise sum of the soil columns.
      * ``Wilderness_Area_Count`` - row-wise sum of the wilderness columns.
      * ``Hillshade_mean`` - row-wise mean of the hillshade columns.
      * ``amp_Hillshade`` - row-wise max minus min of the hillshade columns.

    Args:
        X: DataFrame to extend; it is modified in place.
        soil_cols: Columns summed into ``Soil_Count``. Defaults to the
            module-level ``soil_features``.
        wilderness_cols: Columns summed into ``Wilderness_Area_Count``.
            Defaults to the module-level ``wilderness_features``.
        hillshade_cols: Columns aggregated into the hillshade statistics.
            Defaults to the module-level ``features_Hillshade``.

    Returns:
        None.
    """
    if soil_cols is None:
        soil_cols = soil_features
    if wilderness_cols is None:
        wilderness_cols = wilderness_features
    if hillshade_cols is None:
        hillshade_cols = features_Hillshade

    # Vectorised row sums; a Python-level sum applied per row is far slower on
    # large frames. Missing values are skipped by the pandas reduction.
    X["Soil_Count"] = X[soil_cols].sum(axis=1)
    X["Wilderness_Area_Count"] = X[wilderness_cols].sum(axis=1)

    hillshade = X[hillshade_cols]
    X["Hillshade_mean"] = hillshade.mean(axis=1)
    X['amp_Hillshade'] = hillshade.max(axis=1) - hillshade.min(axis=1)

In [None]:
# Add the aggregate columns to both the training and the test frame.
for frame in (train, test):
    addFeature(frame)

In [None]:
# Row-wise summary statistics over the columns currently listed in FEATURES.
for frame in (train, test):
    base = frame[FEATURES]
    frame["mean"] = base.mean(axis=1)
    frame["std"] = base.std(axis=1)
    frame["min"] = base.min(axis=1)
    frame["max"] = base.max(axis=1)

# Register the new statistic columns only after they have been computed.
FEATURES += ['mean', 'std', 'min', 'max']

# Persist the engineered frames (without the index column).
train.to_csv("train_engineered.csv", index=None)
test.to_csv("test_engineered.csv", index=None)

In [None]:
from sklearn.pipeline import Pipeline
# Feature matrix and target vector used by every cross-validation loop.
y = train["Cover_Type"]
X = train.drop(columns=["Cover_Type"])

# Preprocessing pipeline: a single standard-scaling step, refit inside each fold.
pipe = Pipeline(steps=[('scaler', StandardScaler())])

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization,Dropout
from tensorflow.keras import layers
import tensorflow as tf


# Network input shape: everything after the row axis of the test frame.
INPUT_SHAPE = tuple(test.shape[1:])
# One output unit per distinct Cover_Type value left in the training data.
NUM_CLASSES = train["Cover_Type"].nunique()

def build_model():
    """Build and compile the dense classifier trained in the CV loop.

    The network has four hidden Dense layers (128, 64, 64, 64 units; the
    first two use SELU, the last two ReLU), each followed by batch
    normalisation, and a softmax output with ``NUM_CLASSES`` units. It is
    compiled with Adam, sparse categorical cross-entropy and accuracy.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    hidden_layers = (
        (128, "selu"),
        (64, "selu"),
        (64, "relu"),
        (64, "relu"),
    )

    with tf.device('/device:GPU:0'):
        layer_stack = []
        for position, (width, activation) in enumerate(hidden_layers):
            # Only the first layer declares the input shape.
            extra = {"input_shape": INPUT_SHAPE} if position == 0 else {}
            layer_stack.append(
                Dense(
                    units=width,
                    kernel_initializer="lecun_normal",
                    activation=activation,
                    **extra,
                )
            )
            layer_stack.append(BatchNormalization())
        layer_stack.append(Dense(units=NUM_CLASSES, activation="softmax"))

        model = Sequential(layer_stack)
        model.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

    return model


from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


# Multiply the learning rate by 0.5 when val_loss has not improved for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5)

# Stop when val_accuracy has not improved for 20 epochs and roll the model
# back to its best weights.
early_stop = EarlyStopping(monitor="val_accuracy", patience=20, restore_best_weights=True)

# Callback list handed to model.fit in the cross-validation loop.
callbacks = [reduce_lr, early_stop]

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Signed-integer columns are cast to the narrowest of int8/int16/int32 whose
    range contains the column's min and max (bounds inclusive). Float columns
    wider than float32 are cast to float32 when their min and max fit in the
    float32 range; this can lose precision. A column is never cast to a dtype
    that is as wide as, or wider than, its current one, and non-NumPy or
    non-numeric columns are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the final memory usage and the percentage
            reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        # Skip extension dtypes and anything that is not a signed int or float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            candidates = (np.int8, np.int16, np.int32)
            limits_of = np.iinfo
        else:
            candidates = (np.float32,)
            limits_of = np.finfo

        for candidate in candidates:
            # Candidates are ordered by width; once one is not narrower than
            # the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-missing column) fail both comparisons,
            # so such columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast the engineered frames to lower their memory footprint.
# NOTE(review): X and y were sliced from `train` before this point, so only
# `test` (transformed inside the loop) benefits here — confirm whether X should
# be rebuilt after the downcast.
train = reduce_mem_usage(train,True)
test = reduce_mem_usage(test,True)

# NOTE(review): TensorFlow is already imported at this point; this variable is
# presumably read at import time, so it may have no effect here — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Cross-validation and training settings for the neural network.
FOLDS = 5
EPOCHS = 200
BATCH_SIZE = 128

nn_scores = []
nn_preds = []

# sparse_categorical_crossentropy expects integer labels in [0, NUM_CLASSES).
# Cover_Type holds the raw dataset codes (with class 5 removed), which are not
# guaranteed to be contiguous from zero, so they are remapped for the network
# only; `y` itself is left untouched for the other models.
nn_label_encoder = LabelEncoder()
y_nn = nn_label_encoder.fit_transform(y)

kf = StratifiedKFold(n_splits=FOLDS,shuffle=True,random_state=RANDOM_STATE)
for fold,(train_idx,val_idx) in enumerate(kf.split(X=X,y=y_nn)):
    print(10*'-',f'fold: {fold+1}',10*'-')
    start_time = time.time()
    x_train = X.iloc[train_idx,:]
    x_val = X.iloc[val_idx,:]
    y_train = y_nn[train_idx]
    y_val = y_nn[val_idx]

    # Fit the scaler on the training fold only, then apply it to the validation fold.
    x_train = pipe.fit_transform(x_train)
    x_val = pipe.transform(x_val)

    model = build_model()
    with tf.device('/device:GPU:0'):
        model.fit(
            x_train,
            y_train,
            validation_data=(x_val, y_val),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=callbacks,
            verbose=False
        )
    # Free the fold's training data before predicting.
    del x_train
    del y_train
    # Predicted class = index of the largest softmax output (encoded label space).
    preds_val = np.argmax(model.predict(x_val),axis=1)
    acc = accuracy_score(y_val,preds_val)
    del preds_val
    del x_val
    del y_val
    nn_scores.append(acc)
    run_time = time.time()-start_time
    print(f'Fold: {fold+1} accuracy: {acc} runtime: {run_time}')
    del acc
    # Scale the test set with this fold's scaler and keep the class probabilities;
    # column i corresponds to nn_label_encoder.classes_[i].
    test_temp = pipe.transform(test)
    test_preds = model.predict(test_temp)
    nn_preds.append(test_preds)
    del model
    del test_temp



print("Mean Accuracy:",np.mean(nn_scores))
    

In [None]:
# XGBoost settings. `n_estimators` is the scikit-learn wrapper's name for the
# number of boosting rounds; a misspelled key is not applied by the wrapper and
# leaves the default number of trees in place.
xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'n_estimators' : 1000,
    'max_depth' : 7
    }

# Early-stopping patience, passed to fit() like in the LightGBM and CatBoost cells.
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` in the
# constructor instead of fit() — confirm against the installed version.
XGB_EARLY_STOPPING_ROUNDS = 200

xgb_scores = []
xgb_preds = []

# Encode the labels as 0..n_classes-1 for training (newer XGBoost versions
# reject other label sets) and map predictions back to the original codes so
# the submission keeps the raw Cover_Type values.
xgb_label_encoder = LabelEncoder()
y_xgb = xgb_label_encoder.fit_transform(y)

kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=RANDOM_STATE)
for fold,(train_idx,val_idx) in enumerate(kf.split(X=X,y=y_xgb)):
    print(10*'-',f'fold: {fold+1}',10*'-')
    start_time = time.time()
    x_train = X.iloc[train_idx,:]
    x_val = X.iloc[val_idx,:]
    y_train = y_xgb[train_idx]
    y_val = y_xgb[val_idx]

    # Fit the scaler on the training fold only, then apply it to the validation fold.
    x_train = pipe.fit_transform(x_train)
    x_val = pipe.transform(x_val)

    model = XGBClassifier(**xgb_params)
    model.fit(
        x_train,
        y_train,
        early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
        eval_set=[(x_val,y_val)],
        verbose=0
    )
    del x_train
    del y_train
    # Accuracy is computed in the encoded label space on both sides.
    preds_val = model.predict(x_val)
    acc = accuracy_score(y_val,preds_val)
    del preds_val
    del x_val
    del y_val
    xgb_scores.append(acc)
    run_time = time.time()-start_time
    print(f'Fold: {fold+1} accuracy: {acc} runtime: {run_time}')
    del acc
    # Scale the test set with this fold's scaler and store predictions as
    # original Cover_Type codes.
    test_temp = pipe.transform(test)
    test_preds = xgb_label_encoder.inverse_transform(model.predict(test_temp).astype(int))
    xgb_preds.append(test_preds)
    del model
    del test_temp



print("Mean Accuracy:",np.mean(xgb_scores))
    

CV score for max_depth=7, 10 folds, and early_stopping=200: 0.9615592403099757 (LB score: 0.95491, so this is overfitting).

CV score for max_depth=7, 5 folds, and early_stopping=100: 0.9615987403931131


In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
from sklearn.pipeline import Pipeline
# LightGBM settings: multiclass objective evaluated with multi-class log loss on GPU.
lgb_params = dict(
    objective='multiclass',
    metric='multi_logloss',
    device='gpu',
)

# Scaling-only preprocessing pipeline, refit inside each fold.
pipe = Pipeline(steps=[('scaler', StandardScaler())])

lgb_scores, lgb_preds = [], []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(10*'-',f'fold: {fold+1}',10*'-')
    start_time = time.time()

    # Slice the fold, fit the scaler on the training part only.
    x_train, x_val = X.iloc[train_idx, :], X.iloc[val_idx, :]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    x_train = pipe.fit_transform(x_train)
    x_val = pipe.transform(x_val)

    model = LGBMClassifier(**lgb_params)
    model.fit(
        x_train,
        y_train,
        early_stopping_rounds=200,
        eval_set=[(x_val, y_val)],
        verbose=0,
    )
    del x_train, y_train

    # Score the validation fold, then release it.
    acc = accuracy_score(y_val, model.predict(x_val))
    del x_val, y_val
    lgb_scores.append(acc)
    run_time = time.time() - start_time
    print(f'Fold: {fold+1} accuracy: {acc} runtime: {run_time}')
    del acc

    # Predict the test set using this fold's scaler and model.
    test_temp = pipe.transform(test)
    test_preds = model.predict(test_temp)
    lgb_preds.append(test_preds)
    del model, test_temp



print("Mean Accuracy:",np.mean(lgb_scores))
    

In [None]:
# CatBoost settings: multiclass objective trained on GPU.
catb_params = dict(
    objective="MultiClass",
    task_type="GPU",
)

catb_preds, catb_scores = [], []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(10*'-',f'fold: {fold+1}',10*'-')
    start_time = time.time()

    # Slice the fold, fit the scaler on the training part only.
    x_train, x_val = X.iloc[train_idx, :], X.iloc[val_idx, :]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    x_train = pipe.fit_transform(x_train)
    x_val = pipe.transform(x_val)

    model = CatBoostClassifier(**catb_params)
    model.fit(
        x_train,
        y_train,
        early_stopping_rounds=200,
        eval_set=[(x_val, y_val)],
        verbose=0,
    )
    del x_train, y_train

    # Score the validation fold, then release it.
    acc = accuracy_score(y_val, model.predict(x_val))
    del x_val, y_val
    catb_scores.append(acc)
    run_time = time.time() - start_time
    print(f'Fold: {fold+1} accuracy: {acc} runtime: {run_time}')
    del acc

    # Predict the test set using this fold's scaler and model.
    test_temp = pipe.transform(test)
    test_preds = model.predict(test_temp)
    catb_preds.append(test_preds)
    del model, test_temp



print("Mean Accuracy:",np.mean(catb_scores))

In [None]:
# Re-display the mean CatBoost fold accuracy.
print("Mean Accuracy:", np.asarray(catb_scores).mean())

In [None]:
# Majority vote across the per-fold XGBoost test predictions: each fold is one
# column, and the row-wise mode becomes the submitted Cover_Type.
xgb_submission = submission.assign(
    Cover_Type=np.squeeze(mode(np.column_stack(xgb_preds), axis=1)[0]).astype('int')
)
xgb_submission.to_csv("xgb-subs.csv", index=None)
xgb_submission.head()

In [None]:
# Preview the first 20 rows of the XGBoost submission.
xgb_submission.iloc[:20]

In [None]:
# Majority vote across the per-fold LightGBM test predictions: each fold is one
# column, and the row-wise mode becomes the submitted Cover_Type.
lgb_submission = submission.assign(
    Cover_Type=np.squeeze(mode(np.column_stack(lgb_preds), axis=1)[0]).astype('int')
)
lgb_submission.to_csv("lgb-subs.csv", index=None)
lgb_submission.head()

In [None]:
# Majority vote across the per-fold CatBoost test predictions: each fold is one
# column, and the row-wise mode becomes the submitted Cover_Type.
catb_submission = submission.assign(
    Cover_Type=np.squeeze(mode(np.column_stack(catb_preds), axis=1)[0]).astype('int')
)
catb_submission.to_csv("catb-subs.csv", index=None)
catb_submission.head()

In [None]:
# Pool the fold predictions of all three boosted models and take the row-wise
# majority vote as the ensemble submission.
total_preds = [*xgb_preds, *lgb_preds, *catb_preds]
total_submission = submission.assign(
    Cover_Type=np.squeeze(mode(np.column_stack(total_preds), axis=1)[0]).astype('int')
)
total_submission.to_csv("total-subs.csv", index=None)
total_submission.head()