# Application of CatBoostClassifier on TPS December 2021
### Please let me know of any improvements, I'm here to learn

Used https://www.kaggle.com/chryzal/features-engineering-for-you for the feature engineering, give him a thumbs up!

In [None]:
import pandas as pd
import numpy as np
import datatable as dt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier
import tensorflow as tf # Just to see if gpu is connected

from tqdm import tqdm

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    For every column whose dtype is one of the plain NumPy int/float dtypes
    listed below, the column's min and max are compared with the
    representable range of each candidate dtype (smallest first) and the
    column is cast to the first candidate that holds both.  Range checks are
    inclusive, so a column whose extremes sit exactly on a dtype's limits
    (e.g. a max of 127) still gets that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Float columns are downcast to float16 whenever their range fits, which
    keeps the magnitude but reduces precision.
    """
    numerics = ("int8", "int16", "int32", "int64", "float16", "float32", "float64")
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # leave bool / object / categorical / etc. untouched

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float16/float32 (or min/max are NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero-sized frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Feature Engineering

* Read the data with datatable and convert it to pandas; this is often faster than reading with pandas directly
* Remove Cover_Type = 5, which has only 1 sample in the entire training data
* Label-encode Cover_Type on the training set; this is not necessary for CatBoost, but it is good practice

In [None]:
# Read each CSV with datatable, convert to pandas and downcast the dtypes.
# The test frame is reduced first, then the train frame.
test_df = reduce_memory_usage(
    dt.fread("../input/tabular-playground-series-dec-2021/test.csv").to_pandas()
)
train_df = reduce_memory_usage(
    dt.fread("../input/tabular-playground-series-dec-2021/train.csv").to_pandas()
)

# Drop the rows labelled Cover_Type == 5, then the Id column from both frames.
rare_class_rows = train_df.index[train_df["Cover_Type"] == 5]
train_df = train_df.drop(index=rare_class_rows).drop(columns="Id")
test_df = test_df.drop(columns="Id")

# Map the remaining Cover_Type labels to consecutive integer codes; the fitted
# encoder is kept so predictions can be mapped back to the original labels.
encoder = LabelEncoder()
train_df["Cover_Type"] = encoder.fit_transform(train_df["Cover_Type"])

Drop Soil_Type7 and Soil_Type15 from the train and test datasets (Id was already dropped in the previous cell). Soil_Type7 and Soil_Type15 have the same value in all rows.

In [None]:
# Columns removed from both frames before feature engineering.
cols_to_drop = ["Soil_Type7", "Soil_Type15"]
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

# New Features
"Borrowed" from https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering 

Aspect is a compass direction in degrees, so it should be between 0 and 360.

In [None]:
# Wrap Aspect back into the 0-359 degree range: negative values get +360 and
# values above 359 get -360.
#
# A single `.loc[mask, column]` assignment is used instead of chained indexing
# (`df["Aspect"][mask] += 360`): the chained form writes to a temporary object,
# which triggers SettingWithCopyWarning and, under pandas Copy-on-Write, leaves
# the DataFrame unchanged.
for frame in (train_df, test_df):
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

Creating distance features from horizontal and vertical distance

In [None]:
# Straight-line (Euclidean) distance to hydrology from the horizontal and
# vertical offsets: sqrt(h**2 + v**2).
#
# The previous formula, sqrt(h) + sqrt(v), is not a distance and evaluates to
# NaN whenever either offset is negative.  The inputs are cast to float64
# first so the squaring inside hypot cannot overflow a downcast integer column.
for frame in (train_df, test_df):
    horizontal = frame["Horizontal_Distance_To_Hydrology"].astype("float64")
    vertical = frame["Vertical_Distance_To_Hydrology"].astype("float64")
    frame["dist2hydro"] = np.hypot(horizontal, vertical)

Create sum of all soil types and sum of wilderness types

In [None]:
# Row-wise count of active soil-type indicator columns.
# "soil_type_cnt" itself contains "soil", so it is excluded explicitly;
# otherwise re-running this cell would add the previous count into the new one.
soil_feats = [
    col for col in train_df.columns
    if "soil" in col.lower() and col != "soil_type_cnt"
]
train_df["soil_type_cnt"] = train_df[soil_feats].sum(axis=1)
test_df["soil_type_cnt"] = test_df[soil_feats].sum(axis=1)

# Row-wise count of active wilderness indicator columns.  The derived column
# name ("wild_feats_cnt") does not contain "wilderness", so it is never
# selected by this filter.
wild_feats = [col for col in train_df.columns if "wilderness" in col.lower()]
train_df["wild_feats_cnt"] = train_df[wild_feats].sum(axis=1)
test_df["wild_feats_cnt"] = test_df[wild_feats].sum(axis=1)

Hillshade values should be between 0 and 255

In [None]:
# Clamp every Hillshade column of both frames to the 0-255 range:
# values below 0 become 0, values above 255 become 255.
for frame in (train_df, test_df):
    for shade_col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame.loc[frame[shade_col] < 0, shade_col] = 0
        frame.loc[frame[shade_col] > 255, shade_col] = 255

### Scaling data
Great article on interesting ways to select pandas columns: https://towardsdatascience.com/interesting-ways-to-select-pandas-dataframe-columns-b29b82bbfb33

In [None]:
# Scale only the columns that take a value greater than 1 somewhere in the
# test set (this leaves 0/1 indicator columns alone).
cols_to_scale = test_df.columns[
    [(test_df[col] > 1).any() for col in test_df.columns]
]

# Fit the scaler on the training data only and reuse the fitted statistics for
# the test data.  Re-fitting on the test set (fit_transform) would scale train
# and test with different centre/scale values, so the same raw value would map
# to different model inputs at train and predict time.
scaler = RobustScaler()
train_df[cols_to_scale] = scaler.fit_transform(train_df[cols_to_scale])
test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

# Split data and train model

In [None]:
FOLDS = 5 # Takes around 2.5 min per fold with 1000 iterations
iterations = 10000
scores = []

cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

# Build the arrays without mutating train_df (no .pop), so this cell can be
# re-run.  The test matrix is selected with the same column list, so train and
# test features are guaranteed to be in the same order.
feature_cols = [col for col in train_df.columns if col != "Cover_Type"]
y = train_df["Cover_Type"].values
# An explicit float dtype avoids an object-dtype array when the frame mixes
# bool and numeric columns.
X = train_df[feature_cols].to_numpy(dtype=np.float32)
X_test = test_df[feature_cols].to_numpy(dtype=np.float32)

# Accumulator for the fold-averaged class probabilities on the test set,
# allocated with its real shape rather than relying on (1, 1) broadcasting.
test_preds = np.zeros((X_test.shape[0], len(np.unique(y))))

for fold, (train_idx, test_idx) in enumerate(
    tqdm(cv.split(X, y), total=FOLDS), start=1
):
    X_train, X_val = X[train_idx], X[test_idx]
    y_train, y_val = y[train_idx], y[test_idx]

    model = CatBoostClassifier(iterations=iterations,
                          task_type="GPU",
                          devices="0:1")
    model.fit(
        X_train,
        y_train,
        verbose=False
    )

    # Labels are the encoder's integer codes, so the argmax column index is
    # compared directly with y_val.
    y_pred = np.argmax(model.predict_proba(X_val), axis=1)

    score = accuracy_score(y_val, y_pred)
    print(f"Fold {fold} Validation Accuracy: {score}")
    scores.append(score)

    # Average (rather than sum) the fold probabilities; the argmax is the same
    # either way, but test_preds stays a valid probability table.
    test_preds += model.predict_proba(X_test) / FOLDS
    del model, score, y_pred # Try to save some memory

print(f"\n\nMean accuracy over all folds: {np.mean(scores)}")

In [None]:
# Load the sample submission as a template for the output file.
subm_df = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

# Pick the highest-scoring class per test row, then map the encoded class
# indices back to the original Cover_Type labels.
preds = test_preds.argmax(axis=1)
subm_df["Cover_Type"] = encoder.inverse_transform(preds)

subm_df.to_csv("Submission CB.csv", index=False)

In [None]:
# Sanity check: show the distinct Cover_Type labels present in the submission.
print(set(subm_df["Cover_Type"]))