<center><h1 style = "font-size:20px;font-family: Copperplate">IMPORT THE LIBRARIES</h1></center>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from catboost import CatBoostClassifier, Pool

<center><h1 style = "font-size:20px;font-family: Copperplate">Data Loading & Preprocessing</h1></center>

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv"),
    pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/test.csv"),
)

In [None]:
# Show (rows, columns) of both frames, one per line, then preview the training data.
print(train_data.shape, test_data.shape, sep="\n")

train_data.head()

In [None]:
# Number of training rows per Cover_Type, as a one-column frame named "Count".
df = (
    train_data.groupby('Cover_Type')["Id"]
    .count()
    .to_frame("Count")
)


def style_negative(v, props=''):
    """Return ``props`` (a CSS string) for values below 2, otherwise ``None``."""
    if v < 2:
        return props
    return None


# Highlight classes that have fewer than two samples.
df.style.applymap(style_negative, props='color:white;background-color:darkred')

Cover_Type 5 has fewer than two samples (highlighted in the table above), so we simply drop it from the training data.

In [None]:
# Keep every training row whose label is not Cover_Type 5.
train_data = train_data.loc[train_data['Cover_Type'].ne(5)]

In [None]:
# Features: every training column except the row identifier and the label.
X = train_data.drop(['Id', 'Cover_Type'], axis=1)
# Target is kept as a one-column DataFrame.
y = train_data.loc[:, ['Cover_Type']]

# Test features: same layout as X (the test file has no label column to drop).
X_test = test_data.drop('Id', axis=1)

<center><h1 style = "font-size:20px;font-family: Copperplate">Feature Engineering</h1></center>

The feature engineering below follows [TPS Dec '21 | TensorFlow NN + Feature Engineering](https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering), which is an excellent reference on the topic.

In [None]:
from sklearn.preprocessing import RobustScaler

# Shorter aliases for the long distance-column names.
new_names = {
    "Horizontal_Distance_To_Hydrology": "x_dist_hydrlgy",
    "Vertical_Distance_To_Hydrology": "y_dist_hydrlgy",
    "Horizontal_Distance_To_Roadways": "x_dist_rdwys",
    "Horizontal_Distance_To_Fire_Points": "x_dist_firepts"
}

# One-hot indicator columns, looked up once on the training frame and reused for the test frame.
soil_features = [x for x in X.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in X.columns if x.startswith("Wilderness_Area")]
hillshade_features = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]


def _add_engineered_features(frame):
    """Apply the shared feature engineering to ``frame`` in place.

    Steps: rename the distance columns, wrap ``Aspect`` by one turn, add the
    Manhattan/Euclidean hydrology distances, add the soil-type and
    wilderness-area indicator counts, and clip the hillshade columns to 0..255.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame (train or test). It is modified in place.

    Returns
    -------
    None
    """
    frame.rename(new_names, axis=1, inplace=True)

    # Use .loc instead of chained indexing (frame["Aspect"][mask] += ...):
    # chained assignment triggers SettingWithCopyWarning and has no effect
    # when pandas Copy-on-Write is enabled.
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

    # Manhattan distance to Hydrology
    frame["mnhttn_dist_hydrlgy"] = np.abs(frame["x_dist_hydrlgy"]) + np.abs(frame["y_dist_hydrlgy"])

    # Euclidean distance to Hydrology
    frame["ecldn_dist_hydrlgy"] = (frame["x_dist_hydrlgy"]**2 + frame["y_dist_hydrlgy"]**2)**0.5

    # Number of active indicator columns per row.
    frame["soil_type_count"] = frame[soil_features].sum(axis=1)
    frame["wilderness_area_count"] = frame[wilderness_features].sum(axis=1)

    # Force hillshade values into the 0..255 range.
    frame[hillshade_features] = frame[hillshade_features].clip(lower=0, upper=255)


_add_engineered_features(X)
_add_engineered_features(X_test)

# Columns to scale. Each column is listed exactly once.
cols = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "soil_type_count",
    "wilderness_area_count",
    "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts"
]

# Fit the scaler on the training features only, then reuse it for the test features.
scaler = RobustScaler()
X[cols] = scaler.fit_transform(X[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Signed-integer NumPy columns are converted to the smallest of
    int8/int16/int32 whose range contains the column's min and max (bounds
    inclusive). float64 columns whose min and max fit the float32 range are
    converted to float32. A column is never widened, and columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are not NumPy dtypes; skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a column spanning exactly -128..127 fits int8.
                if bounds.min <= c_min and c_max <= bounds.max:
                    # Only convert when the candidate is actually narrower.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type == np.float64:
            c_min = df[col].min()
            c_max = df[col].max()
            bounds = np.finfo(np.float32)
            # NaN or infinite extremes fail this test, leaving the column as float64.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast the train features first, then the test features.
X, X_test = (reduce_mem_usage(frame) for frame in (X, X_test))

<center><h1 style = "font-size:20px;font-family: Copperplate">CatBoost with KFold</h1></center>

In [None]:
# Cross-validation splitter: K shuffled folds with a fixed seed.
K = 5
kf = KFold(K, shuffle=True, random_state=42)

<center><h1 style = "font-size:20px;font-family: Copperplate">Hyperparameter Optimization with Optuna</h1></center>

In [None]:
# Take only the first fold of the splitter as the train/validation split.
train_idx, val_idx = next(kf.split(X))
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

In [None]:
import optuna

def optimize_hp(trial):
    """Optuna objective: fit a CatBoost classifier and return its validation accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_val, y_val)``.
    """
    cb_params = {
        'iterations': 1000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 1.0, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 100, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 20.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        "use_best_model": True,
        "task_type": "GPU",
        'random_seed': 42
    }

    model = CatBoostClassifier(**cb_params)
    # The validation fold doubles as the eval set consumed by use_best_model.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

In [None]:
# The hyperparameter search is disabled; uncomment the lines below to run it.
# NOTE(review): the fixed values in the training cell's cb_params are presumably
# the result of an earlier run of this study -- confirm.
# study = optuna.create_study(direction="maximize")
# study.optimize(optimize_hp, n_trials=10)
# print('Trials:', len(study.trials))
# print('Best parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

<center><h1 style = "font-size:20px;font-family: Copperplate">Train and Inference</h1></center>

In [None]:
# Fixed CatBoost settings for the final cross-validated training run.
cb_params = dict(
    learning_rate=0.3277295792305584,
    l2_leaf_reg=3.1572972266001518,
    bagging_temperature=0.6799604234141348,
    random_strength=1.99590400593318,
    depth=6,
    min_data_in_leaf=93,
    iterations=10000,
    use_best_model=True,
    task_type='GPU',
    random_seed=42,
)
# One classifier instance; it is re-fitted on every fold by the training loop.
model = CatBoostClassifier(**cb_params)

In [None]:
# Per-fold validation accuracies and the running sum of test-set class probabilities.
CV = []
y_test_proba = 0

fold_iter = tqdm(enumerate(kf.split(X)), total=K)
for i, (train_idx, val_idx) in fold_iter:
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    # Fit on this fold's training rows, using the held-out rows as the eval set.
    fit_model = model.fit(
        X_train,
        y_train,
        eval_set=Pool(X_val, y_val),
        use_best_model=True,
        verbose=False,
    )

    # Validation accuracy for this fold.
    y_pred = fit_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    CV.append(accuracy)

    # Accumulate test-set probabilities; they are averaged after the loop.
    y_test_proba += fit_model.predict_proba(X_test)

print(f'CV Score: {CV}')

In [None]:
# Average the summed fold probabilities over the actual number of folds (K)
# rather than a hard-coded 5, so the value stays correct if K changes.
# Cover_Type 5 was removed from the training data, so a zero-probability column
# is inserted at index 4; column index + 1 then gives the label.
# NOTE(review): assumes predict_proba columns follow ascending class labels -- confirm.
answer = np.argmax(np.insert(y_test_proba / K, 4, 0, axis=1), axis=1) + 1

<center><h1 style = "font-size:20px;font-family: Copperplate">Submit</h1></center>

In [None]:
# Load the competition's sample submission file as the template for the output.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv")

In [None]:
# Overwrite the template's Cover_Type column with the predicted labels and preview the result.
submission['Cover_Type'] = answer
submission.head(10)

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)