# Part 1: Reading data and preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed in a later release, so use whichever variant this
# installation actually ships (falling back to the default style).
PLOT_STYLE = next(
    (name for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid") if name in plt.style.available),
    "default",
)
plt.style.use(PLOT_STYLE)
pd.set_option("display.max_columns", None)

# Directory holding the competition CSV files; change it here to run elsewhere.
DATA_DIR = "../input/tabular-playground-series-dec-2021"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sub_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

train_df.head()

Dropping the **Id**, **Soil_Type7** and **Soil_Type15** columns from both the train and test sets, and removing the training row for which **Cover_Type=5**.

In [None]:
# Columns removed from both frames: Id, Soil_Type7 and Soil_Type15
cols = ["Id", "Soil_Type7", "Soil_Type15"]

train_df = train_df.drop(columns=cols)
test_df = test_df.drop(columns=cols)

# Remove the training row(s) labelled Cover_Type=5
idx = train_df.loc[train_df["Cover_Type"] == 5].index
train_df = train_df.drop(index=idx)

Renaming some columns.

In [None]:
# Shorter aliases for the four long distance columns (old name -> new name).
new_names = dict(
    Horizontal_Distance_To_Hydrology="x_dist_hydrlgy",
    Vertical_Distance_To_Hydrology="y_dist_hydrlgy",
    Horizontal_Distance_To_Roadways="x_dist_rdwys",
    Horizontal_Distance_To_Fire_Points="x_dist_firepts",
)

train_df = train_df.rename(columns=new_names)
test_df = test_df.rename(columns=new_names)

Encoding labels.

In [None]:
from sklearn.preprocessing import LabelEncoder


# Learn the label mapping from the training target, then overwrite the target
# column with its encoded values. `encoder` is kept so predictions can be
# mapped back to the original Cover_Type labels later.
encoder = LabelEncoder().fit(train_df["Cover_Type"])
train_df["Cover_Type"] = encoder.transform(train_df["Cover_Type"])

# Part 2: Feature Engineering

Fixing the ranges of the **Aspect** (an angle, 0–359) and `Hillshade_*` (0–255) features.

In [None]:
def fix_ranges(df):
    """Bring ``Aspect`` and the ``Hillshade_*`` columns back into their valid ranges.

    ``Aspect`` is an angle, so it is wrapped modulo 360 into ``[0, 360)``.
    The three hillshade columns are clipped to ``[0, 255]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Aspect``, ``Hillshade_9am``,
        ``Hillshade_Noon`` and ``Hillshade_3pm``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment/chaining.
    """
    # Modulo handles values that are off by any number of full turns and never
    # produces a negative result for a positive divisor, unlike a single
    # "+360 / -360" correction.
    df["Aspect"] = df["Aspect"] % 360

    for col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        df[col] = df[col].clip(lower=0, upper=255)

    return df


# Apply the same range corrections to the train and test frames.
train_df, test_df = map(fix_ranges, (train_df, test_df))

Creating distance-based features (Manhattan and Euclidean distance to hydrology) from **Horizontal_Distance_To_Hydrology** and **Vertical_Distance_To_Hydrology**.

In [None]:
def dist_feats(df):
    """Add Manhattan and Euclidean distance-to-hydrology columns to ``df``.

    Reads ``x_dist_hydrlgy`` and ``y_dist_hydrlgy`` and writes two new columns:
    ``mnhttn_dist_hydrlgy`` (sum of absolute values) and ``ecldn_dist_hydrlgy``
    (square root of the sum of squares). ``df`` is modified in place and
    returned.
    """
    horizontal = df["x_dist_hydrlgy"]
    vertical = df["y_dist_hydrlgy"]

    df["mnhttn_dist_hydrlgy"] = horizontal.abs() + vertical.abs()
    df["ecldn_dist_hydrlgy"] = (horizontal.pow(2) + vertical.pow(2)).pow(0.5)

    return df


# Add the hydrology distance features to both frames.
train_df, test_df = map(dist_feats, (train_df, test_df))

Creating features - [Sum of Soil_Types and Wilderness_Area](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/292823) - taken from discussions.

In [None]:
# Names of the training columns whose name starts with "Soil_Type" /
# "Wilderness_Area" (selected by prefix, in their original column order).
soil_features = train_df.columns[train_df.columns.str.startswith("Soil_Type")].tolist()
wilderness_features = train_df.columns[train_df.columns.str.startswith("Wilderness_Area")].tolist()

def sum_feats(df, soil_cols=None, wilderness_cols=None):
    """Add per-row sums of the ``Soil_Type*`` and ``Wilderness_Area*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    soil_cols, wilderness_cols : list of str, optional
        Columns to sum for each feature. When omitted, every column of ``df``
        whose name starts with ``"Soil_Type"`` / ``"Wilderness_Area"`` is used,
        so the function does not depend on lists defined in another cell.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``soil_type_count`` and
        ``wilderness_area_count`` added.
    """
    if soil_cols is None:
        soil_cols = [col for col in df.columns if col.startswith("Soil_Type")]
    if wilderness_cols is None:
        wilderness_cols = [col for col in df.columns if col.startswith("Wilderness_Area")]

    # The new column names are lowercase, so they never match the prefixes
    # above and re-running the function does not fold them into the sums.
    df["soil_type_count"] = df[soil_cols].sum(axis=1)
    df["wilderness_area_count"] = df[wilderness_cols].sum(axis=1)

    return df


# Add the soil-type / wilderness-area count features to both frames.
train_df, test_df = sum_feats(train_df), sum_feats(test_df)

Some new features taken from this discussion post: [Feature engineering update thread](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293612).

In [None]:
def r(x):
    """Return the angle opposite to ``x``, i.e. ``x`` rotated by 180 degrees.

    The result is wrapped into ``[0, 360)``, so ``r(180)`` is ``0`` rather
    than ``360`` (which would fall outside the 0-359 range used for Aspect).
    """
    return (x + 180) % 360

def extra_feats(df):
    """Add interaction and indicator features to ``df`` in place and return it.

    New columns, in creation order:

    * ``Aspect2`` - ``Aspect`` mapped through ``r``.
    * ``EHiElv`` / ``EViElv`` - roadway distance and vertical hydrology
      distance, each multiplied by ``Elevation``.
    * ``Highwater`` - 1 where the vertical hydrology distance is negative.
    * ``EVDtH`` - ``Elevation`` minus the vertical hydrology distance.
    * ``Hydro_Fire_*``, ``Hydro_Road_*``, ``Fire_Road_*`` - sum (``_1``) and
      absolute difference (``_2``) of each pair of horizontal distances. The
      sums are wrapped in ``abs`` as well, except for ``Hydro_Fire_1``.
    * ``Hillshade_3pm_is_zero`` - 1 where ``Hillshade_3pm`` equals 0.
    """
    elevation = df["Elevation"]
    hydro_x = df["x_dist_hydrlgy"]
    hydro_y = df["y_dist_hydrlgy"]
    roads = df["x_dist_rdwys"]
    fire = df["x_dist_firepts"]

    df["Aspect2"] = df["Aspect"].map(r)

    df["EHiElv"] = roads * elevation
    df["EViElv"] = hydro_y * elevation

    df["Highwater"] = (hydro_y < 0).astype(int)
    df["EVDtH"] = elevation - hydro_y

    # NOTE: Hydro_Fire_1 is the only pairwise sum stored without abs().
    df["Hydro_Fire_1"] = hydro_x + fire
    df["Hydro_Fire_2"] = (hydro_x - fire).abs()

    df["Hydro_Road_1"] = (hydro_x + roads).abs()
    df["Hydro_Road_2"] = (hydro_x - roads).abs()

    df["Fire_Road_1"] = (fire + roads).abs()
    df["Fire_Road_2"] = (fire - roads).abs()

    df["Hillshade_3pm_is_zero"] = (df["Hillshade_3pm"] == 0).astype(int)

    return df


# Add the extra interaction / indicator features to both frames.
train_df, test_df = map(extra_feats, (train_df, test_df))

Scaling continuous features with RobustScaler.

In [None]:
from sklearn.preprocessing import RobustScaler


# A test column with more than two distinct values is treated as continuous;
# every other test column is treated as categorical.
distinct_counts = test_df.nunique()
numerical_cols = [col for col in test_df.columns if distinct_counts[col] > 2]
categorical_cols = list(set(test_df.columns).difference(numerical_cols))

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both the train and test frames.
scaler = RobustScaler().fit(train_df[numerical_cols])
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

Reducing the memory footprint of the train and test dataframes by downcasting their numeric columns.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to narrower dtypes to save memory.

    * ``int16`` / ``int32`` / ``int64`` columns are converted to the narrowest
      of ``int8``, ``int16``, ``int32``, ``int64`` whose range (bounds
      inclusive) contains the column's minimum and maximum.
    * ``float64`` columns are converted to ``float32`` when their minimum and
      maximum fit into the finite ``float32`` range. This reduces precision.
    * All other columns, including ``float16`` / ``float32`` ones, are left
      untouched so that no column is ever widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are replaced in place.
    verbose : bool, default True
        Print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    for col in df.columns:
        col_type = str(df[col].dtype)

        if col_type in ('int16', 'int32', 'int64'):
            c_min, c_max = df[col].min(), df[col].max()
            # Try candidates from narrowest to widest and stop at the first fit.
            # For an empty column min/max are NaN, every comparison is False
            # and the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            c_min, c_max = df[col].min(), df[col].max()
            limits = np.finfo(np.float32)
            # Infinite or all-NaN columns fail this check and stay float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / bytes_per_mb

    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df


# Downcast both frames; each call prints its own memory report.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

# Part 3: XGBoost Modelling

In [None]:
# Hyper-parameters passed to every XGBClassifier trained below.
best_params = dict(
    n_estimators=1700,
    max_depth=12,
    learning_rate=0.01146513635635539,
    gamma=0.2594531967123816,
    min_child_weight=3.4942992853505186,
    subsample=0.6257232748066737,
    colsample_bytree=0.626827539397344,
    reg_alpha=6,
    reg_lambda=87,
)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


# Split the training frame into features and target.
X = train_df.drop("Cover_Type", axis=1)
y = train_df[["Cover_Type"]]

# The full training frame is no longer needed once X and y exist.
del train_df

# Accumulators summed over the folds; later cells divide them by FOLDS.
feat_imp = pd.Series(0.0, index=test_df.columns)
# A (1, 1) zero array broadcasts against the first fold's probability matrix,
# so the class count does not have to be known up front.
test_preds = np.zeros((1, 1))
scores = []

FOLDS = 10
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_idx, :], X.iloc[val_idx, :]
    y_train, y_val = y.iloc[train_idx, :], y.iloc[val_idx, :]

    clf = XGBClassifier(**best_params, tree_method="gpu_hist", use_label_encoder=False)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
        eval_metric="mlogloss"
    )

    y_pred = clf.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    scores.append(score)

    # get_score() leaves out features with zero importance in this fold. A
    # plain `+` would align on the index and turn those features into NaN for
    # the rest of the run, so add with fill_value=0 instead.
    fold_gain = pd.Series(clf.get_booster().get_score(importance_type="gain"), dtype="float64")
    feat_imp = feat_imp.add(fold_gain, fill_value=0)
    test_preds = test_preds + clf.predict_proba(test_df)

    print(f"Fold {fold} Accuracy: {score}")

print()
print(f"Mean Accuracy: {np.mean(scores)}")

Feature Importances.

In [None]:
# Turn the per-fold sum into a mean, then draw one horizontal bar per feature
# using the ascending order returned by nsmallest.
feat_imp = feat_imp.div(FOLDS)
feat_imp.nsmallest(feat_imp.size).plot.barh(figsize=(20, 20))

Using a soft-voting strategy (averaging the class probabilities predicted by each fold's model) to ensemble the test predictions.

In [None]:
# Average the summed class probabilities over the folds, take the most likely
# class per row and map it back to the original Cover_Type labels.
mean_proba = test_preds / FOLDS
test_preds = encoder.inverse_transform(mean_proba.argmax(axis=1))

sub_df["Cover_Type"] = test_preds
sub_df.head()

In [None]:
# Write the submission to "submission.csv" in the working directory, without
# the DataFrame index column.
sub_df.to_csv("submission.csv", index=False)