# Cover Type Prediction: Model selection

Sébastien Meyer

In [None]:
from datetime import datetime

from tqdm import tqdm

import numpy as np
import pandas as pd
from numpy.random import MT19937, RandomState, SeedSequence

import tensorflow as tf

from sklearn.cluster import KMeans
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, ExtraTreesRegressor, StackingClassifier,
    VotingClassifier
)
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, LabelBinarizer, StandardScaler, RobustScaler
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- Global notebook configuration ---

# Feature-engineering switches read by the polynomial-feature and merge cells
merge_kdf = True  # use of knowledge domain features ?
merge_pw = True  # use of polynomial features ?

# One seed feeds numpy's global generator, TensorFlow and the shared RandomState `rs`
seed = 8005
np.random.seed(seed)
tf.random.set_seed(seed)
rs = RandomState(MT19937(SeedSequence(seed)))

# Turn off pandas' chained-assignment warning (default='warn')
pd.options.mode.chained_assignment = None

# Font size applied to every matplotlib figure
font = dict(size=22)
mpl.rc("font", **font)

In [None]:
# Read files
# test_df holds every row of the csv; train_df (below) is a subset of those same rows,
# so scores computed on test_df also cover rows that were seen during training.
test_df = pd.read_csv("data/covtype.csv", index_col=["Id"])

with open("data/training_ids.txt", "r", encoding="utf-8") as f:
    # One comma-separated list of integers; blank tokens (e.g. after a trailing comma) are skipped
    training_ids = [int(x) for x in f.read().split(",") if x.strip()]

# NOTE(review): the ids are applied positionally (iloc) although the frame is indexed by "Id";
# confirm that training_ids.txt stores row positions rather than Id labels.
train_df = test_df.iloc[training_ids, :].copy()

# Shuffle
train_df = shuffle(train_df, random_state=rs)
test_df = shuffle(test_df, random_state=rs)

# Eliminate useless features
if "Soil_Type15" in train_df.columns:
    train_df.drop(columns=["Soil_Type15"], inplace=True)
    test_df.drop(columns=["Soil_Type15"], inplace=True)

# Correct missing values of Hillshade_3pm: zeros are treated as missing and replaced by the
# prediction of a regressor fitted on the training rows whose value is non-zero.
etr_h3pm = ExtraTreesRegressor(random_state=rs, n_jobs=-1)

h3pm_var = "Hillshade_3pm"
train_h3pm_pos = train_df.index[train_df[h3pm_var] != 0].tolist()
train_h3pm_zeros = train_df.index[train_df[h3pm_var] == 0].tolist()
test_h3pm_zeros = test_df.index[test_df[h3pm_var] == 0].tolist()

# Regressor inputs: every column except the imputed one and the target
h3pm_excluded = [h3pm_var, "Cover_Type"]
train_h3pm_feat = train_df.drop(columns=h3pm_excluded)
test_h3pm_feat = test_df.drop(columns=h3pm_excluded)

etr_h3pm.fit(train_h3pm_feat.loc[train_h3pm_pos, :], train_df.loc[train_h3pm_pos, h3pm_var])

# predict() rejects an empty input, so a frame is only imputed when it actually contains zeros.
# The column is cast to float first because the predictions are generally not integers.
if train_h3pm_zeros:
    train_df[h3pm_var] = train_df[h3pm_var].astype(float)
    train_df.loc[train_h3pm_zeros, h3pm_var] = etr_h3pm.predict(train_h3pm_feat.loc[train_h3pm_zeros, :])
if test_h3pm_zeros:
    test_df[h3pm_var] = test_df[h3pm_var].astype(float)
    test_df.loc[test_h3pm_zeros, h3pm_var] = etr_h3pm.predict(test_h3pm_feat.loc[test_h3pm_zeros, :])

# Binary features and target
wild_var = [f"Wilderness_Area{i}" for i in range(1, 5)]
soil_var = [f"Soil_Type{i}" for i in range(1, 41) if i != 15]
label_var = ["Cover_Type"]

# Separate discrete and continuous features
all_var = train_df.columns
disc_var = wild_var + soil_var + label_var
cont_var = [x for x in all_var if x not in disc_var]

## Utils

This function drops one column out of every pair of features whose correlation exceeds a given threshold.

In [None]:
def drop_corr_feat(train_df, test_df=None, corr_threshold=1.):
    """Drop, in place, one feature out of every pair correlated above a threshold.

    The correlation matrix is computed on ``train_df`` only. Columns are visited in
    matrix order; a column is dropped as soon as an earlier column has a correlation
    with it that is strictly greater than ``corr_threshold``. Only positive
    correlations are considered (no absolute value is taken).

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame used to compute correlations; modified in place.
    test_df : pandas.DataFrame, optional
        Second frame from which the same columns are dropped in place.
    corr_threshold : float, default 1.
        Correlation above which the later column of a pair is dropped. With a
        value >= 1 nothing can exceed it, so the function returns immediately.

    Returns
    -------
    list
        Names of the dropped columns, in correlation-matrix order (empty when
        nothing was dropped).
    """
    if corr_threshold >= 1.:
        return []

    print("\nDropping correlated features...")

    # Compute correlation
    corr_df = train_df.corr()

    # For each column, the features whose correlation with it is above the threshold
    above_threshold_cols = {
        col: list(corr_df.index[corr_df[col] > corr_threshold]) for col in corr_df
    }

    # Track columns to remove and columns already examined
    cols_to_remove = set()
    cols_seen = set()

    for col, corr_cols in tqdm(above_threshold_cols.items()):

        # The current column counts as examined, which also excludes the trivial
        # self-correlation from the update below
        cols_seen.add(col)

        # Only remove one column of a pair: the one that has not been examined yet
        cols_to_remove.update(x for x in corr_cols if x not in cols_seen)

    # Deterministic ordering (a set has no stable order)
    list_cols_to_remove = [col for col in corr_df.columns if col in cols_to_remove]

    # Remove highly correlated features
    train_df.drop(columns=list_cols_to_remove, inplace=True)
    if test_df is not None:
        test_df.drop(columns=list_cols_to_remove, inplace=True)

    print(f"Number of features after decorrelation: {train_df.shape[1]}")

    return list_cols_to_remove

## Knowledge-domain features

In [None]:
# Column bookkeeping: wilderness/soil indicators and the target are "discrete",
# every other column of the training frame is treated as continuous
all_var = train_df.columns
disc_var = [*wild_var, *soil_var, *label_var]
cont_var = [name for name in all_var if name not in disc_var]

# Empty frames, sharing the indices of the data, that collect the engineered features
test_kdf = pd.DataFrame([], index=test_df.index)
train_kdf = pd.DataFrame([], index=train_df.index)

# # Binary features
# soil_var = ["Soil_Type{}".format(i) for i in range(1, 41) if i != 15]
# wild_var = ["Wilderness_Area{}".format(i) for i in range(1, 5)]

# Merge Wilderness_Area
# s = train_df[wild_var].idxmax(axis=1).str[15:].astype(int) - 1
# train_kdf["Wilderness_Area"] = s
# train_df = train_df.drop(columns=wild_var)

# s = test_df[wild_var].idxmax(axis=1).str[15:].astype(int) - 1
# test_kdf["Wilderness_Area"] = s
# test_df = test_df.drop(columns=wild_var)

# s = train_df[soil_var].idxmax(axis=1).str[9:].astype(int) - 1
# train_kdf["Soil_Type"] = s
# train_df = train_df.drop(columns=soil_var)

# # Add the features on test data

# s = test_df[soil_var].idxmax(axis=1).str[9:].astype(int) - 1
# test_df["Soil_Type"] = s
# test_df = test_df.drop(columns=soil_var)

# Add ratio of distances to hydrology
# train_kdf["Ratio_Distance_To_Hydrology"] = \
# train_df["Vertical_Distance_To_Hydrology"]/train_df["Horizontal_Distance_To_Hydrology"]
# test_kdf["Ratio_Distance_To_Hydrology"] = \
# test_df["Vertical_Distance_To_Hydrology"]/test_df["Horizontal_Distance_To_Hydrology"]

# imp = SimpleImputer(strategy="median")  # there might be missing values for the ratio (0 horizontal distance)

# train_kdf[["Ratio_Distance_To_Hydrology"]] = imp.fit_transform(train_kdf[["Ratio_Distance_To_Hydrology"]])
# test_kdf[["Ratio_Distance_To_Hydrology"]] = imp.transform(test_kdf[["Ratio_Distance_To_Hydrology"]])

# Add Log of Elevation
# train_kdf["Elevation_Log"] = np.log(1+train_df["Elevation"])
# test_kdf["Elevation_Log"] = np.log(1+test_df["Elevation"])

# (source frame, engineered-feature frame) pairs: every feature is built the same way on both
_kdf_frames = ((train_df, train_kdf), (test_df, test_kdf))

# Add Log of the horizontal distances to hydrology, roadways and fire points: log(1 + distance)
_log_sources = (
    "Horizontal_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
)
for _dist_col in _log_sources:
    for _src, _dst in _kdf_frames:
        _dst[_dist_col + "_Log"] = np.log(1+_src[_dist_col])

# Row-wise statistics over the continuous features: maximum, then standard deviation
# (the row-wise minimum was tried as well and is left disabled:)
# train_kdf["Min"] = train_df[cont_var].min(axis=1)
# test_kdf["Min"] = test_df[cont_var].min(axis=1)
for _src, _dst in _kdf_frames:
    _dst["Max"] = _src[cont_var].max(axis=1)
    _dst["Std"] = _src[cont_var].std(axis=1)

# train_kdf["Mean"] = train_df[cont_var].mean(axis=1)
# test_kdf["Mean"] = test_df[cont_var].mean(axis=1)

# Sign of vertical distance
# train_kdf["Vertical_Distance_To_Hydrology_Sign"] = (train_df["Vertical_Distance_To_Hydrology"] > 0).astype(int)
# test_kdf["Vertical_Distance_To_Hydrology_Sign"] = (test_df["Vertical_Distance_To_Hydrology"] > 0).astype(int)

# We have the Aspect variable that is between 0 and 360
# train_kdf["Shifted_Aspect"] = train_df["Aspect"] - 180
# test_kdf["Shifted_Aspect"] = test_df["Aspect"] - 180

# train_df["Shifted_Aspect_Sign"] = (train_kdf["Shifted_Aspect"] > 0).astype(int)
# test_df["Shifted_Aspect_Sign"] = (test_kdf["Shifted_Aspect"] > 0).astype(int)

# Total distance to hydrology: square root of the sum of the squared
# horizontal and vertical distances
h_d = "Horizontal_Distance_To_Hydrology"
v_d = "Vertical_Distance_To_Hydrology"
for _src, _dst in ((train_df, train_kdf), (test_df, test_kdf)):
    _squared_dist = _src[h_d].pow(2) + _src[v_d].pow(2)
    _dst["Distance_To_Hydrology"] = _squared_dist.pow(0.5)

# Make some differences and additions of similar features
# hillshades_var = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

# for i in range(len(hillshades_var)):
#     for j in range(i+1, len(hillshades_var)):

#         var1 = hillshades_var[i]
#         var2 = hillshades_var[j]

#         train_kdf[var1+"_plus_"+var2] = train_df[var1] + train_df[var2]
#         train_kdf[var1+"_minus_"+var2] = np.abs(train_df[var1] - train_df[var2])

#         test_kdf[var1+"_plus_"+var2] = test_df[var1] + test_df[var2]
#         test_kdf[var1+"_minus_"+var2] = np.abs(test_df[var1] - test_df[var2])

h_dist_var = [
    "Horizontal_Distance_To_Hydrology", "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Roadways"
]

# (source frame, engineered-feature frame) pairs processed identically
_kdf_frames = ((train_df, train_kdf), (test_df, test_kdf))

# Sum and difference of every unordered pair of horizontal distances
for _pos, var1 in enumerate(h_dist_var[:-1]):
    for var2 in h_dist_var[_pos+1:]:
        for _src, _dst in _kdf_frames:
            _dst[var1+"_plus_"+var2] = _src[var1] + _src[var2]
            _dst[var1+"_minus_"+var2] = _src[var1] - _src[var2]

# Our EDA has shown that Elevation and Distances to hydrology share a particular relationship:
# each entry is (new column, column subtracted from Elevation, weight of that column)
_elevation_shifts = (
    ("Elevation_Shifted_Horizontal_Distance_To_Hydrology", "Horizontal_Distance_To_Hydrology", 0.2),
    ("Elevation_Shifted_Vertical_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", 1),
    ("Elevation_Shifted_Horizontal_Distance_To_Roadways", "Horizontal_Distance_To_Roadways", 0.02),
)

for _src, _dst in _kdf_frames:

    # Also add the sum of all three distances (the column is named "Mean_..." but holds the sum)
    _dst["Mean_Distance_To_Points_Of_Interest"] = \
        _src["Horizontal_Distance_To_Hydrology"] + _src["Horizontal_Distance_To_Fire_Points"] + \
        _src["Horizontal_Distance_To_Roadways"]

    for _new_col, _shift_col, _weight in _elevation_shifts:
        _dst[_new_col] = _src["Elevation"] - _weight*_src[_shift_col]

# Binning features
# cut_points = [0, 2575, 3100, 8000]
# train_kdf["Elevation_Plateau"] = pd.cut(train_df["Elevation"], cut_points, labels=[0, 1, 2]).astype(int)
# test_kdf["Elevation_Plateau"] = pd.cut(test_df["Elevation"], cut_points, labels=[0, 1, 2]).astype(int)

# Tweaking the binary variables
# train_kdf["Soil_Type12_32"] = train_df["Soil_Type32"] + train_df["Soil_Type12"]
# test_kdf["Soil_Type12_32"] = test_df["Soil_Type32"] + test_df["Soil_Type12"]

# train_kdf["Soil_Type23_22_32_33"] = train_df["Soil_Type23"] + \
#     train_df["Soil_Type22"] + train_df["Soil_Type32"] + train_df["Soil_Type33"]
# test_kdf["Soil_Type23_22_32_33"] = test_df["Soil_Type23"] + \
#     test_df["Soil_Type22"] + test_df["Soil_Type32"] + test_df["Soil_Type33"]

for _src, _dst in ((train_df, train_kdf), (test_df, test_kdf)):

    # Mean hillshade (stored as the plain sum of the three hillshade measurements)
    _hillshade_total = _src["Hillshade_9am"] + _src["Hillshade_Noon"] + _src["Hillshade_3pm"]
    _dst["Mean_Hillshade"] = _hillshade_total

    # Features drawn from hierarchical clustering: product of Aspect and Hillshade_3pm
    _dst["Aspect Hillshade_3pm"] = _src["Aspect"] * _src["Hillshade_3pm"]

# train_kdf["Wilderness_Area1_plus_Soil_Type29"] = train_df["Wilderness_Area1"] + train_df["Soil_Type29"]
# test_kdf["Wilderness_Area1_plus_Soil_Type29"] = test_df["Wilderness_Area1"] + test_df["Soil_Type29"]
# if "Wilderness_Area1_plus_Soil_Type29" not in disc_var:
#     disc_var.append("Wilderness_Area1_plus_Soil_Type29")

# train_kdf["Wilderness_Area1_minus_Soil_Type29"] = train_df["Wilderness_Area1"] - train_df["Soil_Type29"]
# test_kdf["Wilderness_Area1_minus_Soil_Type29"] = test_df["Wilderness_Area1"] - test_df["Soil_Type29"]
# if "Wilderness_Area1_minus_Soil_Type29" not in disc_var:
#     disc_var.append("Wilderness_Area1_minus_Soil_Type29")

# train_kdf["Wilderness_Area4_plus_Soil_Type3"] = train_df["Wilderness_Area4"] + train_df["Soil_Type3"]
# test_kdf["Wilderness_Area4_plus_Soil_Type3"] = test_df["Wilderness_Area4"] + test_df["Soil_Type3"]
# if "Wilderness_Area4_plus_Soil_Type3" not in disc_var:
#     disc_var.append("Wilderness_Area4_plus_Soil_Type3")

# train_kdf["Wilderness_Area4_minus_Soil_Type3"] = train_df["Wilderness_Area4"] - train_df["Soil_Type3"]
# test_kdf["Wilderness_Area4_minus_Soil_Type3"] = test_df["Wilderness_Area4"] - test_df["Soil_Type3"]
# if "Wilderness_Area4_minus_Soil_Type3" not in disc_var:
#     disc_var.append("Wilderness_Area4_minus_Soil_Type3")

# Associations of soil types: ids of the soil types grouped under each family name
ratake = [2, 4]
vanet = [2, 5, 6]
catamount = [10, 11, 13, 26, 31, 32, 33]
leighan = [21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 38]
bullwark = [10, 11]
como = [29, 30]
moran = [38, 39, 40]
other = [3, 14, 15, 16, 19, 20, 34, 35, 37]


def _family_indicator(members):
    """Map every soil type id from 1 to 40 to 1 when it is listed in `members`, else 0."""
    return {soil_id: int(soil_id in members) for soil_id in range(1, 41)}


ratake_dict = _family_indicator(ratake)
vanet_dict = _family_indicator(vanet)
catamount_dict = _family_indicator(catamount)
leighan_dict = _family_indicator(leighan)
bullwark_dict = _family_indicator(bullwark)
como_dict = _family_indicator(como)
moran_dict = _family_indicator(moran)
other_dict = _family_indicator(other)

family_var = [
    "Ratake_Family_Soil_Type", "Vanet_Family_Soil_Type", "Catamount_Family_Soil_Type", "Leighan_Family_Soil_Type",
    "Bullwark_Family_Soil_Type", "Como_Family_Soil_Type", "Moran_Family_Soil_Type", "Other_Family_Soil_Type"
]
# disc_var += family_var

# Lookup tables listed in the same order as family_var
_family_lookups = [
    ratake_dict, vanet_dict, catamount_dict, leighan_dict,
    bullwark_dict, como_dict, moran_dict, other_dict
]

soil_var = [f"Soil_Type{i}" for i in range(1, 41) if i != 15]

# Recover the soil type id of each row from the name of its largest indicator column.
# NOTE(review): idxmax returns the first column for a row whose indicators are all zero,
# so such a row would be read as soil type 1; confirm no such rows exist.
train_soil_types = train_df[soil_var].idxmax(axis=1).str[len("Soil_Type"):].astype(int)

for _family_col, _lookup in zip(family_var, _family_lookups):
    train_kdf[_family_col] = train_soil_types.map(_lookup)

test_soil_types = test_df[soil_var].idxmax(axis=1).str[len("Soil_Type"):].astype(int)

for _family_col, _lookup in zip(family_var, _family_lookups):
    test_kdf[_family_col] = test_soil_types.map(_lookup)

# Add rock soil types: soil type ids grouped into stony / rubly / other
soil_var = [f"Soil_Type{i}" for i in range(1, 41) if i != 15]

stony_soil_types = [1, 2, 6, 9, 12, 18, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40]
rubly_soil_types = [3, 4, 5, 10, 11, 13]
other_soil_types = [7, 8, 14, 15, 16, 17, 19, 20, 21, 22]

# id -> 0/1 membership tables over the soil type ids 1..40
stony_dict = {soil_id: int(soil_id in stony_soil_types) for soil_id in range(1, 41)}
rubly_dict = {soil_id: int(soil_id in rubly_soil_types) for soil_id in range(1, 41)}
other_dict = {soil_id: int(soil_id in other_soil_types) for soil_id in range(1, 41)}

group_var = ["Stony_Soil_Type", "Rubly_Soil_Type"]
# disc_var += group_var

# Soil type id of each row, read from the name of its largest indicator column
train_soil_types = train_df[soil_var].idxmax(axis=1).str[len("Soil_Type"):].astype(int)
test_soil_types = test_df[soil_var].idxmax(axis=1).str[len("Soil_Type"):].astype(int)

# Only the stony and rubly indicators are created; the "other" group is left disabled:
# train_kdf["Other_Soil_Type"] = train_soil_types.map(other_dict)
# test_kdf["Other_Soil_Type"] = test_soil_types.map(other_dict)
for _group_col, _lookup in zip(group_var, (stony_dict, rubly_dict)):
    train_kdf[_group_col] = train_soil_types.map(_lookup)
    test_kdf[_group_col] = test_soil_types.map(_lookup)

# Potential use of engineered variables in polynomial features
# (a shorter selection that was also tried:)
# kdf_pw = ["Horizontal_Distance_To_Roadways_Log",
#           "Elevation_Shifted_Vertical_Distance_To_Hydrology"]
kdf_pw = [
    "Horizontal_Distance_To_Roadways_Log", "Horizontal_Distance_To_Fire_Points_Log",
    "Elevation_Shifted_Vertical_Distance_To_Hydrology", "Elevation_Shifted_Horizontal_Distance_To_Hydrology"
]

## Polynomial features

In [None]:
# Create the sklearn handler: all degree-2 products and squares, without the constant column
pf = PolynomialFeatures(degree=2, include_bias=False)

# Select base features
pw_var = ["Horizontal_Distance_To_Roadways", "Elevation", "Horizontal_Distance_To_Fire_Points", "Wilderness_Area2"]

train_pw = train_df[pw_var].copy()
test_pw = test_df[pw_var].copy()

# Select knowledge domain features
if merge_kdf and kdf_pw:
    pw_var += kdf_pw
    train_pw.loc[:, kdf_pw] = train_kdf[kdf_pw]
    test_pw.loc[:, kdf_pw] = test_kdf[kdf_pw]

# Train polynomial features (fitted on the training frame only)
pf.fit(train_pw)

# Transform the features
train_pw = pf.transform(train_pw)
test_pw = pf.transform(test_pw)

print("PolynomialFeatures shape: ", train_pw.shape)

# Names of the generated columns. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); the old method is kept as a fallback for old versions.
if hasattr(pf, "get_feature_names_out"):
    pw_feature_names = list(pf.get_feature_names_out(pw_var))
else:
    pw_feature_names = pf.get_feature_names(pw_var)

# Re-create the training DataFrame
train_pw = pd.DataFrame(train_pw, columns=pw_feature_names, index=train_df.index)

# Re-create the test DataFrame
test_pw = pd.DataFrame(test_pw, columns=pw_feature_names, index=test_df.index)

# Create polynomial grown datasets: keep only the columns that are neither base features
# nor knowledge-domain features, i.e. the newly created terms
known_var = set(kdf_pw) | set(train_df.columns)
created_pw_var = [x for x in train_pw.columns if x not in known_var]

## Merge the features

In [None]:
# Merging base and created features
# Every merge below is a left join on the row index of the base frame
index_join = dict(left_index=True, right_index=True, how="left")

if merge_pw:

    if merge_kdf:
        # Add domain knowledge features
        train_df_with_kdf = pd.merge(train_df, train_kdf, **index_join)
        test_df_with_kdf = pd.merge(test_df, test_kdf, **index_join)
        train_base, test_base = train_df_with_kdf, test_df_with_kdf
    else:
        train_base, test_base = train_df, test_df

    # Add polynomial features
    train_df_final = pd.merge(train_base, train_pw[created_pw_var], **index_join)
    test_df_final = pd.merge(test_base, test_pw[created_pw_var], **index_join)

elif merge_kdf:

    # Add domain knowledge features
    train_df_final = pd.merge(train_df, train_kdf, **index_join)
    test_df_final = pd.merge(test_df, test_kdf, **index_join)

else:

    # Neither switch is on: work on copies of the base frames
    train_df_final = train_df.copy()
    test_df_final = test_df.copy()

## Drop specific features

In [None]:
# Columns to discard by hand. The active list is empty, so the loop below drops nothing;
# a previously used selection is kept here for reference:
# drop_var = ["Soil_Type7", "Soil_Type9", "Soil_Type25"]
drop_var = []

for var in drop_var:

    # Skip names that are not present in the final training frame
    if var not in train_df_final.columns:
        continue

    train_df_final = train_df_final.drop(columns=[var])
    test_df_final = test_df_final.drop(columns=[var])

## Decorrelation

In [None]:
# Delete correlated features
# With threshold = 1 the call below changes nothing: drop_corr_feat returns immediately
# when corr_threshold >= 1. Use a value below 1 to actually prune correlated columns.
threshold = 1

# Columns are dropped in place from both frames; corr_cols stores the function's return value
corr_cols = drop_corr_feat(train_df_final, test_df=test_df_final, corr_threshold=threshold)

## Scaling

In [None]:
# Create the datasets: feature frames without the target column, targets as flat arrays
x_train_decorr = train_df_final.drop(columns=label_var)
y_train = train_df_final[label_var].to_numpy().flatten()
x_test_decorr = test_df_final.drop(columns=label_var)
y_test = test_df_final[label_var].to_numpy().flatten()

# Both counts exclude the target column
print("Number of features: ", train_df_final.shape[1]-1)
print("Number of features (decorrelated): ", x_train_decorr.shape[1])

# Already selected features (previous runs)
# NOTE: select_var_idx is only referenced by the commented-out line below; every column
# is currently kept.
select_var_idx = [
    1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 14, 15, 17, 21, 22, 23, 26, 28, 32, 36, 41, 45, 47, 49, 50, 51, 52, 54, 56, 58, 59, 60,
    61, 62, 63, 64, 65, 66, 67, 69, 72, 73, 75, 78, 80, 82, 83, 84, 90, 92, 93, 96, 97, 98, 100, 101, 103, 105, 106, 108
]
# selected_var = np.array(list(train_df_decorr.columns))[select_var_idx]
selected_var = x_train_decorr.columns

x_train = x_train_decorr[selected_var]
x_test = x_test_decorr[selected_var]

# Scale data
# cont_var is recomputed here but only used by the commented-out alternative; the active
# code standardises every column, the binary indicators included.
cont_var = [x for x in list(x_train.columns) if x not in disc_var]
sc = StandardScaler()
# sc = RobustScaler()
# x_train[cont_var] = sc.fit_transform(x_train[cont_var])
# x_test[cont_var] = sc.transform(x_test[cont_var])
# The scaler is fitted on the training features and then applied unchanged to the test features
x_train[x_train.columns] = sc.fit_transform(x_train[x_train.columns])
x_test[x_test.columns] = sc.transform(x_test[x_test.columns])

## PCA

In [None]:
# n_components handed to PCA; any value <= 0 skips the projection entirely
pca_ratio = -1

if pca_ratio > 0:

    # Fit the PCA on the training features only
    pca = PCA(n_components=pca_ratio, random_state=rs).fit(x_train.to_numpy())

    # Names of the projected columns: PCA_1 ... PCA_k
    pca_col = [f"PCA_{k}" for k in range(1, pca.n_components_+1)]

    def _project(frame):
        """Project `frame` on the fitted components, keeping its row index."""
        return pd.DataFrame(pca.transform(frame.to_numpy()), columns=pca_col, index=frame.index)

    # Transform the data (new columns)
    x_train = _project(x_train)
    x_test = _project(x_test)

    print("Number of features (after PCA): ", x_train.shape[1])

## Train test split

In [None]:
# Train test split (cross val)
# Holds out 20% of the training data for the "Train / validation" cells below; the split is
# shuffled and seeded with `seed`. The "Submission" cells fit on the full x_train instead.
x_train_tts, x_eval_tts, y_train_tts, y_eval_tts = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True
)

# Try it!

### ExtraTreesClassifier

In [None]:
# Train / validation
# max_features="sqrt" is the explicit spelling of what "auto" meant for classifiers;
# "auto" itself is no longer accepted by scikit-learn >= 1.3.
etc = ExtraTreesClassifier(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=seed, n_jobs=-1, verbose=0
)

# Fit on the training part of the split and score on the held-out part
etc.fit(x_train_tts, y_train_tts)
y_pred_tts = etc.predict(x_eval_tts)

print("Accuracy on selected features: ", accuracy_score(y_eval_tts, y_pred_tts))

In [None]:
# Submission
# max_features="sqrt" is the explicit spelling of what "auto" meant for classifiers;
# "auto" itself is no longer accepted by scikit-learn >= 1.3.
etc = ExtraTreesClassifier(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=seed, n_jobs=-1, verbose=0
)

# Fit on the whole training set and score on the test set
etc.fit(x_train, y_train)
y_pred = etc.predict(x_test)

print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

### RandomForestClassifier

In [None]:
# Train / validation: 100-tree random forest fitted on the training part of the split
rfc_params = dict(n_estimators=100, random_state=seed, n_jobs=-1, verbose=0)
rfc = RandomForestClassifier(**rfc_params).fit(x_train_tts, y_train_tts)

# Score on the held-out part of the split
y_pred_tts = rfc.predict(x_eval_tts)
holdout_accuracy = accuracy_score(y_eval_tts, y_pred_tts)

print("Accuracy on train-test-split: ", holdout_accuracy)

In [None]:
# Submission: the same 100-tree random forest, fitted on the whole training set
rfc_params = dict(n_estimators=100, random_state=seed, n_jobs=-1, verbose=0)
rfc = RandomForestClassifier(**rfc_params).fit(x_train, y_train)

# Score on the test set
y_pred = rfc.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test set: ", test_accuracy)

### LightGBM

In [None]:
def evalacc(y_true, y_pred):
    """Custom multi-class accuracy metric, in the form LightGBM expects for `eval_metric`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class indices.
    y_pred : array-like
        Per-class scores, either as a 2-D array of shape (n_samples, n_classes) or as a
        flat 1-D array of length n_classes * n_samples grouped class by class (all the
        scores of class 0 first, then class 1, ...).

    Returns
    -------
    tuple
        ("Accuracy", accuracy, True): metric name, value, and the flag telling LightGBM
        that higher values are better.
    """
    y_true = np.asarray(y_true)
    scores = np.asarray(y_pred)

    if scores.ndim == 2:
        # One row per sample: the predicted class is the best-scoring column
        preds = scores.argmax(axis=1)
    else:
        # Flat class-major layout: one row per class after reshaping. The number of classes
        # is derived from the array sizes, so a class absent from y_true cannot skew it.
        preds = scores.reshape(-1, len(y_true)).argmax(axis=0)

    return "Accuracy", float(np.mean(preds == y_true)), True

# Train / validation
lgbm = LGBMClassifier(
    n_estimators=100, num_leaves=73, min_split_gain=1.22e-5, min_child_weight=3.82e-5,
    min_child_samples=12, subsample=0.90, subsample_freq=2, reg_alpha=1.14e-6, reg_lambda=5.37e-5,
    random_state=rs, n_jobs=-1
)

# Print the evaluation metrics every 10 boosting rounds. The `verbose` argument of fit() was
# removed in LightGBM 4 in favour of the log_evaluation callback; releases that do not provide
# the callback still get the former argument.
if hasattr(lightgbm, "log_evaluation"):
    lgbm_log_kwargs = {"callbacks": [lightgbm.log_evaluation(period=10)]}
else:
    lgbm_log_kwargs = {"verbose": 10}

# Both the held-out and the training part are monitored, with the log loss and the custom
# accuracy metric
lgbm.fit(
    x_train_tts, y_train_tts,
    eval_set=[(x_eval_tts, y_eval_tts), (x_train_tts, y_train_tts)],
    eval_names=["Validation", "Training"],
    eval_metric=["logloss", evalacc],
    **lgbm_log_kwargs
)
y_pred_tts = lgbm.predict(x_eval_tts)

print("Accuracy on selected features: ", accuracy_score(y_eval_tts, y_pred_tts))

In [None]:
# Plot the "Accuracy" metric recorded on the evaluation sets while fitting `lgbm`
lightgbm.plot_metric(lgbm, metric="Accuracy", figsize=(16, 8), title=None)
# plt.savefig("report/figures/lgbmeval.png", facecolor="white")
plt.show()

In [None]:
# Submission: LightGBM fitted on the whole training set, with 177 boosting rounds
lgbm_params = dict(
    n_estimators=177, num_leaves=73, min_split_gain=1.22e-5, min_child_weight=3.82e-5,
    min_child_samples=12, subsample=0.90, subsample_freq=2, reg_alpha=1.14e-6, reg_lambda=5.37e-5,
    random_state=rs, n_jobs=-1
)
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(x_train, y_train)

# Score on the test set
y_pred = lgbm.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test set: ", test_accuracy)

### Stacking

In [None]:
# Training / validation
# Base learners of the stack. max_features="sqrt" is the explicit spelling of what "auto"
# meant for classifiers; "auto" itself is no longer accepted by scikit-learn >= 1.3.
etc = ExtraTreesClassifier(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=rs, n_jobs=-1, verbose=0
)
rfc = RandomForestClassifier(
    n_estimators=80, criterion="gini", min_samples_split=3, ccp_alpha=1e-5,
    random_state=rs, n_jobs=-1, verbose=0
)
lgbm = LGBMClassifier(
    n_estimators=100, num_leaves=70, min_split_gain=4e-5, min_child_weight=1.5e-5, min_child_samples=6,
    subsample=0.975, subsample_freq=6, reg_alpha=5.5e-4, reg_lambda=4.5e-4, random_state=rs, n_jobs=-1
)

est = [("rfc", rfc), ("extra", etc), ("lgbm", lgbm)]

# Meta-learner. multi_class is left at its default ("auto" was already the default); passing
# it explicitly is deprecated in recent scikit-learn releases.
final_est = LogisticRegression(solver="newton-cg")

# The meta-learner is trained on 5-fold out-of-fold class probabilities of the base learners,
# together with the original features (passthrough=True)
stacking = StackingClassifier(
    est, final_estimator=final_est, cv=5, n_jobs=-1, verbose=1, stack_method="predict_proba", passthrough=True
)

stacking.fit(x_train_tts, y_train_tts)
y_pred_tts = stacking.predict(x_eval_tts)

print("Accuracy on selected features: ", accuracy_score(y_eval_tts, y_pred_tts))

In [None]:
# Submission
# Base learners of the stack. max_features="sqrt" is the explicit spelling of what "auto"
# meant for classifiers; "auto" itself is no longer accepted by scikit-learn >= 1.3.
etc = ExtraTreesClassifier(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=rs, n_jobs=-1, verbose=0
)
rfc = RandomForestClassifier(
    n_estimators=80, criterion="gini", min_samples_split=3, ccp_alpha=1e-5,
    random_state=rs, n_jobs=-1, verbose=0
)
lgbm = LGBMClassifier(
    n_estimators=100, num_leaves=70, min_split_gain=4e-5, min_child_weight=1.5e-5, min_child_samples=6,
    subsample=0.975, subsample_freq=6, reg_alpha=5.5e-4, reg_lambda=4.5e-4, random_state=rs, n_jobs=-1
)

est = [("rfc", rfc), ("extra", etc), ("lgbm", lgbm)]

# Meta-learner. multi_class is left at its default ("auto" was already the default); passing
# it explicitly is deprecated in recent scikit-learn releases.
final_est = LogisticRegression(solver="newton-cg")

# stack_method is spelled out to match the validation cell; for these base learners it is
# what the default "auto" resolves to, since all of them expose predict_proba.
stacking = StackingClassifier(
    est, final_estimator=final_est, cv=5, n_jobs=-1, verbose=2, stack_method="predict_proba", passthrough=True
)

# Fit on the whole training set and score on the test set
stacking.fit(x_train, y_train)
y_pred = stacking.predict(x_test)

print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

### SVC

In [None]:
# Training / validation
# RBF-kernel support vector classifier with C=1000 and gamma="scale"; verbose=True makes
# the fit print its progress.
svc = SVC(
    kernel="rbf", C=1000, gamma="scale", shrinking=True, decision_function_shape="ovr", break_ties=False,
    random_state=seed, verbose=True
)

# Fit on the training part of the split and score on the held-out part
svc.fit(x_train_tts, y_train_tts)
y_pred_tts = svc.predict(x_eval_tts)

print("Accuracy on selected features: ", accuracy_score(y_eval_tts, y_pred_tts))

### Catboost

In [None]:
# Training / validation: CatBoost with up to 5000 iterations, monitored with accuracy
cat_params = dict(iterations=5000, eval_metric="Accuracy", random_state=seed)
cat = CatBoostClassifier(**cat_params)

# The held-out part of the split is the evaluation set; progress is printed every 50 iterations
holdout_set = (x_eval_tts, y_eval_tts)
cat.fit(x_train_tts, y_train_tts, eval_set=holdout_set, verbose_eval=50)

y_pred_tts = cat.predict(x_eval_tts)
holdout_accuracy = accuracy_score(y_eval_tts, y_pred_tts)

print("Accuracy on selected features: ", holdout_accuracy)

In [None]:
# Submission
cat = CatBoostClassifier(iterations=5000, eval_metric="Accuracy", random_state=seed)

# NOTE(review): the test set is passed as eval_set. If CatBoost selects the best iteration
# from the evaluation set, the model is tuned on the data it is then scored on; confirm
# whether use_best_model should be disabled here.
cat.fit(
    x_train, y_train,
    eval_set=(x_test, y_test), verbose_eval=50
)
y_pred = cat.predict(x_test)

# This score is computed on the test set, so it is labelled like the other submission cells
print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

## Sample weights

A look at the submission and/or the data set readily shows that the data set is highly imbalanced. We can first try assigning class weights during training.

## Undersampling / Oversampling

Another widely used technique to cope with imbalanced data sets is undersampling, oversampling, or a mix of both.

### SMOTE

In [None]:
n_smotes = 3

# ExtraTrees settings shared by every fit of this cell. max_features="sqrt" is the explicit
# spelling of what "auto" meant for classifiers; "auto" itself is no longer accepted by
# scikit-learn >= 1.3.
etc_params = dict(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=seed, n_jobs=-1, verbose=0
)

# SMOTE Oversampling
x_train_smote = x_train.copy()
y_train_smote = y_train.copy()

for k in range(n_smotes):

    # Fit an ExtraTreesClassifier on the current (possibly already resampled) training set
    etc = ExtraTreesClassifier(**etc_params)
    etc.fit(x_train_smote, y_train_smote)

    # Estimate repartition of the labels in the test set from the predictions
    y_pred = etc.predict(x_test)

    # Assumes each of the classes 1..7 is predicted at least once, so that position i-1 of
    # the counts belongs to class i; otherwise the lookups below raise an IndexError.
    _, repart_pred = np.unique(y_pred, return_counts=True)

    repart_pred = repart_pred.astype(float)/len(y_test)
    min_class = np.argmin(repart_pred)+1

    # Target size per class: 2160 samples for the rarest predicted class, the other classes
    # scaled up in proportion to their predicted share
    repart_dict = {i: int(2160*repart_pred[i-1]/repart_pred[min_class-1]) for i in range(1, 8)}

    print("Estimated repartition in test set after {} SMOTE: {}.".format(k+1, repart_dict))

    # SMOTE (always resamples the original training set, not the previous resample)
    smote = SMOTE(sampling_strategy=repart_dict, random_state=rs, n_jobs=-1)

    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Submission
    etc = ExtraTreesClassifier(**etc_params)
    etc.fit(x_train_smote, y_train_smote)
    y_pred = etc.predict(x_test)
    print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

### Clustering + SMOTE

In [None]:
n_op = 2

# ExtraTrees settings shared by every fit of this cell. max_features="sqrt" is the explicit
# spelling of what "auto" meant for classifiers; "auto" itself is no longer accepted by
# scikit-learn >= 1.3.
etc_params = dict(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=seed, n_jobs=-1, verbose=0
)

# Undersampling/Oversampling
x_train_clust = x_train.copy()
y_train_clust = y_train.copy()

for k in range(n_op):

    # Fit an ExtraTreesClassifier on the current (possibly already resampled) training set
    etc = ExtraTreesClassifier(**etc_params)
    etc.fit(x_train_clust, y_train_clust)

    # Estimate repartition of the labels in the test set from the predictions
    y_pred = etc.predict(x_test)

    # Assumes each of the classes 1..7 is predicted at least once, so that position i-1 of
    # the counts belongs to class i; otherwise the lookups below raise an IndexError.
    _, repart_pred = np.unique(y_pred, return_counts=True)

    repart_pred = repart_pred.astype(float)/len(y_test)
    min_class = np.argmin(repart_pred)+1
    maj_class = np.argmax(repart_pred)+1

    # Desired total size of the resampled training set
    n_souhaite = 15120

    # Target size per class, proportional to its predicted share of the test set
    repart_dict = {i: int(repart_pred[i-1]*n_souhaite) for i in range(1, 8)}

    # Undersampling targets: the same sizes, capped at 2160 samples per class
    repart_clust = {i: min(2160, repart_dict[i]) for i in range(1, 8)}

    print("Estimated repartition in test set after {} undersampling: {}.".format(k+1, repart_clust))
    print("Estimated repartition in test set after {} oversampling: {}.".format(k+1, repart_dict))

    # ClusteringCentroids + SMOTE: first shrink the original training set to the capped sizes,
    # then oversample the result up to the full targets
    kmeans = KMeans(random_state=rs)
    clust = ClusterCentroids(sampling_strategy=repart_clust, random_state=rs, estimator=kmeans)

    x_train_clust, y_train_clust = clust.fit_resample(x_train, y_train)

    smote = SMOTE(sampling_strategy=repart_dict, random_state=rs, n_jobs=-1)

    x_train_clust, y_train_clust = smote.fit_resample(x_train_clust, y_train_clust)

    # Submission
    etc = ExtraTreesClassifier(**etc_params)
    etc.fit(x_train_clust, y_train_clust)
    y_pred = etc.predict(x_test)
    print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

### Manual sampling

In [None]:
# Manual resampling
x_train_hand = x_train.copy()
y_train_hand = y_train.copy()

# Hand-picked target size for each class, and the same sizes capped at 2160 samples per
# class for the undersampling step
repart_dict = {1: 5500, 2: 7000, 3: 1000, 4: 50, 5: 450, 6: 500, 7: 650}
repart_clust = {i: min(2160, repart_dict[i]) for i in range(1, 8)}

print("Estimated repartition in training set after operations: {}.".format(repart_dict))

# ClusteringCentroids + SMOTE: first shrink the training set to the capped sizes, then
# oversample the result up to the full targets
kmeans = KMeans(random_state=rs)
clust = ClusterCentroids(sampling_strategy=repart_clust, random_state=rs, estimator=kmeans)

x_train_hand, y_train_hand = clust.fit_resample(x_train, y_train)

smote = SMOTE(sampling_strategy=repart_dict, random_state=rs, n_jobs=-1)

x_train_hand, y_train_hand = smote.fit_resample(x_train_hand, y_train_hand)

# Submission
# max_features="sqrt" is the explicit spelling of what "auto" meant for classifiers;
# "auto" itself is no longer accepted by scikit-learn >= 1.3.
etc = ExtraTreesClassifier(
    n_estimators=300, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0., max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=1e-7,
    bootstrap=False, ccp_alpha=1e-6, random_state=seed, n_jobs=-1, verbose=0
)
etc.fit(x_train_hand, y_train_hand)
y_pred = etc.predict(x_test)
print("Accuracy on test set: ", accuracy_score(y_test, y_pred))