# Importing Libraries and Loading datasets

In [None]:
import numpy as np
import pandas as pd

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# XGBoost
from xgboost import XGBClassifier

# CatBoost
from catboost import CatBoostClassifier

# Cross-Validation
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Read the train and test splits from the relative `../input` directory.
train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')

In [None]:
# This code snippet is taken from https://www.kaggle.com/desalegngeb/december-2021-tps-eda-models
# Originally https://www.kaggle.com/c/tabular-playground-series-oct-2021/discussion/275854
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Integer columns are converted to the narrowest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum (limits inclusive).
    Float columns are converted to float32 when their minimum and maximum fit
    the float32 range, otherwise to float64.

    Only columns whose dtype is listed in ``numerics`` are touched; every
    other column is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percent reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the narrowest type first. The comparison is inclusive so a
            # column that reaches exactly a type's limit (e.g. -128 or 127)
            # still gets that type instead of the next wider one.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # The check is on range only: float32 stores fewer significant
            # digits than float64, so values may be rounded.
            limits = np.finfo(np.float32)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast both frames. `verbose` is left at its default (True), so each call
# prints its own memory-usage summary.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Explore Data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Show every column name of the training frame as a plain Python list.
print("Columns: \n{0}".format(train.columns.tolist()))

# Basic Data Check

In [None]:
# Report (rows, columns) for both splits.
print('Train data shape:', train.shape)
print('Test data shape:', test.shape)

## Missing values

In [None]:
# Count missing values per column and show only the columns that have any.
# (`isna().any().sum()` would collapse everything into one scalar, which makes
# the `> 0` filter below meaningless.)
missing_values_train = train.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

## Duplicates

In [None]:
# Compare rows on every column except the first one, which holds the row id.
# The id is presumably unique per row (verify against the data), so including
# it would hide rows that are otherwise identical.
duplicates_train = train.iloc[:, 1:].duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates_train))

duplicates_test = test.iloc[:, 1:].duplicated().sum()
print('Duplicates in test data: {0}'.format(duplicates_test))

# Features

## Categorical Features

In [None]:
# Columns from position 11 up to (but excluding) the last one are treated as
# the categorical features.
# Optional check of per-column cardinality:
#variables = train.nunique().sort_values(ascending=True)
#print('Categorical variables in train data: \n{0}'.format(variables))
categorical_features = train.columns[11:-1]
print("Categorical Columns: \n{0}".format(categorical_features.tolist()))

## Numerical Features

In [None]:
# Columns at positions 1..10 are treated as the numerical features.
numerical_features = train.columns[1:11]
print("Numerical Columns: \n{0}".format(list(numerical_features)))
train[numerical_features].describe()

## Target Distribution

In [None]:
# Bar chart of how many training rows fall into each Cover_Type class.
plt.figure(figsize=(10, 6))
plt.title('Target distribution')
sns.countplot(data=train, x='Cover_Type')

## Dropping rows and columns

In [None]:
# Index labels of the training rows whose target class is 5.
cType5 = train.loc[train['Cover_Type'] == 5].index
print("Number of rows with Cover_Type = 5: {0}".format(cType5.size))

In [None]:
# Inspect the distinct values of Soil_Type7 and Soil_Type15 in both splits
# before deciding whether the columns carry any information.
print("Unique values in Soil_Type7 column train data: {0}".format(train['Soil_Type7'].unique()))
print("Unique values in Soil_Type15 column train data: {0}".format(train['Soil_Type15'].unique()))

print("Unique values in Soil_Type7 column test data: {0}".format(test['Soil_Type7'].unique()))
print("Unique values in Soil_Type15 column test data: {0}".format(test['Soil_Type15'].unique()))

In [None]:
# Dropping the rows with Cover_Type = 5,
# causes problems during kfold (least populated class).
# errors='ignore' keeps this cell re-runnable: on a second run the labels and
# columns are already gone and would otherwise raise a KeyError.
train.drop(cType5, axis=0, inplace=True, errors='ignore')

# Dropping columns Soil_Type7 and Soil_Type15, they are zero
train.drop(['Soil_Type7', 'Soil_Type15'], axis=1, inplace=True, errors='ignore')
test.drop(['Soil_Type7', 'Soil_Type15'], axis=1, inplace=True, errors='ignore')

## Encoding labels

In [None]:
# Replace the Cover_Type labels with LabelEncoder's integer codes. The fitted
# encoder is kept because the submission cell uses it to map predictions back
# to the original labels.
# NOTE(review): the column is overwritten, so re-running this cell alone refits
# the encoder on already-encoded values.
encoder = LabelEncoder()
train["Cover_Type"] = encoder.fit_transform(train["Cover_Type"])

# Modelling

| Version | Selected Model | Parameters                                             | Accuracy score          | Public Score | Notes                              |
| ------- | -----------    | ------------------------------------------------------ | ----------------------- | -----------  | ---------------------------------- |
|    2    | XGBClassifier  | learning_rate=0.01, gamma=0.0, max_depth=5             | 0.914338                | 0.91626      |                                    |
|    3    | XGBClassifier  | learning_rate=0.5, gamma=1.0, max_depth=8              | 0.960802                | 0.95400      |                                    |
|    4    | XGBClassifier  | learning_rate=0.3, gamma=1.6, max_depth=10             | 0.961175                | 0.95392      |                                    |
|    6    | XGBClassifier  | learning_rate=0.5, gamma=1.0, max_depth=8              | 0.960802                | 0.95426      | Cross validation with Version 3    |
|    9    | XGBClassifier  | reg_alpha=0.0, reg_lambda=0.1, n_estimators=100        | 0.960901                | 0.95445      | Additional parameters to Version 3 |
|    10   | XGBClassifier  | reg_alpha=5, reg_lambda=10, n_estimators=150           | 0.965863                | 0.95381      | Using GridSearchCV with Version 3  |

In [None]:
# Get train data without the target and ids
# (drops the first column and the last column by position)
X = train.iloc[:, 1:-1].copy()
# Get the target
y = train.Cover_Type.copy()
# Get the test data without ids
# (positional slice of `test`; unlike X and y it is not copied)
test_X = test.iloc[:, 1:]

# It takes time to handle all of the data.
# So, I am using a smaller portion of the data
# while debugging/testing.
#X = train.iloc[0:50, 1:-1].copy()
#y = train.Cover_Type[0:50].copy()
#test_X = test.iloc[0:50, 1:]

In [None]:
# To store models created
# (filled by evaluate_model, keyed by model name; read by the submission cell)
best_models = {}

# Split data
# No test_size/train_size is given, so scikit-learn's default proportions
# apply; random_state=1 makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

def print_best_parameters(hyperparameters, best_parameters):
    """Print the selected value of every searched hyperparameter on one line.

    Parameters
    ----------
    hyperparameters : iterable of keys (e.g. the ``param_grid`` dict)
        Names of the hyperparameters to report.
    best_parameters : mapping
        Must contain an entry for every key in ``hyperparameters``.

    Nothing is printed when ``hyperparameters`` is empty.
    """
    if not hyperparameters:
        return
    pairs = [str(key) + ": " + str(best_parameters[key]) for key in hyperparameters]
    print("Best parameters: " + ", ".join(pairs))

def get_best_model(estimator, param_grid, fit_params):
    """Grid-search ``estimator`` on the training split and return the search.

    Uses a seeded RepeatedStratifiedKFold (5 splits, 3 repeats) as the
    cross-validation scheme and fits on the module-level ``train_X`` /
    ``train_y``.

    Parameters
    ----------
    estimator : estimator object
        Model to tune.
    param_grid : dict
        Grid handed to GridSearchCV; also decides which values are printed.
    fit_params : dict
        Extra keyword arguments forwarded to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object (``fit`` returns the object itself).
    """
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=folds)
    fitted_search = search.fit(train_X, train_y, **fit_params)
    print_best_parameters(param_grid, fitted_search.best_estimator_.get_params())
    return fitted_search

def evaluate_model(model, name):
    """Print accuracy for ``model`` and register it under ``name``.

    The first line keeps the original report (accuracy on the training split
    the model was fitted on). The second line adds accuracy on the validation
    split, which is the better indicator of generalization. Note that the
    notebook's ``fit_params`` also pass this validation split as ``eval_set``
    for early stopping, so it is not a completely untouched hold-out.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    name : str
        Key under which the model is stored in ``best_models``.
    """
    print("Accuracy score:", accuracy_score(train_y, model.predict(train_X)))
    print("Validation accuracy score:", accuracy_score(val_y, model.predict(val_X)))
    best_models[name] = model

# [XGBClassifier](https://xgboost.readthedocs.io/en/stable/parameter.html)

* **eta [default=0.3, alias: learning_rate]**  
    * Step size shrinkage used in each update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
    * range: [0,1]


* **gamma [default=0, alias: min_split_loss]**  
    * Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
    * range: [0,∞]


* **max_depth [default=6]**  
    * Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 is only accepted in lossguide growing policy when tree_method is set as hist or gpu_hist and it indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree.
    * range: [0,∞] (0 is only accepted in lossguide growing policy when tree_method is set as hist or gpu_hist)


* **lambda [default=1, alias: reg_lambda]**  
    L2 regularization term on weights. Increasing this value will make the model more conservative.

* **alpha [default=0, alias: reg_alpha]**  
    L1 regularization term on weights. Increasing this value will make the model more conservative.

In [None]:
# Candidate search space for XGBClassifier.
# NOTE: `param_grid` is reassigned by the next cell, so when the notebook runs
# top to bottom this full grid is never passed to a search.
param_grid = {
    'gamma'         : [0.4, 0.8, 1.6, 3.2, 6.4],
    'learning_rate' : [0.1, 0.2, 0.3, 0.5, 1],
    'max_depth'     : [8, 9, 10, 11, 12],
    'reg_alpha'     : [0, 0.1, 0.2, 0.5, 1, 2, 5, 10],
    'reg_lambda'    : [0, 0.1, 0.2, 0.5, 1, 2, 5, 10],
    'n_estimators'  : [50, 100, 150]
}

In [None]:
# https://towardsdatascience.com/binary-classification-xgboost-hyperparameter-tuning-scenarios-by-non-exhaustive-grid-search-and-c261f4ce098d
# Best parameters found so far.
# Every list holds a single value, so the grid describes exactly one
# configuration.
param_grid = {
    'learning_rate' : [0.5],
    'gamma'         : [1.0],
    'max_depth'     : [8],
    'reg_alpha'     : [5],
    'reg_lambda'    : [10],
    'n_estimators'  : [150]
}
# Keyword arguments that get_best_model forwards to GridSearchCV.fit.
# The validation split is used as the evaluation set for early stopping.
fit_params = {
    'verbose'               : False,
    'early_stopping_rounds' : 40,
    'eval_metric'           : 'mlogloss',
    'eval_set'              : [(val_X, val_y)]
}
# The XGBoost search is currently disabled; the CatBoost cell further down
# redefines param_grid and fit_params before running its own search.
#estimator = XGBClassifier(seed=1, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False)
#best_model_xgbc = get_best_model(estimator, param_grid, fit_params)

In [None]:
#evaluate_model(best_model_xgbc.best_estimator_, 'XGBClassifier')

# [CatBoostClassifier](https://catboost.ai/en/docs/references/training-parameters/)

* **depth, Alias: max_depth**  
Depth of the tree.

* **iterations, Aliases: num_boost_round, n_estimators, num_trees**  
The maximum number of trees that can be built when solving machine learning problems.


* **learning_rate, Alias: eta**  
The learning rate.
Used for reducing the gradient step.


* **l2_leaf_reg, Alias: reg_lambda**  
Coefficient at the L2 regularization term of the cost function.

* **border_count, Alias: max_bin**  
The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively.

In [None]:
# Candidate search space for CatBoostClassifier.
# NOTE: `param_grid` is reassigned by the next cell, so when the notebook runs
# top to bottom this full grid is never passed to a search.
param_grid = {
    'depth'            : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'iterations'       : [100, 250, 500, 1000],
    'learning_rate'    : [0.001, 0.01, 0.03, 0.1, 0.2, 0.3], 
    'l2_leaf_reg'      : [1, 3, 5, 10, 100],
    'border_count'     : [5, 10, 20, 32, 50, 100, 200]
}

In [None]:
# https://effectiveml.com/using-grid-search-to-optimise-catboost-parameters.html
# https://catboost.ai/en/docs/concepts/parameter-tuning
# All grid entries are commented out, so the grid is an empty dict: the search
# has nothing to vary and print_best_parameters prints nothing for it.
param_grid = {
#    'depth'            : [6, 8, 10],
#    'iterations'       : [100, 250, 500],
#    'learning_rate'    : [0.01, 0.1]
}
# Keyword arguments that get_best_model forwards to GridSearchCV.fit.
# The validation split is used as the evaluation set for early stopping.
fit_params = {
    'verbose'               : False,
    'early_stopping_rounds' : 40,
    'eval_set'              : [(val_X, val_y)]
}
# Seeded multiclass CatBoost model configured to train on GPU device 0.
estimator = CatBoostClassifier(random_seed=1, objective='MultiClass', task_type='GPU', devices='0')
best_model_cat = get_best_model(estimator, param_grid, fit_params)

In [None]:
# Report accuracy for the best CatBoost estimator and register it in
# best_models so the submission cell picks it up.
evaluate_model(best_model_cat.best_estimator_, 'CatBoostClassifier')

# Submission

In [None]:
# Get predictions for each model and create submission files
for model_name, fitted_model in best_models.items():
    # Flatten the predictions first: a multiclass model may return them as an
    # (n, 1) column (CatBoost typically does), while inverse_transform and the
    # DataFrame column below both expect a 1-D array.
    predictions = np.ravel(fitted_model.predict(test_X))
    # Map the encoded class indices back to the original Cover_Type labels.
    predictions = encoder.inverse_transform(predictions)
    output = pd.DataFrame({'Id': test.Id, 'Cover_Type': predictions})
    output.to_csv('submission_' + model_name + '.csv', index=False)