In this notebook I will train an XGBoost model, using random search to tune its hyperparameters.

I will also use the insights gained from the [EDA](https://www.kaggle.com/aniketsharma00411/tpsdec21-exploratory-data-analysis/) here.

My other notebooks:
 - [Exploratory Data Analysis](https://www.kaggle.com/aniketsharma00411/tpsdec21-exploratory-data-analysis/)
 - [Sample Submission](https://www.kaggle.com/aniketsharma00411/tpsdec21-sample-submission/)
 - [Boilerplate code](https://www.kaggle.com/aniketsharma00411/tpsdec21-dummy/)

# Initialization

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from itertools import product
import random

In [None]:
# Seed Python's built-in RNG so the random hyperparameter draws made later in
# the notebook are repeatable from run to run.
random.seed(42)

# Preparing data

In [None]:
def label_converter(label):
    """Map an original Cover_Type label to a contiguous 0-based class id.

    Only cover types 1, 2, 3, 6 and 7 are supported; they map to 0..4 in that
    order. Any other label raises ``KeyError``.
    """
    kept_cover_types = (1, 2, 3, 6, 7)
    to_class_id = {
        cover_type: class_id
        for class_id, cover_type in enumerate(kept_cover_types)
    }
    return to_class_id[label]

In [None]:
# Load the training data. 'Id' is read as the index and then discarded in
# favour of a fresh 0..n-1 index.
df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', index_col='Id').reset_index(drop=True)

# Remove every row whose target is cover type 4 or 5. label_converter has no
# entry for these two classes, so they must be gone before the targets are
# remapped. A single vectorised mask replaces a Python-level loop over all rows.
df = df[~df['Cover_Type'].isin([4, 5])]

# Split into the feature matrix and the target remapped to contiguous class ids.
X = df.drop(['Cover_Type'], axis=1)
y = df['Cover_Type'].apply(label_converter)

In [None]:
# Drop two soil-type columns before grouping the remaining features.
X = X.drop(columns=['Soil_Type7', 'Soil_Type15'])

# Columns with at most two distinct values are treated as categorical,
# everything else as numerical. Column order is preserved in both lists.
distinct_counts = {col: X[col].nunique() for col in X.columns}
categorical = [col for col, count in distinct_counts.items() if count <= 2]
numerical = [col for col, count in distinct_counts.items() if count > 2]

# These features get their own (mean) imputation branch in the pipeline, so
# they are taken out of the generic numerical group.
gaussian_features = ['Elevation', 'Hillshade_3pm']
for feature in gaussian_features:
    numerical.remove(feature)

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target so both parts keep the same class proportions, and the fixed
# random_state makes it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
# Load the test set (indexed by 'Id') and drop the same two soil-type columns
# that were removed from the training features.
test_csv_path = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'
X_test = pd.read_csv(test_csv_path, index_col='Id').drop(columns=['Soil_Type7', 'Soil_Type15'])

# Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    """Build a fresh, unfitted impute -> scale -> XGBoost pipeline.

    Parameters
    ----------
    params : dict
        Extra keyword arguments forwarded to ``XGBClassifier`` (the
        hyperparameters being tuned).

    Returns
    -------
    Pipeline
        Steps named 'preprocessor', 'scaler' and 'model', in that order.
    """
    # Each feature group is imputed with its own strategy. The column lists
    # (gaussian_features, numerical, categorical) are module-level names.
    imputation = ColumnTransformer(
        transformers=[
            ('gaussian', SimpleImputer(strategy='mean'), gaussian_features),
            ('numerical', SimpleImputer(strategy='median'), numerical),
            ('categorical', SimpleImputer(strategy='most_frequent'), categorical),
        ]
    )

    classifier = XGBClassifier(
        use_label_encoder=False,
        objective='multi:softmax',
        n_jobs=-1,
        random_state=42,
        **params
    )

    return Pipeline(
        steps=[
            ('preprocessor', imputation),
            ('scaler', StandardScaler()),
            ('model', classifier),
        ]
    )

# Hyperparameter Tuning

In [None]:
# Discrete grid from which the random search draws hyperparameter combinations.
search_space = {
    # Number of boosting rounds.
    'n_estimators': [10, 20, 50],
    # Tree depths 1, 3, 5, 7, 9.
    'max_depth': np.linspace(1, 9, num=5).astype('int'),
    # 0.001, 0.01, 0.1, 1.0 -- XGBoost documents the valid range of the
    # learning rate (eta) as [0, 1], so the grid stops at 1 instead of
    # spending draws on an out-of-range value of 10.
    'learning_rate': np.logspace(-3, 0, num=4),
    # L1 and L2 regularisation strengths: 0, 0.5, 1.
    'reg_alpha': np.linspace(0, 1, num=3),
    'reg_lambda': np.linspace(0, 1, num=3)
}

In [None]:
# Number of hyperparameter combinations to draw and evaluate in the random search.
NUM_POSS = 15

In [None]:
# Random search: fit one pipeline per sampled hyperparameter combination on the
# training split and keep the combination with the best validation score.
max_score = 0
best_params = {}

# Enumerate the full grid once so distinct combinations can be drawn from it.
candidates = list(product(*search_space.values()))

# random.sample draws WITHOUT replacement, so no combination is trained twice
# (random.choices can return the same one repeatedly). k is capped because
# sample() raises ValueError when asked for more items than exist.
for val in random.sample(candidates, k=min(NUM_POSS, len(candidates))):
    # Pair each parameter name with its sampled value.
    params = dict(zip(search_space.keys(), val))
    print(params)

    clf = create_new_pipeline(params)

    clf.fit(X_train, y_train)

    score = clf.score(X_val, y_val)
    if score > max_score:
        max_score = score
        best_params = params
    print(f'Score: {score}\tBest score: {max_score}')

In [None]:
# Show the hyperparameter combination with the highest validation score.
best_params

In [None]:
# Show the best validation score reached during the search.
max_score

# Training Final Model

In [None]:
# Build a fresh, unfitted pipeline configured with the best hyperparameters found.
clf = create_new_pipeline(best_params)

In [None]:
# Fit on the full labelled data (X, y), i.e. the training and validation rows together.
clf.fit(X, y)

In [None]:
# Score on the same data the model was just fitted on: this is a training
# score, not an estimate of performance on unseen data.
clf.score(X, y)

# Making Predictions

In [None]:
def reverse_label_converter(label):
    """Map a contiguous 0-based class id back to its original Cover_Type label.

    Class ids 0..4 map to cover types 1, 2, 3, 6 and 7 respectively. Any
    other id raises ``KeyError``.
    """
    kept_cover_types = (1, 2, 3, 6, 7)
    to_cover_type = dict(enumerate(kept_cover_types))
    return to_cover_type[label]

In [None]:
# Predict class ids for the test set and translate them back to the original
# Cover_Type labels expected in the submission file.
predicted_class_ids = clf.predict(X_test)
submission = pd.DataFrame({
    'Id': X_test.index,
    'Cover_Type': [reverse_label_converter(class_id) for class_id in predicted_class_ids],
})

submission

In [None]:
# Write the submission file; index=False keeps the DataFrame's row index out of
# the CSV, since 'Id' is already a regular column.
submission.to_csv('submission.csv', index=False)