In this notebook I will create boilerplate code for the competition, using the insights gained from the [EDA](https://www.kaggle.com/aniketsharma00411/tpsdec21-exploratory-data-analysis/).

My other notebooks:
 - [Exploratory Data Analysis](https://www.kaggle.com/aniketsharma00411/tpsdec21-exploratory-data-analysis/)
 - [Sample Submission](https://www.kaggle.com/aniketsharma00411/tpsdec21-sample-submission/)

# Initialization

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier

from itertools import product

# Preparing data

In [None]:
# Load the training data. 'Id' is read as the index and then discarded, so the
# frame carries a plain 0..n-1 RangeIndex (labels and positions coincide).
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-dec-2021/train.csv',
    index_col='Id'
).reset_index(drop=True)

# Remove the last row labelled Cover_Type == 5.
# NOTE(review): presumably class 5 is too rare for the stratified split used
# later -- confirm against the EDA notebook.
# The lookup is vectorized instead of a Python-level loop over every row, and
# the drop is skipped when no such row exists (df.drop([None]) would raise
# KeyError).
class5_rows = df.index[df['Cover_Type'] == 5]
class5_index = class5_rows[-1] if len(class5_rows) else None
if class5_index is not None:
    df = df.drop([class5_index])

# Split into feature matrix and target vector.
X = df.drop(['Cover_Type'], axis=1)
y = df['Cover_Type']

In [None]:
# Discard the two soil-type indicator columns that are excluded from modelling.
X = X.drop(columns=['Soil_Type7', 'Soil_Type15'])

# Columns with at most two distinct values are treated as categorical,
# everything else as numerical. Column order is preserved in both lists.
unique_counts = X.nunique()
categorical = unique_counts[unique_counts <= 2].index.tolist()
numerical = unique_counts[unique_counts > 2].index.tolist()

# These features get their own (mean) imputation group, so pull them out of
# the generic numerical list. `remove` raises ValueError if one is missing.
gaussian_features = ['Elevation', 'Hillshade_3pm']
for gaussian_name in gaussian_features:
    numerical.remove(gaussian_name)

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Load the test features (indexed by 'Id') and drop the same two soil-type
# columns that were removed from the training features.
X_test = (
    pd.read_csv(
        '/kaggle/input/tabular-playground-series-dec-2021/test.csv',
        index_col='Id',
    )
    .drop(columns=['Soil_Type7', 'Soil_Type15'])
)

# Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    """Build a fresh, unfitted impute -> scale -> DummyClassifier pipeline.

    The column groups are taken from the module-level lists
    ``gaussian_features``, ``numerical`` and ``categorical``.

    Args:
        params: Mapping of keyword arguments forwarded to ``DummyClassifier``.
            ``random_state`` is always set to 42 here, so it must not appear
            in ``params`` (a duplicate keyword raises ``TypeError``).

    Returns:
        A new ``Pipeline`` with the steps 'preprocessor', 'scaler' and 'model'.
    """
    # One imputation strategy per column group: mean, median, most frequent.
    column_imputers = ColumnTransformer(
        transformers=[
            ('gaussian', SimpleImputer(strategy='mean'), gaussian_features),
            ('numerical', SimpleImputer(strategy='median'), numerical),
            ('categorical', SimpleImputer(strategy='most_frequent'), categorical),
        ]
    )

    classifier = DummyClassifier(random_state=42, **params)

    # The scaler runs on every column the preprocessor outputs.
    return Pipeline(
        steps=[
            ('preprocessor', column_imputers),
            ('scaler', StandardScaler()),
            ('model', classifier),
        ]
    )

# Hyperparameter Tuning

In [None]:
# Hyperparameter grid: each key is a DummyClassifier keyword argument and each
# value is the list of candidates to try for it.
search_space = {
    'strategy': [
        'stratified',
        'most_frequent',
        'prior',
        'uniform',
    ],
}

In [None]:
# Exhaustive grid search: fit one pipeline per parameter combination on the
# training split and keep the combination with the highest validation score.
max_score = 0
best_params = {}

param_names = list(search_space)
for combo in product(*search_space.values()):
    # Pair each parameter name with the value chosen for this combination.
    params = dict(zip(param_names, combo))
    print(params)

    clf = create_new_pipeline(params)
    clf.fit(X_train, y_train)

    score = clf.score(X_val, y_val)
    if score > max_score:
        # Strict '>' keeps the earliest combination when scores tie.
        max_score, best_params = score, params
        print(f'Best score: {score}')

In [None]:
# Display the best-scoring parameter combination found by the search loop.
best_params

In [None]:
# Display the highest validation score reached during the search loop.
max_score

# Training Final Model

In [None]:
# Build a fresh, unfitted pipeline configured with the best parameters found.
clf = create_new_pipeline(best_params)

In [None]:
# Refit on the full data (X, y), i.e. training and validation rows together.
clf.fit(X, y)

In [None]:
# Score on the same data the model was just fitted on: this is a training
# score, not a held-out estimate.
clf.score(X, y)

# Making Predictions

In [None]:
# Predict a cover type for every test row and pair it with the row's Id.
test_predictions = clf.predict(X_test)
submission = pd.DataFrame({
    'Id': X_test.index,
    'Cover_Type': test_predictions,
})

# Bare expression so the notebook renders the frame.
submission

In [None]:
# Write the submission without the DataFrame index, so the file holds only
# the 'Id' and 'Cover_Type' columns.
submission.to_csv('submission.csv', index=False)