In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training table and the unlabelled test table, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-dec-2021/train.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
    )
)

In [None]:
# Number of training rows per target class.
train.Cover_Type.value_counts()

In [None]:
# Discard every row labelled with class 5.
# NOTE(review): presumably this class is too rare for the stratified split
# used later — confirm against the value counts shown above.
train = train.loc[train['Cover_Type'].ne(5)]

In [None]:
# Preview the first five rows of the filtered training table.
train.head(5)

In [None]:
# Target vector: the Cover_Type column.
y = train['Cover_Type'].copy()

# Feature matrix: every column except the first and the last
# (assumed to be the id and the target respectively — positional selection).
X = train.iloc[:, 1:-1].copy()

# Test features: every column except the first (assumed to be the id).
# NOTE(review): test_X is not read again in the cells visible here.
test_X = test.iloc[:, 1:].copy()

In [None]:
# Preview the first five rows of the feature matrix.
X.head(5)

In [None]:
# Columns holding a single distinct value carry no information: record their
# names in `drop_cols` (reused later for the test set) and remove them from X.
n_distinct = X.nunique()
drop_cols = n_distinct.index[n_distinct == 1].tolist()
X = X.drop(columns=drop_cols)
X.head()

In [None]:
from sklearn.model_selection import train_test_split
import gc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
import time

In [None]:
# Hold out 30% of the rows, stratifying on the target; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
    stratify=y,
)

# The unsplit feature matrix is no longer needed; release it.
del X
gc.collect()

X_train

In [None]:
# Partition the columns by number of distinct values in the training split:
# exactly two -> treated as one-hot/binary, more than two -> everything else.
train_distinct = X_train.nunique()
oh_cols = train_distinct.index[train_distinct == 2].tolist()
n_oh_cols = train_distinct.index[train_distinct > 2].tolist()

# Sanity check: displays 0 when every column landed in one of the two lists.
len(n_oh_cols) + len(oh_cols) - len(X_train.columns)

In [None]:
# Preprocessing: standardise only the multi-valued columns; every remaining
# column is passed through untouched.
sc = StandardScaler()
prep = ColumnTransformer(
    transformers=[('sc', sc, n_oh_cols)],
    remainder='passthrough',
)

# Base classifier, evaluated with the multiclass error rate.
model = XGBClassifier(
    random_state=7,
    eval_metric='merror',
    use_label_encoder=False,
    tree_method='gpu_hist',
)

# Preprocessing followed by the classifier.
pipe = Pipeline([('preprocessing', prep), ('model', model)])

In [None]:
# Map the target labels onto consecutive integers 0, 1, 2, ...
# `le` is kept so predictions can be mapped back to the original labels.
# Caution: y_train is overwritten here, so re-running this cell would refit
# the encoder on already-encoded values.
le = LabelEncoder().fit(y_train)
y_train = pd.Series(le.transform(y_train))

In [None]:
# Final model: refit on ALL labelled rows (training split + hold-out split).
# The previous version of this cell fitted on the 30% hold-out only, even
# though its comments said the whole training data was used.
#
# NOTE(review): the original passed a bare positional `80`, which does not name
# any hyperparameter. It is assumed here to be the number of boosting rounds —
# confirm against the tuning results before relying on it.
# eval_metric is 'merror' (multiclass error rate) to match the multiclass
# target and the base model defined earlier; 'error' is the binary metric.
model_opt = XGBClassifier(
    n_estimators=80,
    eval_metric='merror',
    use_label_encoder=False,
    tree_method='gpu_hist',
)

# Same preprocessing as before, but with the final model
pipe_opt = Pipeline(steps=[('preprocessing', prep), ('model', model_opt)])

# Encode the hold-out labels with the encoder already fitted on y_train
y_test = pd.Series(le.transform(y_test))

# Stack both splits; rows of X and y stay aligned by position because both
# concatenations use the same order and discard the old index.
X_full = pd.concat([X_train, X_test], axis=0, ignore_index=True)
y_full = pd.concat([y_train, y_test], axis=0, ignore_index=True)

# The hold-out copies are now redundant; drop them before the fit to limit
# peak memory.
del X_test
del y_test
gc.collect()

# Fitting on the whole labelled dataset
pipe_opt.fit(X_full, y_full)

del X_full
del y_full
gc.collect()

In [None]:
# Load the competition test set indexed by Id and remove the same constant
# columns that were dropped from the training features.
X_test = pd.read_csv(
    '../input/tabular-playground-series-dec-2021/test.csv', index_col='Id'
).drop(columns=drop_cols)

# Predict encoded class labels, then map them back to the original labels.
pred_test = pd.Series(le.inverse_transform(pipe_opt.predict(X_test)))

# Submission table: one predicted Cover_Type per test Id.
output = pd.DataFrame({'Id': X_test.index, 'Cover_Type': pred_test})
output

In [None]:
# Write the submission file without the DataFrame's row index.
output.to_csv(path_or_buf='submission.csv', index=False)