In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
import gc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
import time

In [None]:
# Read the training data, using the 'Id' column as the DataFrame index.
train_csv_path = '../input/tabular-playground-series-dec-2021/train.csv'
train = pd.read_csv(train_csv_path, index_col='Id')
# The test set is deliberately not loaded here; it is read only once it is needed,
# to avoid using up too much memory.
#test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv', index_col='Id')

In [None]:
# Separate the target column ('Cover_Type') from the feature columns.
y = train['Cover_Type']
X = train.drop(columns=['Cover_Type'])

# The combined frame is not used again; drop the name and run the garbage collector.
del train
gc.collect()

X

In [None]:
# Count how many rows belong to each class of the target.
# `b` holds the counts in the same order as the classes returned by y.unique().
classes = y.unique()
b = [(y == label).sum() for label in classes]

classes, b

In [None]:
# Class 5 was found to appear only once in y, so its row is removed from both y and X.
# The rows are selected as a (possibly empty) index of labels instead of `.index[0]`:
#   * every row of class 5 is dropped, not just the first one;
#   * the cell can be re-run safely: once class 5 is gone the selection is empty and
#     both drops are no-ops, whereas `.index[0]` raised IndexError on a second run.
r_ind = y[y == 5].index
y = y.drop(index=r_ind)
X = X.drop(index=r_ind)

# Show the remaining classes and the new shape of the feature matrix
y.unique(), X.shape

In [None]:
# Per-column overview: number of missing values, data type, number of distinct values
(X.isna().sum(), X.dtypes, X.nunique())

In [None]:
# Columns with a single distinct value are constant; collect their names and drop them.
# drop_cols is kept so that the same columns can be removed from the test set later.
distinct_counts = X.nunique()
drop_cols = distinct_counts[distinct_counts == 1].index.tolist()
X = X.drop(columns=drop_cols)
X

In [None]:
# Stratified split on the target: 40% of the rows go to the training part,
# the remaining 60% to the validation part. The seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.4,
    test_size=0.6,
    stratify=y,
    random_state=7,
)

# X is not used again below; drop the name and run the garbage collector.
del X
gc.collect()

X_train

In [None]:
# Classes present in the validation part and in the training part, in that order
tuple(target.unique() for target in (y_val, y_train))

In [None]:
# Partition the features by their number of distinct values in X_train:
# exactly two values (treated here as one-hot-encoded) versus more than two (the rest).
train_distinct = X_train.nunique()
oh_cols = train_distinct[train_distinct == 2].index.tolist()
n_oh_cols = train_distinct[train_distinct > 2].index.tolist()

# Sanity check: evaluates to zero when every column landed in one of the two lists
len(oh_cols) + len(n_oh_cols) - len(X_train.columns)

In [None]:
# Preprocessing: standardise only the columns in n_oh_cols;
# every other column is passed through unchanged.
sc = StandardScaler()
prep = ColumnTransformer(
    [('sc', sc, n_oh_cols)],
    remainder='passthrough',
)

# Base classifier used during the hyperparameter scan (fixed seed, 'merror' metric)
model = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='merror',
    random_state=7,
)

# Chain preprocessing and classifier; the step names are what the search-space keys refer to
pipe = Pipeline(
    steps=[
        ('preprocessing', prep),
        ('model', model),
    ]
)

In [None]:
# The target must be encoded as 0, 1, 2, ...
# Learn the class-to-integer mapping on y_train, then replace y_train by its encoded version.
# `le` is kept so that the same mapping can be applied (and inverted) later.
le = LabelEncoder()
le.fit(y_train)
y_train = pd.Series(le.transform(y_train))

In [None]:
# Hyperparameter ranges explored by BayesSearchCV.
# The keys carry the "model__" prefix because the parameters belong to the
# 'model' step of the pipeline.
search_spaces = {
    'model__n_estimators': Integer(400, 1200),
    'model__learning_rate': Real(0.006, 0.21, 'log-uniform'),
    'model__max_depth': Integer(3, 12),
    'model__subsample': Real(0.1, 1, 'log-uniform'),
}

# An explicit stratified 5-fold splitter, kept for reference but currently unused
# skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

# Bayesian search over the pipeline: n_iter=20 parameter settings,
# scored with 'accuracy' using cv=5, with a fixed seed
search_bay = BayesSearchCV(
    pipe,
    search_spaces,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=7,
)
search_bay.total_iterations

In [None]:
# Start timing the hyperparameter scan
start = time.time()

# Run the Bayesian search on the training split (X_train, y_train)
result_bay = search_bay.fit(X_train, y_train)

# Stop the clock immediately after the fit so that the reported time covers only
# the scan itself, not the memory clean-up and printing that follow
elapsed = time.time() - start

# The training split is not used again; delete it and run the garbage collector
del X_train
del y_train
gc.collect()

# Print the best score and parameters found during the scan, then the run time
print("(Bayes) Best: %f using %s" % (result_bay.best_score_, result_bay.best_params_))
print("Time to run the scan: %f" % (elapsed))

In [None]:
# result_bay.best_params_ names the parameters after the pipeline step ("model__<param>").
# To pass them straight to the next model the "model__" prefix has to be removed.
#
# A new dict is built rather than renaming keys in place: popping and inserting keys
# on a dict while iterating over it makes the iteration skip entries, which is why an
# in-place loop left some "model__" keys untouched.
step_prefix = "model__"
best_parameters_bay = {
    (key[len(step_prefix):] if key.startswith(step_prefix) else key): value
    for key, value in result_bay.best_params_.items()
}

best_parameters_bay

In [None]:
# Build the final model from the best hyperparameters found in the scan.
# eval_metric is 'merror' (multiclass error), the same metric as the model used in the
# scan; 'error' is the binary-classification metric and does not fit this multiclass target.
# random_state is fixed, as for the scan model, so that the fit is reproducible.
model_opt = XGBClassifier(
    **best_parameters_bay,
    eval_metric='merror',
    use_label_encoder=False,
    tree_method='gpu_hist',
    random_state=7,
)

# Same preprocessing as before, combined with the tuned model
pipe_opt = Pipeline(steps=[('preprocessing', prep), ('model', model_opt)])

# Encode y_val with the encoder that was fitted on y_train
y_val = pd.Series(le.transform(y_val))

# Fit on the held-out split (X_val, y_val), i.e. the 60% of the rows not used in the scan.
# NOTE(review): X_train / y_train were deleted after the scan, so this is not a fit on
# the whole data set -- confirm that this is intended.
pipe_opt.fit(X_val, y_val)

# The held-out split is not used again; delete it and run the garbage collector
del X_val
del y_val
gc.collect()

In [None]:
### Predict the class of every test row and write the submission file
# Load the test set (indexed by 'Id') and remove the columns that were dropped from the train set
X_test = (
    pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv', index_col='Id')
    .drop(columns=drop_cols)
)

# The pipeline predicts encoded labels; map them back to the original class values
encoded_pred = pipe_opt.predict(X_test)
pred_test = pd.Series(le.inverse_transform(encoded_pred))

# One row per test Id with its predicted Cover_Type, written without the DataFrame index
# NOTE(review): the commented-out cell further down writes 'submission_TPS-12-21.csv';
# confirm which file name is intended.
output = pd.DataFrame({
    'Id': X_test.index,
    'Cover_Type': pred_test,
})
output.to_csv('submission_TPS-21-21.csv', index=False)

output

In [None]:
# # fit and predict
# pipe.fit(X_train, y_train)

# pred = pipe.predict(X_val)

# # to compare we need to transform y_val according to the label encoding on y_train
# y_val = pd.Series(le.transform(y_val))

# score =  accuracy_score(y_val,pred)
# score

In [None]:
# # competition score, 1st model, 0.95224 with train_size=0.8

# # load the test set
# X_test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv', index_col='Id')

# # drop the columns dropped in the train set
# X_test = X_test.drop(columns=drop_cols)
# # predict results
# pred_test = pipe.predict(X_test)

# # inverse transform the results to the original encoding and submit
# pred_test = pd.Series(le.inverse_transform(pred_test))

# output = pd.DataFrame({'Id': X_test.index,
#                        'Cover_Type': pred_test})
# output.to_csv('submission_TPS-12-21.csv', index=False)


# output