In [None]:
# Notebook dependencies, installed through the %pip magic.
# NOTE(review): optuna is installed here but is not imported by any cell
# visible in this notebook.
%pip install optuna
%pip install xgboost

In [None]:
import os

# Import Packages
import numpy as np 
import pandas as pd

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

In [None]:
# Locations of the competition files: every CSV lives directly under root_path
root_path = '/kaggle/input/tabular-playground-series-jun-2021'
train_path, test_path, sample_sub_path = (
    os.path.join(root_path, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Set seed
# Single seed value for the notebook; it is passed to the XGBoost classifier
# when the model is created.
seed = 10

In [None]:
# Load the training set and turn the class labels into integer ids in one
# chained step: the last character of each label is parsed as a number and
# shifted down by one.
df_train = pd.read_csv(train_path).assign(
    target=lambda frame: pd.to_numeric(frame['target'].str[-1]) - 1
)

In [None]:
# Target histogram to check class distribution
# (plots the `target` column of df_train; being the last expression, the plot
# object is also the cell's output)
df_train['target'].hist()

In [None]:
# Features are all columns between the first and the last one; the label is
# kept as a single-column DataFrame
X, y = df_train.iloc[:, 1:-1], df_train[['target']]

# Report the dimensions of the feature matrix
print(f'Training set shape: {X.shape}')

In [None]:
# Converting to DMatrix
# NOTE(review): `dtrain` is not referenced by any later cell visible here; the
# hyperparameter search is fitted on `X` and `y` directly.
dtrain = xgb.DMatrix(X, label=y)

So we've got 200 000 rows and 75 features!

In [None]:
# Encoding the target variable 
# y = pd.get_dummies(y)
# print(f'New shape: {y.shape}')

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs and no validation split is created.
# NOTE(review): if re-enabled, the second label in the print reads
# 'X_test shape' although the value shown is X_val.shape.
'''# Creating validation split
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=seed)

# Checking split shapes
print(f'X_train shape: {X_train.shape}\nX_test shape: {X_val.shape}')'''

In [None]:
# Disabled cell: the hyperparameter lists and `param_grid` below live inside a
# string literal, so they are never defined. No live cell visible in this
# notebook references `param_grid`.
'''# Hyperparameter space
ls_estimators = [500, 1000, 1500, 3000, 5000]
ls_min_samples_leaf = list(range(1, 51))
ls_max_depth = [None] + list(range(1, 11))


# Parameter grid
param_grid = {'n_estimators':ls_estimators,
             'min_samples_leaf': ls_min_samples_leaf,
             'max_depth':ls_max_depth}'''

In [None]:
# Disabled cell: the Random Forest construction is wrapped in a string literal,
# so `rf` is never created; the live model in this notebook is the XGBoost
# classifier.
'''# Creating the Random Forest
rf = RandomForestClassifier(random_state=seed,
                            n_estimators=3000,
                            max_depth=5,
                            min_samples_leaf=5)'''

In [None]:
# Creating the XGB Classifier
# 'multi:softprob' makes the model emit one probability per class, which is
# what the later predict_proba / log-loss / submission cells consume.
# ('multi:softmax' emits hard class labels instead, and predict_proba support
# for it differs between XGBoost releases.)
# 'mlogloss' is the multi-class log-loss metric; plain 'logloss' is the binary
# one. random_state is the scikit-learn-style name for the seed.
xg_c = xgb.XGBClassifier(objective='multi:softprob',
                         random_state=seed,
                         num_class=9,
                         use_label_encoder=False,
                         eval_metric='mlogloss')

In [None]:
# Search space for the randomized hyperparameter search
xg_c_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [1000, 2500, 3500],
    'max_depth': [2, 3, 4, 5]
}

# Randomized search: 4 sampled configurations, each scored by 4-fold
# cross-validated accuracy. random_state pins which configurations get sampled
# so that a re-run reproduces the same search.
xg_c_cv = RandomizedSearchCV(param_distributions=xg_c_param_grid,
                             estimator=xg_c,
                             scoring='accuracy',
                             n_iter=4,
                             cv=4,
                             random_state=seed,
                             verbose=1)


# Fit the search on the full training set
xg_c_cv.fit(X, y)

# best_score_ is already the mean cross-validated accuracy of the best
# configuration, so it is reported as-is. (A square root / absolute value only
# makes sense for a negated squared-error score and would distort an accuracy.)
print("Best parameters found: ", xg_c_cv.best_params_)
print("Highest accuracy found: ", xg_c_cv.best_score_)

In [None]:
# Predict probabilities for each class
# NOTE(review): `X` is the same data the search was fitted on, so these are
# in-sample predictions.
y_pred = xg_c_cv.predict_proba(X)

In [None]:
# Evaluation
# Log loss of the predicted class probabilities against the labels in `y`.
# NOTE(review): y_pred is computed on the training features `X`, so this is an
# in-sample score rather than a held-out estimate.
logloss = log_loss(y, y_pred)
print(f'Log loss: {logloss}')

In [None]:
# Reading test data
df_test = pd.read_csv(test_path)

# Creating testing set
# Drops the first column by position; presumably that is the `id` column which
# the submission cell reads by name — TODO confirm.
X_test = df_test.iloc[:, 1:]

In [None]:
# Creating predictions to be submitted
predictions = xg_c_cv.predict_proba(X_test)

# One probability column per class, named Class_1 ... Class_9
sub = pd.DataFrame(predictions,
                   columns=[f'Class_{k}' for k in range(1, 10)])

# Put the test ids in front of the probability columns
sub = pd.concat([df_test['id'], sub], axis=1)

# Creating submission
sub.to_csv('submission.csv', index=False)

# Preview the submission; as the cell's last expression it is actually
# displayed (a bare expression in the middle of a cell is silently discarded)
sub.head()