# XGBoost Implementation: 1.75670

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import all the requisite sklearn packages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Data Loading and Formatting
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')

# Names of the feature columns used by every later cell.
# NOTE(review): only feature_0 .. feature_70 are selected here -- confirm that
# this matches the columns actually present in train.csv / test.csv.
features = ['feature_'+str(i) for i in range(71)]

# Label Encoder to transform the targets to numerical values for use in XGB
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# Standard Scaler for use with PCA.
# The scaled frame is built directly from the scaler output and reuses the
# index of `train`, so it does not depend on a hard-coded row count and the
# columns are numeric from the start.
ss = StandardScaler()
train_pca = pd.DataFrame(ss.fit_transform(train[features]),
                         columns=features,
                         index=train.index)
train.head()

In [None]:
# PCA on the standardized features. The near-flat explained-variance ratios
# displayed below are what the author reads as "little covariance between
# the different features".
pca = PCA()

# Overwrite the scaled values with their principal-component projection.
# After this line the columns of `train_pca` still carry the `feature_*`
# names but hold principal components; `data` keeps the raw projection array.
train_pca[features] = data = pca.fit_transform(train_pca[features])

# Last expression of the cell: the fraction of variance explained by each component.
pca.explained_variance_ratio_

In [None]:
# Creating data sets for all possible uses

# Hold-out split for impromptu testing.
# The split is stratified so both parts keep the same class proportions, and
# seeded so that re-running the notebook reproduces the same split.
X_train,X_test,y_train,y_test = train_test_split(train[features],
                                                 train['target'],
                                                 stratify=train['target'],
                                                 random_state=0)

# DataMatrices for using XGB without the SKLearn wrapper.
# `dtrain` is built from the training part only: `X_test` is carved out of
# `train`, so training on the full frame would leak the hold-out rows.
# `dtest` carries its labels so it can be used for evaluation.
dtrain = xgb.DMatrix(data=X_train,label=y_train)
dtest = xgb.DMatrix(data=X_test,label=y_test)

# Sanity Check
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

In [None]:
# Hyperparameter search: randomized search with 3-fold cross validation (run in Colab on a GPU, so it is kept commented out here)

# estimator = xgb.XGBClassifier(num_class=9,objective='multi:softmax',use_label_encoder=False,verbosity=1)
# grid = {'max_depth': [1, 5, 10, 15, 20],
#        'learning_rate': np.logspace(-3,1,5),
#        'n_estimators': [100, 200, 300, 500, 700],
#        'reg_lambda': np.append(np.logspace(-4,0,5),[0]),
#        'reg_alpha': np.append(np.logspace(-4,0,5),[0])}
# clf = RandomizedSearchCV(estimator,grid,verbose=4,cv=3)
# clf.fit(train[features],train['target'])

In [None]:
# Train the XGBoost classifier with hyperparameters taken from the random grid search.
# Expect this to be slow without a GPU :(
# The note below records several candidate parameter sets; the last line of
# the note is the set actually passed to the classifier.
"""
Best Parameters:
{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 300,
 'reg_alpha': 0.01,
 'reg_lambda': 1.0}
 {reg_lambda=0.0001,
 reg_alpha=0.0001,
 n_estimators=300,
 max_depth=2,
 learning_rate=0.1}
 learning_rate=0.1,max_depth=5,n_estimators=100,reg_alpha=0.01,reg_lambda=0.1
"""
# NOTE(review): `predict_proba` is called on this model in the submission
# cell; confirm that 'multi:softmax' yields class probabilities with the
# installed xgboost version.
xgb_params = dict(
    num_class=9,
    objective='multi:softmax',
    use_label_encoder=False,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    reg_alpha=0.01,
    reg_lambda=0.1,
    verbosity=0,
)
model = xgb.XGBClassifier(**xgb_params)

# Fit on the full training set (all rows of `train`, not just the hold-out split).
model.fit(train[features], train['target'])

In [None]:
# Confusion matrix of the fitted model on the data it was trained on.
# Rows/columns are the nine encoded class ids 0-8.
class_ids = list(range(9))

y_pred = model.predict(train[features])

cm = confusion_matrix(train['target'], y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids)

disp.plot()

In [None]:
# Visualize the training label distribution
# Explicit edges at -0.5, 0.5, ..., 8.5 give one unit-wide bin centred on each
# encoded class id (0-8), so every bar lines up with its class on the x axis
# regardless of which classes are present in the data.
plt.hist(train['target'], bins=np.arange(10) - 0.5);

In [None]:
# Create Submission Predictions and save them
# One probability column per class, named Class_1 .. Class_9, preceded by the
# row id taken from the test file.
test_pred = model.predict_proba(test[features])
class_columns = ['Class_'+str(i) for i in range(1,10)]
submission = pd.DataFrame(test_pred, columns=class_columns)
submission.insert(0, 'id', test['id'].to_numpy())
submission.to_csv('submission.csv',index=False)

# Display Submitted Label distributions
# Fixed edges (-0.5 .. 8.5) keep one bin per encoded class id even when some
# classes are never predicted; with `bins=9` the edges would be derived from
# the min/max of the predictions and the bars would no longer match class ids.
pred_vals = model.predict(test[features])
plt.hist(pred_vals, bins=np.arange(10) - 0.5);

In [None]:
# Compare to a ~very~ simple NN

"""
The MLP classifier should perform a little worse
than the tuned XGB Classifier
based on the constraints of the presented parameter grid.
"""

from sklearn.neural_network import MLPClassifier

# Small baseline network: three hidden layers of nine units each, trained on
# the hold-out split's training part.
# NOTE: this rebinds `model`, replacing the XGBoost classifier fitted earlier.
model = MLPClassifier(hidden_layer_sizes=(9,) * 3).fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the held-out part.
model.score(X_test, y_test)