In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively enumerate the Kaggle input mount and echo each file's full path,
# one per line. A missing directory simply yields nothing.
for found_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(found_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the competition training CSV, relative to the notebook's working directory.
pg_comp_train_filepath = "../input/tabular-playground-series-jun-2021/train.csv"

# Load the whole training table into a DataFrame.
pg_train_data = pd.read_csv(filepath_or_buffer=pg_comp_train_filepath)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
pg_train_data.info()
print("\n")
# (rows, columns) of the training table.
print(pg_train_data.shape)

In [None]:
# Preview the first five training rows (rich display as the cell's last expression).
pg_train_data.head(n=5)

In [None]:
# Bar chart of how many training rows fall into each target class.
fig = plt.figure(figsize=(10, 6))
# Draw onto this figure's axes explicitly rather than relying on pyplot's
# implicit "current axes" lookup; the resulting plot is the same.
sns.countplot(data=pg_train_data, x="target", ax=fig.gca())

In [None]:
# Pairwise correlation of the numeric columns only. At this point `target` still
# holds its original object (string) labels, which recent pandas versions refuse
# to correlate; selecting numeric dtypes first gives the same matrix on old and
# new pandas alike.
train_data_corr = pg_train_data.select_dtypes(include="number").corr()

# Boolean mask that hides the upper triangle (diagonal included) so each pair is
# drawn once. The builtin `bool` replaces the removed `np.bool` alias.
mask = np.triu(np.ones_like(train_data_corr, dtype=bool))

fig = plt.figure(figsize=(16,10))
sns.heatmap(train_data_corr, mask=mask)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Replace the object-typed class labels in `target` with integer codes.
# The fitted encoder is kept so the codes can be mapped back to labels later.

label_encoder = LabelEncoder()
label_encoder.fit(pg_train_data['target'])

pg_train_data['target'] = label_encoder.transform(pg_train_data['target'])

In [None]:
# Feature matrix: every column except the row identifier and the label.
X = pg_train_data.drop(columns=['id', 'target'])
# Label vector: the (already encoded) target column.
y = pg_train_data['target']

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# XGB Model
#
# Best hand-tuned hyper-parameters found so far, kept as data instead of as
# commented-out code inside a string literal (which was echoed as cell output).
# The fractional values are the intended 0.6 / 0.3; the earlier note carried
# floating-point noise (0.6000000000000001, 0.30000000000000004).
# To rebuild that model, unpack the dict into the classifier constructor:
# XGBClassifier(**BEST_XGB_PARAMS). The old note also passed
# use_label_encoder=True, which is omitted here because the target is encoded
# explicitly with LabelEncoder in this notebook.
BEST_XGB_PARAMS = {
    "n_estimators": 500,
    "learning_rate": 0.03,
    "max_depth": 4,
    "colsample_bytree": 0.6,
    "colsample_bylevel": 0.3,
}


In [None]:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Base estimator for the search. `random_state` is the scikit-learn-API name
# for the seed (same value as before).
xgbc = XGBClassifier(random_state=20)

# Candidate values sampled by the randomized search. The fractional grids are
# rounded to one decimal so the candidates are exactly 0.2 ... 0.6 rather than
# np.arange artefacts such as 0.30000000000000004.
params = {
    "max_depth": [2, 3, 4],
    "learning_rate": [0.03, 0.04, 0.05],
    "n_estimators": np.arange(100, 1000, 100),
    "colsample_bytree": np.round(np.arange(0.2, 0.7, 0.1), 1),
    "colsample_bylevel": np.round(np.arange(0.2, 0.7, 0.1), 1),
    "colsample_bynode": np.round(np.arange(0.2, 0.7, 0.1), 1),
}

# 75 random draws from the grid, scored by accuracy. Without a random_state the
# sampled candidates (and therefore the "best" parameters) differ on every run;
# fixing it makes Restart & Run All reproducible.
clf = RandomizedSearchCV(estimator=xgbc,
                         param_distributions=params,
                         scoring='accuracy',
                         n_iter=75,
                         random_state=20,
                         verbose=1)


In [None]:
# Earlier experiments fitted a fixed-parameter model by hand with early stopping
# (early_stopping_rounds=10 in the call, 5 noted as best), eval_set=[(X_val, y_val)]
# and verbose=False. That code lived in a string literal and never ran; the
# randomized search below replaces it.

# Run the hyper-parameter search on the training split.
clf.fit(X_train, y_train)

print("Best parameters:", clf.best_params_)
# Mean cross-validated accuracy of the winning candidate (scoring='accuracy').
print("Best CV accuracy:", clf.best_score_)

# Later cells use `xgb_model` and `trained_xgb`, which were previously assigned
# only inside the string literal, so a fresh kernel failed with NameError.
# With the search's default refit=True, best_estimator_ is the winning
# configuration refitted on all of X_train.
xgb_model = clf.best_estimator_
trained_xgb = xgb_model.predict(X_val)


In [None]:
# Show the predicted class codes for the validation split.
print(str(trained_xgb))

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the validation predictions against the held-out labels.
# MAE and R^2 are computed on the integer class codes produced by the label
# encoder; accuracy is reported as a percentage.
val_mae = mean_absolute_error(y_val, trained_xgb)
val_r2 = r2_score(y_val, trained_xgb)
accuracy = accuracy_score(y_val, trained_xgb)

print("Mean absolute error: {}\n".format(val_mae))
print("r2 score: {}".format(val_r2))
print("Accuracy: {}".format(accuracy * 100))

In [None]:
# Path of the competition test CSV, relative to the notebook's working directory.
pg_comp_test_filepath = "../input/tabular-playground-series-jun-2021/test.csv"
# Load the whole test table into a DataFrame.
pg_test_data = pd.read_csv(filepath_or_buffer=pg_comp_test_filepath)

In [None]:
# Preview the first five test rows (rich display as the cell's last expression).
pg_test_data.head(n=5)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
pg_test_data.info()
print("\n")
# (rows, columns) of the test table.
print(pg_test_data.shape)

In [None]:
# Test feature matrix: the raw test table without its row identifier.
# The original row index is kept as-is.
test_data = pg_test_data.drop(columns=['id'])

In [None]:
# Rich display of the test feature frame (the 'id' column has already been dropped).
display(test_data)


In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so none
# of it is executed (no mlxtend import, no `test_df` is created).
# NOTE(review): as the only expression in the cell, the string is presumably
# echoed as the cell's output — consider deleting it if the experiment is abandoned.
'''
from mlxtend.preprocessing import minmax_scaling

test_df = minmax_scaling(test_data, columns=[0])
'''

In [None]:
# Predict a class for every row of the test feature frame.
# `xgb_model` must already hold a fitted classifier when this cell runs.
final_test = xgb_model.predict(test_data)

In [None]:
# First 100000 entries of the training label vector `y`, taken positionally.
# NOTE(review): these are labels of training rows; confirm they are only used
# where training-row labels are actually the intended reference.
val = y.iloc[:100000]

In [None]:
# `final_test` holds predictions for the competition test file, which is fed to
# the model with only 'id' removed and therefore carries no labels to score
# against. Comparing it with the first rows of the training labels (as this cell
# used to) pairs unrelated rows, so the resulting MAE / R^2 / accuracy were
# meaningless. As a sanity check, compare the share of each class among the
# training labels with the share among the test predictions instead.
class_share = pd.DataFrame({
    "train_actual": y.value_counts(normalize=True),
    "test_predicted": pd.Series(final_test).value_counts(normalize=True),
}).sort_index().fillna(0.0)  # a class the model never predicts shows as 0.0

class_share

In [None]:
X_test = test_data

# One probability column per class, in the model's class order.
test_preds = xgb_model.predict_proba(X_test)

# Submission header: Class_1 ... Class_9.
# NOTE(review): assumes the integer codes assigned by the label encoder follow
# the same order as these names — confirm against label_encoder.classes_.
class_columns = ["Class_{}".format(i) for i in range(1, 10)]
export = pd.DataFrame(test_preds, columns=class_columns)

# Put the original row ids back in front of the probabilities.
export = pd.concat([pg_test_data['id'], export], axis=1)

export.to_csv('submission.csv', index=False)

# Preview of what was written. This must be the last expression of the cell to
# be displayed; in the middle of the cell it was a silent no-op.
export.head()