# XGBClassifier

Related notebooks:

1. EDA - https://www.kaggle.com/agorinenko/feb-2022-part1-eda
2. CatBoostClassifier - https://www.kaggle.com/agorinenko/feb-2022-part2-cat-boost-classifier
3. LGBMClassifier - https://www.kaggle.com/agorinenko/feb-2022-part3-lgbm-classifier

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate

from xgboost import XGBClassifier

# Load data from the EDA notebook

In [None]:
# Training data: row_id becomes the index, so it is not among train_df.columns.
train_df = pd.read_csv('../input/feb-2022-eda/train.csv', index_col="row_id")
# Test data: row_id stays a regular column; save_submission() reads test_df[['row_id']].
test_df = pd.read_csv('../input/feb-2022-eda/test.csv')

Let's separate the target variable and the features.

In [None]:
# Every column of train_df except the identifier and the target is a feature.
# 'target_num' (the encoded target that a later cell adds to train_df) is
# excluded as well, so re-running this cell after the encoding cell does not
# pull the label into the feature list.
non_feature_columns = {'row_id', 'target', 'target_num'}
features_columns = [col for col in train_df.columns if col not in non_feature_columns]

Encode the target variable.

In [None]:
# Name of the column that holds the integer-encoded target.
target_col = 'target_num'

# Fit the encoder on the raw labels, then store the encoded labels next to
# them. `le` is kept around so predictions can be decoded back later.
le = LabelEncoder().fit(train_df['target'])
train_df[target_col] = le.transform(train_df['target'])

train_df.head()

In [None]:
# Training inputs: feature columns and the encoded target, both as float64.
# (save_submission() casts predictions back to int64 before decoding.)
X_train = train_df.loc[:, features_columns].astype(np.float64)
y_train = train_df.loc[:, target_col].astype(np.float64)

# Test inputs: the same feature columns, in the same order.
X_test = test_df.loc[:, features_columns].astype(np.float64)

# Train model

In [None]:
# TODO: tune global parameters

In [None]:
%%time

# Classifier settings, collected in one place.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' presumably need a GPU-enabled
# runtime and an XGBoost version that still accepts them — confirm.
xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}

# Fit on the full training set; this fitted model produces the test predictions.
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Validate

In [None]:
# 5-fold cross-validation of the classifier on the training data.
# The scorer is given as a plain string (the original parentheses did not make
# it a tuple); the result is read back under the "test_score" key.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Average the per-fold accuracies returned by cross_validate.
mean_accuracy = scores["test_score"].mean()
print(f'Mean validation accuracy score: {mean_accuracy}')

# Predict

In [None]:
# Predicted (still encoded) labels for the test features; save_submission()
# casts them to int64 and decodes them back with `le`.
y_pred = model.predict(X_test)

# Submission

In [None]:
def save_submission(y_pred, path="submission.csv"):
    """Decode predicted labels and write the submission file.

    Relies on two notebook-level objects: the fitted label encoder ``le`` and
    the test frame ``test_df`` (which must have a ``row_id`` column).

    Parameters
    ----------
    y_pred : array-like
        Encoded class labels predicted for the rows of ``test_df``, in the
        same order. Values are cast to ``int64`` before decoding.
    path : str, default "submission.csv"
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``row_id`` (copied from ``test_df``) and ``target``
        (the labels decoded with ``le``).

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    # np.asarray lets plain lists through as well as ndarrays.
    y_pred = np.asarray(y_pred).astype(np.int64)

    # Validate the length *before* the column assignment: the assignment
    # itself raises on a mismatch, so a check placed after it can never fire.
    if len(y_pred) != len(test_df):
        raise ValueError(
            f"Got {len(y_pred)} predictions for {len(test_df)} test rows."
        )

    y_pred_class = le.inverse_transform(y_pred)
    submission = test_df[['row_id']].copy()
    submission["target"] = y_pred_class

    # Sanity check on the output layout: exactly row_id + target.
    assert submission.shape[1] == 2

    submission.to_csv(path, index=False)
    return submission

In [None]:
# Write submission.csv from the flattened predictions and preview the first rows.
submission = save_submission(y_pred.flatten())
submission.head()