## Import Libraries & Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve

In [None]:
import warnings

# Silence library warnings so cell outputs stay readable.
# NOTE(review): this also hides deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Use the full option names: abbreviated names are resolved by pattern matching
# and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
# Competition test features and the submission template. Read from the same
# absolute Kaggle input directory as the training data so loading does not
# depend on the notebook's current working directory.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2022"
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Labelled training data; preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/tabular-playground-series-feb-2022/train.csv',
)
df.head(5)

## EDA 

Our dataset has 288 columns and 200000 rows.

In [None]:
# (number of rows, number of columns) of the training frame.
df.shape

There are no NULL values in the dataset :

In [None]:
df.info()
# For a frame this wide, the default `info()` summary is truncated and omits the
# per-column non-null counts, so count missing cells explicitly (0 means no NULLs).
df.isna().sum().sum()

In [None]:
# Column labels of the training frame.
df.columns

The data is balanced as the target classes have approximately the same proportion :

In [None]:
# Number of training rows per target class.
df["target"].value_counts()

In [None]:
# Bar chart of the number of training rows in each target class.
_, ax = plt.subplots(figsize=(22, 6))
sns.countplot(data=df, x="target", ax=ax)

All features are numerical, except the target, and, with `row_id` included in the comparison, there are no duplicated rows :

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# `row_id` is presumably a unique per-row identifier, so comparing full rows
# could never flag a duplicate. Compare the remaining columns instead.
df.drop(columns='row_id').duplicated().sum()

## Pre-processing

We drop the `row_id` identifier, separate the features from the target, and then encode the target as it is categorical :

In [None]:
# Build the feature matrix without mutating `df`, so this cell can be re-run
# safely (an in-place drop raises KeyError the second time it runs).
X = df.drop(columns=['row_id', 'target'])
y = df.target

In [None]:
# Fit the encoder on the original target column rather than on `y` itself.
# Re-running the cell would otherwise fit on already-encoded integers and
# lose the mapping back to the original class names.
lb = LabelEncoder()
y = lb.fit_transform(df["target"])

We then split our dataset into train and test sets :

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training & Evaluation

We now create a helper function to train and evaluate the given model :

In [None]:
def train_eval(model):
    """Fit `model`, report hold-out metrics, plot a learning curve, and predict the test set.

    Uses the notebook-level globals `x_train`, `y_train`, `x_test`, `y_test`
    and `test`.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict`; it is fitted on `x_train`/`y_train`.

    Returns
    -------
    Predictions for `test` (with `row_id` dropped). They are in the same label
    space as `y_train`, i.e. LabelEncoder codes, so decode them with
    `lb.inverse_transform` before using them as class names.
    """
    model.fit(x_train, y_train)
    holdout_pred = model.predict(x_test)

    # The competition test set carries an identifier column that was not
    # part of the training features.
    test_features = test.drop('row_id', axis=1)
    preds = model.predict(test_features)

    print(classification_report(y_test, holdout_pred))

    # Cross-validated scores at 10 training-set sizes, from 10% to 100% of x_train.
    sizes, train_scores, val_scores = learning_curve(
        model,
        x_train,
        y_train,
        cv=4,
        train_sizes=np.linspace(0.1, 1, 10),
    )

    # Average over the CV folds (axis 1) to get one point per training size.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(sizes, train_scores.mean(axis=1), label='train score')
    ax.plot(sizes, val_scores.mean(axis=1), label='validation score')
    ax.set(
        title=f'Learning curve - {type(model).__name__}',
        xlabel='Number of training samples',
        ylabel='Mean score across CV folds',
    )
    ax.legend()

    return preds

We choose to work with XGBClassifier models :

In [None]:
# Gradient-boosted tree classifier trained and evaluated on the GPU; every
# other hyperparameter is left at the library default.
# NOTE(review): gpu_id=1 targets the second CUDA device - confirm that is intended.
XG = xgboost.XGBClassifier(
    predictor="gpu_predictor",
    tree_method="gpu_hist",
    gpu_id=1,
)

In [None]:
# Train and evaluate the XGBoost model; keep its predictions for the competition test set.
preds = train_eval(XG)

In [None]:
# The model was trained on LabelEncoder integer codes, so map the predictions
# back to the original class names before writing the submission file.
submission['target'] = lb.inverse_transform(preds)
submission.to_csv('submission.csv', index=False)
submission.head()

## Please if you like this notebook, don't forget to upvote it!