Reference: [XGBoost GPU support documentation](https://xgboost.readthedocs.io/en/latest/gpu/index.html)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Recursively list every file found under /kaggle/input, one full path per line.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
### Libraries
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import shap
from scipy.stats import probplot, kurtosis, skew, gmean
from sklearn.mixture import GaussianMixture

In [None]:
### Config
# Input CSV locations: training data, test data and the sample submission file.
train_, test_, sub_ = (
    '../input/tabular-playground-series-feb-2021/train.csv',
    '../input/tabular-playground-series-feb-2021/test.csv',
    '../input/tabular-playground-series-feb-2021/sample_submission.csv',
)

In [None]:
# Read the training CSV into a DataFrame and print its column/dtype/memory summary.
df_train= pd.read_csv(train_)
df_train.info()

In [None]:
# Display the first ten rows of the training frame.
df_train.head(10)

In [None]:
def plot_target(target):
    """Print summary statistics for a ``df_train`` column and draw a 2x2 diagnostic figure.

    The figure contains: the column's distribution with mean/median markers,
    a probability plot, the distribution again, and the distributions of the
    two groups assigned by a 2-component Gaussian mixture fitted on the column.

    Side effect: the mixture's predicted component of every row is written to
    the module-level ``df_train`` as a new column named ``<target>_class``.

    Parameters
    ----------
    target : str
        Name of the column of ``df_train`` to analyse.
    """
    values = df_train[target]

    print(f'Target feature {target} Statistical Analysis\n{"-" * 42}')

    print(f'Mean: {values.mean():.4}  -  Median: {values.median():.4}  -  Std: {values.std():.4}')
    print(f'Min: {values.min():.4}  -  25%: {values.quantile(0.25):.4}  -  50%: {values.quantile(0.5):.4}  -  75%: {values.quantile(0.75):.4}  -  Max: {values.max():.4}')
    print(f'Skew: {values.skew():.4}  -  Kurtosis: {values.kurtosis():.4}')
    n_missing = int(values.isnull().sum())
    n_rows = len(df_train)
    print(f'Missing Values: {n_missing}/{n_rows} ({n_missing * 100 / n_rows:.4}%)')

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24, 12), dpi=100)
    (ax_dist, ax_prob), (ax_before, ax_after) = axes

    # Top row: raw distribution with mean/median markers, and a probability plot.
    sns.distplot(values, label=target, ax=ax_dist)
    ax_dist.axvline(values.mean(), label='Target Mean', color='r', linewidth=2, linestyle='--')
    ax_dist.axvline(values.median(), label='Target Median', color='b', linewidth=2, linestyle='--')
    probplot(values, plot=ax_prob)

    # Fit a two-component mixture on the column and store each row's component.
    column_vector = values.values.reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(column_vector)
    class_column = f'{target}_class'
    df_train[class_column] = gmm.predict(column_vector)

    # Bottom row: the whole distribution, then one curve per mixture component.
    sns.distplot(values, label=target, ax=ax_before)
    for component_id, component_label in enumerate(('Component 1', 'Component 2')):
        in_component = df_train[class_column] == component_id
        sns.distplot(df_train.loc[in_component, target], label=component_label, ax=ax_after)

    for legend_axis in (ax_dist, ax_after):
        legend_axis.legend(prop={'size': 15})

    for panel in axes.flat:
        panel.tick_params(axis='x', labelsize=12)
        panel.tick_params(axis='y', labelsize=12)
        panel.set_xlabel('')
        panel.set_ylabel('')

    panel_titles = (
        f'{target} Distribution in Training Set',
        f'{target} Probability Plot',
        f'{target} Distribution Before GMM',
        f'{target} Distribution After GMM',
    )
    for panel, panel_title in zip(axes.flat, panel_titles):
        panel.set_title(panel_title, fontsize=15, pad=12)
    plt.show()


plot_target('target')

In [None]:
# Read the test CSV and display the resulting frame.
test_df = pd.read_csv(test_)
test_df

In [None]:
# Use seaborn's "whitegrid" style and the 'Purples' palette for subsequent plots.
sns.set_style("whitegrid")
sns.set_palette('Purples')

In [None]:
# One count plot per categorical column, laid out on a 2x5 grid in row-major order.
fig, axes = plt.subplots(2, 5, figsize=(18, 12), constrained_layout=True)
count_columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4',
                 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
for panel, column_name in zip(axes.ravel(), count_columns):
    sns.countplot(ax=panel, x=column_name, data=df_train)
plt.suptitle('Count Plot for Each Columns')
plt.show()

In [None]:
# Number of training rows per distinct value of cat3.
df_train['cat3'].value_counts()

In [None]:
# Number of training rows per distinct value of cat4.
df_train['cat4'].value_counts()

In [None]:
# Number of training rows per distinct value of cat6.
df_train['cat6'].value_counts()

In [None]:
# Number of training rows per distinct value of cat7.
df_train['cat7'].value_counts()

In [None]:
# Number of training rows per distinct value of cat8.
df_train['cat8'].value_counts()

In [None]:
# Number of training rows per distinct value of cat9.
df_train['cat9'].value_counts()

In [None]:
# Fold the cat6 level 'G' into 'A'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the selected column is a chained in-place operation, which warns in
# pandas 2.2 and no longer updates df_train under Copy-on-Write.
df_train['cat6'] = df_train['cat6'].replace('G', 'A')

In [None]:
def preprocessing(column, target, result, df=None):
    """Replace every occurrence of one value with another in a single column.

    Parameters
    ----------
    column : str
        Name of the column to modify.
    target : object
        Value to look for in ``column``.
    result : object
        Value that replaces ``target``.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the module-level ``df_train`` so existing
        three-argument calls keep working; pass another frame (for example the
        test set) to apply the same merge there.

    Returns
    -------
    None
        The frame is modified in place by re-assigning the column.
    """
    frame = df_train if df is None else df
    # Assign the replaced column back instead of using a chained
    # ``frame[column].replace(..., inplace=True)``, which warns in pandas 2.2
    # and does not modify the frame under Copy-on-Write.
    frame[column] = frame[column].replace(target, result)
# Category merges applied to the training frame, in this order:
# (column, value to replace, replacement value).
category_merges = [
    ('cat6', 'H', 'A'),
    ('cat6', 'E', 'A'),
    ('cat6', 'I', 'A'),
    ('cat6', 'D', 'A'),
    ('cat3', 'B', 'C'),
    ('cat7', 'I', 'E'),
    ('cat7', 'C', 'E'),
    ('cat7', 'A', 'E'),
    ('cat7', 'F', 'E'),
    ('cat8', 'B', 'C'),
    ('cat9', 'C', 'F'),
    ('cat9', 'D', 'F'),
    ('cat9', 'E', 'F'),
]
for merge_column, old_value, new_value in category_merges:
    preprocessing(merge_column, old_value, new_value)

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [None]:
# Integer-encode each categorical training column. The same encoder object is
# refit on every column, so after the loop `lb` holds the fit for cat9 only.
lb = LabelEncoder()
for cat_col in ('cat0', 'cat1', 'cat2', 'cat3', 'cat4',
                'cat5', 'cat6', 'cat7', 'cat8', 'cat9'):
    df_train[cat_col] = lb.fit_transform(df_train[cat_col])

In [None]:
# Show the distinct encoded values of the first five categorical columns.
print(*(df_train[cat_col].unique() for cat_col in ('cat0', 'cat1', 'cat2', 'cat3', 'cat4')))

In [None]:
# List the column labels currently present in the training frame.
df_train.columns

In [None]:
# Model inputs: the ten categorical columns followed by the fourteen continuous ones.
categorical_columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4',
                       'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
continuous_columns = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
                      'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
data = df_train[categorical_columns + continuous_columns]

In [None]:
# Standardise every feature column; `sb` keeps the fitted statistics.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split performed later in the notebook.
sb = StandardScaler()
data = sb.fit(data).transform(data)

In [None]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    df_train['target'],
    test_size=0.2,
    random_state=2021,
)

In [None]:
# Print the shapes of the training features and training labels.
print(X_train.shape,y_train.shape)

In [None]:
# Wrap each split (features + labels) in an XGBoost DMatrix.
d_train, d_test = (
    xgb.DMatrix(split_features, label=split_labels)
    for split_features, split_labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Booster hyper-parameters passed to xgb.train.
model_parameters = {
    'learning_rate': 0.002,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 0.6,
    'colsample_bynode': 0.6,
    # Was misspelled 'sumbsample', so the intended 80% row subsampling was
    # never applied.
    'subsample': 0.8,
    'max_depth': 14,
    'gamma': 0,
    'min_child_weight': 200,
    'lambda': 0,
    'alpha': 0,
    'objective': 'reg:squarederror',
    # Explicit seed instead of None so the run is reproducible by construction
    # (0 is XGBoost's documented default).
    'seed': 0,
    # XGBoost's name for this setting is 'booster' ('boosting_type' is not an
    # XGBoost parameter).
    'booster': 'gbtree',
    # NOTE(review): 'gpu_hist' requires a GPU runtime; newer XGBoost releases
    # prefer tree_method='hist' together with device='cuda' — confirm against
    # the installed version before changing.
    'tree_method': 'gpu_hist',
    # Replaces the unrecognised 'silent'/'verbose' keys; 1 is the documented
    # default level (warnings).
    'verbosity': 1,
    'n_jobs': -1,
}

# Train for up to 25000 rounds, logging the validation metric every 100 rounds
# and stopping once it has not improved for 200 consecutive rounds.
model = xgb.train(
    model_parameters,
    d_train,
    num_boost_round=25000,
    evals=[(d_test, "test")],
    verbose_eval=100,
    early_stopping_rounds=200,
)

In [None]:
# Encode the test categoricals with the SAME level -> integer mapping as the
# training data. Fitting a fresh encoder on the test set (and skipping the
# level merges done on the training set) can assign different integers to the
# same category, which silently corrupts the model inputs.
#
# The training frame is already integer-encoded at this point, so the raw
# training categories are re-read from disk to rebuild each column's encoder.
test_cat_columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4',
                    'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

# Same merges as the ones applied to the training frame before it was encoded.
# NOTE(review): assumes those training-side merges took effect — confirm.
level_merges = {
    'cat3': {'B': 'C'},
    'cat6': {'G': 'A', 'H': 'A', 'E': 'A', 'I': 'A', 'D': 'A'},
    'cat7': {'I': 'E', 'C': 'E', 'A': 'E', 'F': 'E'},
    'cat8': {'B': 'C'},
    'cat9': {'C': 'F', 'D': 'F', 'E': 'F'},
}

train_cats_raw = pd.read_csv(train_, usecols=test_cat_columns)
for cat_col in test_cat_columns:
    train_levels = train_cats_raw[cat_col]
    test_levels = test_df[cat_col]
    merges = level_merges.get(cat_col)
    if merges:
        train_levels = train_levels.replace(merges)
        test_levels = test_levels.replace(merges)
    # Fit on the training levels only; transform() raises ValueError if the
    # test set contains a level never seen in training, instead of silently
    # inventing a code for it.
    lb = LabelEncoder().fit(train_levels)
    test_df[cat_col] = lb.transform(test_levels)

In [None]:
# Test-set model inputs, in the same column order used for the training features.
categorical_columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4',
                       'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
continuous_columns = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
                      'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
x__test = test_df[categorical_columns + continuous_columns]

In [None]:
# Scale the test features with the statistics learned from the training data.
# transform() is used instead of fit_transform(): refitting on the test set
# would standardise it with its own mean/std, so the model would receive
# inputs on a different scale than the one it was trained on.
x__test = sb.transform(x__test)

In [None]:
# Display the scaled test feature array.
x__test

In [None]:
# Wrap the scaled test features in a DMatrix (no labels) for prediction.
d__test = xgb.DMatrix(x__test)

In [None]:
# Predict the target for every row of the test matrix.
# NOTE(review): the booster was trained with early stopping; depending on the
# installed XGBoost version, predict() may use all trained rounds rather than
# the best iteration — confirm, and restrict the iteration range if needed.
target = model.predict(d__test)

In [None]:
# Show the Python type of the prediction result.
type(target)

In [None]:
# Two-column submission frame: the test ids alongside their predicted targets.
submission = pd.DataFrame({'id': test_df['id'], 'target': target})

In [None]:
# Use the id column as the index so it is written as the first CSV column.
submission = submission.set_index('id')

In [None]:
# Show the (rows, columns) shape of the submission frame.
submission.shape

In [None]:
# Write the submission frame (index included) to 'mygood.csv' in the working directory.
submission.to_csv('mygood.csv')