In [None]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
import optuna
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_csv_path = '../input/tabular-playground-series-feb-2022/train.csv'
test_csv_path = '../input/tabular-playground-series-feb-2022/test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
# Preserve an unmodified copy of the training frame and the test row ids;
# both are read again when the submission frame is assembled, after
# `df_train` and `df_test` have been filtered and transformed.
train_data = df_train.copy()
df_test_row_id = df_test['row_id']

In [None]:
# Class-balance table: one row per target class with its count ('target'),
# its label ('species') and its share of all rows in percent ('percentage').
# The columns are built explicitly from the counts Series so the frame does not
# depend on how `value_counts()` names its result (pandas 2.0 changed that
# name, which would otherwise make the 'target' count column disappear).
target_counts = df_train['target'].value_counts()
df_tar = pd.DataFrame({
    'target': target_counts.to_numpy(),
    'species': target_counts.index.to_numpy(),
})
# One vectorised expression; no per-row loop is needed because the division
# already operates on the whole column.
df_tar['percentage'] = df_tar['target'] / df_tar['target'].sum() * 100
df_tar

In [None]:
# Horizontal bar chart of rows per class, using the count table built above.
plt.figure(figsize=(12, 6))
count_axes = sns.barplot(data=df_tar, x='target', y='species')
# Rotate the x tick labels so they are drawn vertically.
plt.setp(count_axes.get_xticklabels(), rotation=90)
plt.show()

# Data Cleaning

In [None]:
# Every test column except the identifier; used as the join / de-duplication key.
# Compare with `!=`: `('row_id')` is just the string 'row_id', so the previous
# `not in ('row_id')` was a substring test that would also have excluded any
# column whose name is a substring of 'row_id' (e.g. 'id' or 'row').
cols = [e for e in df_test.columns if e != 'row_id']

In [None]:
# Inner-join train and test on all shared feature columns: each resulting row is
# a (train row, test row) pair with identical features. The clashing `row_id`
# columns get pandas' default suffixes (`row_id_x` from train, `row_id_y` from test).
s1 = df_train.merge(df_test, on=cols, how='inner')
s1.head()

In [None]:
# Map each test row_id to the train row_id that has identical feature values.
# Built directly from the two id columns instead of chained `.loc[i][...]`
# lookups, which materialise a full row Series for every pair and may upcast
# the ids. If a test row matches several train rows, the last pair wins, exactly
# as with the previous sequential assignment.
dic = dict(zip(s1['row_id_y'].tolist(), s1['row_id_x'].tolist()))

In [None]:
# Remove from the training frame every row whose features also occur in the
# test set, then collapse remaining duplicate feature rows.
s1 = s1.set_index('row_id_x')
s1_idx = s1.index.to_list()
# Filter on the `row_id` column rather than `drop(s1_idx)`: `drop` interprets
# the values as index labels, which is only correct while `row_id` happens to
# equal the frame's positional index.
df_train = df_train[~df_train['row_id'].isin(s1_idx)]

df_train = df_train.drop_duplicates(subset=cols, keep='first')
# Fresh 0..n-1 index; the fold assignment below writes by these labels.
df_train = df_train.reset_index(drop=True)

In [None]:
# Tag every training row with the index (0-4) of the shuffled fold in which it
# serves as validation data; the result is stored in a new `kfold` column.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = kf.split(X=df_train)
for fold, (_fit_rows, held_out_rows) in enumerate(fold_splits):
    df_train.loc[held_out_rows, "kfold"] = fold

In [None]:
# Model inputs: every training column except the id, the label and the fold tag.
useful_features = [c for c in df_train.columns if c not in ("row_id", "target", "kfold")]
# `.copy()` makes df_test an independent frame, so the assignment below does not
# write into a slice of the original (SettingWithCopyWarning / lost writes).
df_test = df_test[useful_features].copy()
# Apply log(1 + x) to all feature columns of both frames in one vectorised step.
df_train[useful_features] = np.log1p(df_train[useful_features])
df_test[useful_features] = np.log1p(df_test[useful_features])


# Hyperparameter Tuning

In [None]:
def xgboost(trial):
    """Optuna objective: fit an XGBoost classifier on folds 1-4 and score it on fold 0.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample learning_rate, reg_lambda, reg_alpha, subsample,
        colsample_bytree and max_depth.

    Returns
    -------
    float
        Misclassification rate (1 - accuracy) on the validation fold; lower is
        better, so the study should keep ``direction="minimize"``.

    Raises
    ------
    ValueError
        From ``LabelEncoder.transform`` if the validation fold contains a class
        that is absent from the training folds.

    Notes
    -----
    Reads the module-level ``df_train`` (with its ``kfold`` column) and
    ``useful_features``.
    """
    fold = 0

    # All three log-scaled parameters use suggest_float(log=True), matching
    # learning_rate; suggest_loguniform is the deprecated spelling of the same call.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.1, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 25)

    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    le = preprocessing.LabelEncoder()
    ytrain = le.fit_transform(ytrain)
    # Reuse the mapping learned from the training labels. Refitting on the
    # validation labels could assign different integers to the same class
    # whenever the two label sets differ.
    yvalid = le.transform(yvalid)

    model = XGBClassifier(random_state=fold,
                          tree_method='gpu_hist',
                          gpu_id=0, predictor="gpu_predictor",
                          use_label_encoder=False,
                          n_jobs=-1,
                          n_estimators=1000,
                          # Classification metric; 'rmse' is a regression metric.
                          eval_metric='mlogloss',
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda,
                          reg_alpha=reg_alpha,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          max_depth=max_depth,
                         )
    model.fit(xtrain, ytrain)
    # The encoded class ids are nominal, so a distance between them (RMSE) says
    # nothing about prediction quality. Score with the error rate instead, the
    # complement of the accuracy printed by the cross-validation cell.
    error_rate = 1.0 - model.score(xvalid, yvalid)
    return error_rate


In [None]:
# study = optuna.create_study(direction="minimize")
# study.optimize(xgboost, n_trials=20,gc_after_trial=True)

In [None]:
# study.best_params

# Cross Validation

In [None]:
# 5-fold cross-validation with fixed hyperparameters. Each pass trains on four
# folds and prints the accuracy on the held-out fold. After the loop, `model`
# and `le` are the classifier and label encoder of the last fold; the
# prediction cell below uses them.
for fold in range(5):

    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    le = preprocessing.LabelEncoder()
    ytrain = le.fit_transform(ytrain)
    # Encode validation labels with the mapping learned from the training
    # labels. Refitting here could shift the integer codes and would leave `le`
    # fitted on validation labels for the later `inverse_transform`.
    yvalid = le.transform(yvalid)

    model = XGBClassifier(random_state=fold,
                          tree_method='gpu_hist',
                          gpu_id=0, predictor="gpu_predictor",
                          use_label_encoder=False,
                          n_jobs=-1,
                          n_estimators=1000,
                          # Classification metric; 'rmse' is a regression metric.
                          eval_metric='mlogloss',
                          learning_rate=0.0958614407371858,
                          reg_lambda=0.013277441840190538,
                          reg_alpha=0.0078106631860548935,
                          subsample=0.5644494238856632,
                          colsample_bytree=0.854776309994251,
                          max_depth=15,
                         )
    model.fit(xtrain, ytrain)
    print(f'fold{fold} ',model.score(xvalid,yvalid))
    
    

# Predictions and Submission

In [None]:
# Predict encoded classes for the test features with the `model` left over from
# the cross-validation loop, decode them back to the original labels with the
# matching `le`, and pair each label with its test row id.
preds = model.predict(df_test)
res = le.inverse_transform(preds)
df = pd.DataFrame({'row_id': df_test_row_id}).assign(target=res)

In [None]:
# For test rows whose features also occur in the training data, replace the
# model's prediction with the known training label. `dic` maps a test row_id to
# the matching train row_id. Two vectorised lookups replace a full scan of
# both frames for every key.
train_target_by_row_id = (
    train_data.drop_duplicates(subset='row_id', keep='first')  # first match wins, as before
    .set_index('row_id')['target']
)
known_target = df['row_id'].map(dic).map(train_target_by_row_id)
# Keep the prediction wherever no training label was found.
df['target'] = df['target'].where(known_target.isna(), known_target)

In [None]:
# Write the submission frame to disk without the DataFrame index column.
df.to_csv(path_or_buf='Submission.csv', index=False)