In [None]:
import xgboost
import pandas as pd
import numpy as np
import sklearn
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation / unused-parameter warnings.
warnings.filterwarnings("ignore")

In [None]:
# Load the test features and the submission template from the competition input folder.
test = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv"
)
sample_sub = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/sample_submission.csv"
)

In [None]:
# Toggle between the small (debug) and the full hyper-parameter grid used later on.
debugging = True
#debugging = False

# The training file is read in full in both modes, and its shape is printed.
# (A bootstrap sub-sample, `.sample(frac=0.8, replace=True)`, was once applied
# in debug mode but is currently disabled.)
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv")
print(train.shape)

In [None]:
# duplicates_train = train.duplicated().sum()
# print('Duplicates in train data: {0}'.format(duplicates_train))
# duplicates_test = test.duplicated().sum()
# print('Duplicates in test data: {0}'.format(duplicates_test))
# train.drop_duplicates(keep='first', inplace=True)
# duplicates_train = train.duplicated().sum()
# print('Train data shape:', train.shape)
# print('Duplicates in train data: {0}'.format(duplicates_train))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return ``df``.

    Conversion rules, applied column by column:

    * NumPy integer columns (signed or unsigned, any width) become the
      narrowest of int8 / int16 / int32 / int64 whose range contains the
      column's minimum and maximum (bounds inclusive). Columns that fit none
      of them, and empty columns, are left unchanged.
    * NumPy float columns become float16 or float32 when their minimum and
      maximum lie strictly inside that type's finite range, else float64.
    * Every other column (strings, booleans, datetimes, extension dtypes, ...)
      is converted to ``category``.

    The function is idempotent: calling it again on an already reduced frame
    keeps the numeric columns numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        If true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are downcast numerically; anything else
        # (including pandas extension dtypes) goes to the category branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind in ('i', 'u'):
            if df[col].empty:
                # min()/max() are undefined for an empty column; nothing to do.
                continue
            # Plain Python ints keep the range checks free of NumPy overflow rules.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NOTE: float16 keeps roughly three significant decimal digits, so
            # this branch trades precision for memory.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN columns and columns containing +/-inf.
                df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the three frames. reduce_mem_usage rewrites each frame's columns in
# place, so the return values of the first two calls can be discarded; the frame
# returned by the last call becomes the cell output.
reduce_mem_usage(train)
reduce_mem_usage(test)
reduce_mem_usage(sample_sub)

In [None]:
# Encode the class labels as consecutive integers, numbered in order of first
# appearance in the training data, and keep the inverse mapping for decoding
# the predictions later.
targets = train.target.unique()
target_dict = {t: i for i, t in enumerate(targets)}
target_dict_inv = {v: k for k, v in target_dict.items()}
# `.map` on a categorical column returns another categorical; cast to plain
# integers so the estimators receive ordinary numeric class labels.
train["target_map"] = train.target.map(target_dict).astype("int64")
# Every column except the label columns and the row identifier is a feature.
# NOTE(review): `row_id` is assumed to be a pure identifier with no predictive
# meaning; the exclusion is a no-op if the column does not exist. Confirm.
features = [c for c in train.columns if c not in ("row_id", "target", "target_map")]
target="target_map"
df_test=test[features]
y = train[target]
X = train[features]

In [None]:
# Show the shape of the encoded target vector.
y.shape

In [None]:
# Show the shape of the feature matrix.
X.shape

In [None]:
# Hyper-parameter grid for the grid search.
# The keys are the XGBClassifier argument names themselves: the search is run on
# the bare classifier, not on a Pipeline, so a "<step>__" prefix would not
# address any real hyper-parameter.
if debugging:
    # Small grid (2 x 2 x 2) for quick iterations.
    params = dict(n_estimators=[10, 100], max_depth=[1,10], learning_rate =[0.01,0.05])
else:
    # Full grid (5 x 5 x 3).
    params = dict(n_estimators=[10, 50, 100, 150, 200], max_depth=[10,8,6,4,2], learning_rate =[0.01,0.03,0.06])

In [None]:
# Set to False to train on the CPU with XGBoost's default tree method.
GPU = True
#GPU = False

# Settings shared by the CPU and GPU configurations.
# (`metric='multiclass'` was dropped: it is not an XGBoost parameter; the
# evaluation metric is controlled by `eval_metric`.)
_xgb_settings = dict(num_class=10,
                     eval_metric='mlogloss',
                     random_state=911,
                     n_jobs=0,
                     use_label_encoder= False)
if GPU:
    # Histogram-based tree construction on the GPU.
    _xgb_settings['tree_method'] = 'gpu_hist'

xgb_model = XGBClassifier(**_xgb_settings)

In [None]:
# Two independently seeded, shuffled 4-fold stratified splitters:
# one for the hyper-parameter search, one for the outer evaluation loop.
inner_cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=123,
)
outer_cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=321,
)

In [None]:
# inner_cv
# Grid search over `params`, scored by balanced accuracy on the inner folds.
# The former `iid=False` argument is omitted: it was removed in scikit-learn
# 0.24 (passing it raises TypeError), and since 0.22 the default behaviour is
# already the non-iid averaging.
gcv = GridSearchCV(xgb_model,
                   params,
                   scoring='balanced_accuracy',
                   cv=inner_cv,
                   n_jobs=1,
                   return_train_score=False)

In [None]:
# outer cv
# Nested cross-validation: every outer fold runs the complete grid search on its
# training part and scores the refitted winner on its held-out part.
results = cross_validate(
    gcv,
    X,
    y,
    cv=outer_cv,
    scoring='balanced_accuracy',
    return_train_score=False,
    n_jobs=2,
)

In [None]:
# Tabulate the nested-CV output (one row per outer fold); leaving the frame as
# the last expression lets the notebook render it as a rich table.
pd.DataFrame(results)

In [None]:
# Show the shape of the full feature matrix before the hold-out split.
X.shape

In [None]:
# Show the shape of the full target vector before the hold-out split.
y.shape

In [None]:
# Stratified 90/10 hold-out split. The seed is fixed so the split, and every
# number derived from it below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size=0.10,
                                                    random_state=911)

In [None]:
# Show the shape of the training features.
X_train.shape

In [None]:
# Show the shape of the training labels.
y_train.shape

In [None]:
# Show the shape of the hold-out features.
X_test.shape

In [None]:
# Show the shape of the hold-out labels.
y_test.shape

In [None]:
# Sanity check: row-count difference between training features and labels (0 when they line up).
X_train.shape[0]-y_train.shape[0]

In [None]:
# Sanity check: row-count difference between hold-out features and labels (0 when they line up).
X_test.shape[0]-y_test.shape[0]

In [None]:
# Run the grid search on the training part of the hold-out split.
gcv.fit(X_train,y_train)

In [None]:
# Show the parameter combination selected by the grid search.
gcv.best_params_

In [None]:
# Rebind xgb_model to the best estimator found by the fitted grid search.
# NOTE: from here on the name no longer refers to the unfitted base classifier.
xgb_model = gcv.best_estimator_

In [None]:
# Refit the selected model on the training part of the hold-out split.
# The fit-time `eval_metric` argument was dropped: it is deprecated in XGBoost
# (removed in 2.0, where it raises TypeError) and had no effect here because no
# `eval_set` is supplied. The estimator's own `eval_metric` setting is kept.
# `verbose` is likewise only meaningful together with an `eval_set`.
# NOTE(review): with GridSearchCV's default refit, best_estimator_ is already
# fitted on this same data, so this call looks redundant - confirm.
xgb_model.fit(X_train,y_train,
              verbose=True)

In [None]:
# Score the fitted model on the hold-out set: predict, round every predicted
# value to the nearest integer, and report plain accuracy as a percentage.
y_pred = xgb_model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# 5-fold stratified CV accuracy of the selected model on the training split.
# The shuffled splitter is seeded so the fold assignment, and therefore the
# reported score, is reproducible between runs.
scores_xgb_model =cross_val_score(xgb_model,
                       X_train,
                       y_train,
                       scoring='accuracy',
                       cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=911))

In [None]:
# Mean accuracy over the CV folds for the selected model.
print('cross_val_scores=  ',scores_xgb_model.mean())

In [None]:
# 5-fold stratified CV accuracy of the whole grid-search procedure on the
# training split (each fold re-runs the search on its own training part).
# The shuffled splitter is seeded so the result is reproducible between runs.
scores=cross_val_score(gcv,
                       X_train,
                       y_train,
                       scoring='accuracy',
                       cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=911))

In [None]:
# Mean accuracy over the CV folds for the grid-search procedure.
print('cross_val_scores=  ',scores.mean())

In [None]:
# Out-of-fold predictions for every training row (each row is predicted by a
# model that did not see it). The shuffled splitter is seeded for reproducibility.
# NOTE: this rebinds y_pred, which previously held the hold-out predictions.
y_pred=cross_val_predict(xgb_model,
                         X_train,
                         y_train,
                         cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=911))

In [None]:
# Show the out-of-fold predictions.
y_pred

In [None]:
# Show the shape of the out-of-fold prediction array.
y_pred.shape

In [None]:
# Confusion matrix of the out-of-fold predictions. scikit-learn expects
# (y_true, y_pred): rows are the true classes, columns the predicted classes.
conf_mat=confusion_matrix(y_train,y_pred)
conf_mat

In [None]:
# Predict the integer-encoded class of every test row and store it in the submission frame.
sample_sub['target'] = xgb_model.predict(df_test)

In [None]:
# Show the shape of the submission frame.
sample_sub.shape

In [None]:
# sample_sub['target'] = gcv.best_estimator_.predict(df_test)

In [None]:
# Preview the first ten rows of the submission frame (targets still integer-encoded at this point).
sample_sub.head(10)

In [None]:
# Translate the integer class codes back to the original target labels.
# NOTE: not idempotent - running this cell a second time maps every value to NaN,
# because the decoded labels are not keys of target_dict_inv.
sample_sub['target']=sample_sub['target'].map(target_dict_inv)

In [None]:
# Display the decoded submission frame.
sample_sub

In [None]:
# Write the submission file to the working directory, without the index column.
sample_sub.to_csv('submission.csv', index=False)