reference model : https://www.kaggle.com/hadeux/tps-nov-xgboost-baseline

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

import seaborn as sns
import sys
import csv
import datetime
import operator
import joblib
import warnings

# Import Data

In [None]:
# Read the competition's train and test tables, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-nov-2021/train.csv",
        "../input/tabular-playground-series-nov-2021/test.csv",
    )
)

In [None]:
# Count missing values per column, then display only the columns that have any.
null_counts = train.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Build modelling frames without the `id` column. `drop` returns new frames,
# so `train` and `test` keep their `id` column.
train_df = train.drop(columns=['id'])
test_df = test.drop(columns=['id'])

**XGBoost hyperparameter**



    booster='gbtree'  1. 트리,회귀(gblinear) 트리가 항상  
                         Compared with the linear booster (gblinear), the tree booster (gbtree) almost always
                      2. 더 좋은 성능을 내기 때문에 수정할 필요없다고한다. 
                         gives better performance, so it is said that this rarely needs to be changed.
    
    silent=True       1. Suppresses the running messages (no output).
                      2. 모델이 적합되는 과정을 이해하기위해선 False으로한다.
                         To understand how the model is fitted, set it to False.
    
    min_child_weight=10    1. 값이 높아지면 under-fitting 되는 경우가 있다. CV를 통해 튜닝되어야 한다.
                              Higher values may lead to under-fitting. It should be tuned via CV.
    
    max_depth=8      1. 트리의 최대 깊이를 정의함.
                        Defines the maximum depth of the tree.
    
                     2. 루트에서 가장 긴 노드의 거리.
                        Distance of longest node from root.
                     3. 8이면 중요변수에서 결론까지 변수가 9개거친다.
                        With a depth of 8, a path from the root (most important) variable down to the final decision passes through up to 9 nodes.
                     4. Typical Value is 3-10. 
    
    gamma =0         1. 노드가 split 되기 위한 loss function의 값이 감소하는 최소값을 정의한다. 
                        gamma 값이 높아질 수록 알고리즘은 보수적으로 변하고, loss function의 정의에 
                        따라 적정값이 달라지기때문에 반드시 튜닝.
                        Defines the minimum reduction in the loss function required for a node to be split.
                        The higher the gamma value, the more conservative the algorithm becomes; the appropriate
                        value depends on how the loss function is defined, so it must be tuned.
    
    nthread =4       1. XGBoost를 실행하기 위한 병렬처리(쓰레드) 갯수. 'n_jobs' 를 사용해라.
                        The number of parallel processing (threads) to execute XGBoost. Use 'n_jobs' .
    
    colsample_bytree=0.8    1. 트리를 생성할때 훈련 데이터에서 변수를 샘플링해주는 비율. 보통 0.6~0.9
                               The rate at which variables are sampled from the training data when creating the tree. Usually 0.6-0.9
    
    colsample_bylevel=0.9   1. 트리의 레벨별로 훈련 데이터의 변수를 샘플링해주는 비율. 보통0.6~0.9
                               The rate at which the variables in the training data are sampled for each level of the tree. Usually 0.6~0.9
    
    n_estimators =(int)     1. 부스트트리의 양 amount of boost tree
                            2. 트리의 갯수. number of trees.
     
    objective = 'reg:linear','binary:logistic','multi:softmax','multi:softprob'
                 1.  regression case 'reg',
                 2.  In case of binary classification 'binary',
                 3.  Multiple classification case 'multi',
                 4.  When returning a classified class 'softmax',
                 5.  When returning the probability of belonging to each class 'softprob'
    
    random_state =   1. random number seed.
                     2. like seed.



# Model

# XGBoost Modeling

In [None]:
# Separate the `target` column from the remaining (predictor) columns.
y = train_df['target']
X = train_df.drop(columns='target')

# Hold out 20% of the rows for validation (80/20 split), shuffled with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Hyperparameters for the classifier, gathered in one mapping and unpacked
# into the constructor below.
xgboost_params = {
    'max_depth': 8,
    'learning_rate': 0.005,
    'n_estimators': 10000,
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',  # presumably needs a GPU-enabled runtime - confirm
    'booster': 'gbtree',
    'gamma': 0.64,
    'max_delta_step': 3,
    'min_child_weight': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'n_jobs': -1,
}

xgboost_model = xgb.XGBClassifier(**xgboost_params)

In [None]:
# Time the training run; the elapsed time is the cell's displayed value.
start = datetime.datetime.now()

# AUC is reported on both sets each round; early stopping is set to 15 rounds.
watchlist = [(X_train, y_train), (X_valid, y_valid)]

# NOTE(review): this rebinds `xgb`, which was the imported xgboost module, to
# whatever `fit` returns (expected to be the fitted classifier). Any later code
# that uses `xgb` sees that object, not the module.
xgb = xgboost_model.fit(
    X_train,
    y_train,
    eval_set=watchlist,
    eval_metric='auc',
    early_stopping_rounds=15,
    verbose=True,
)

end = datetime.datetime.now()
end - start

In [None]:
# Predict on the held-out validation features; `y_pred` feeds the metric calls
# in the following cells.
y_pred = xgboost_model.predict(X_valid)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import cohen_kappa_score
from collections import OrderedDict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import norm, skew, probplot
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import StratifiedKFold

def classifier_eval(y_valid, y_pred, y_score=None):
    """Print accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

    Args:
        y_valid: True labels.
        y_pred: Predicted class labels; used for accuracy, precision,
            recall and F1.
        y_score: Optional continuous scores (for example positive-class
            probabilities) used for ROC AUC. ROC AUC measures ranking
            quality, so hard labels understate it. When omitted, ``y_pred``
            is used, which matches the previous behaviour.

    Returns:
        None. The metrics are printed.
    """
    print('정확도(accuracy_score) : ', accuracy_score(y_valid, y_pred))
    print('정밀도(precision_score) : ', precision_score(y_valid, y_pred))
    print('재현율(recall_score) : ', recall_score(y_valid, y_pred))
    print('F1 : ', f1_score(y_valid, y_pred))
    auc_input = y_pred if y_score is None else y_score
    print('AUC : ', roc_auc_score(y_valid, auc_input))


# Pass positive-class probabilities so the AUC line reflects ranking quality
# rather than thresholded 0/1 predictions.
classifier_eval(y_valid, y_pred, xgboost_model.predict_proba(X_valid)[:, 1])

In [None]:
# Validation metrics collected into one array, in the order
# accuracy, precision, recall, F1, ROC AUC.
# ROC AUC is computed from positive-class probabilities: scoring it on hard
# 0/1 predictions discards the ranking information the metric measures.
x = np.array([
    accuracy_score(y_valid, y_pred),
    precision_score(y_valid, y_pred),
    recall_score(y_valid, y_pred),
    f1_score(y_valid, y_pred),
    roc_auc_score(y_valid, xgboost_model.predict_proba(X_valid)[:, 1]),
])

x

**result visualization**

In [None]:
# Bar labels, one per value stored in `x`.
label = [
    'accuracy',
    'precision',
    'recall_score',
    'f1_score',
    'roc_auc',
]

# One bar position per label.
index = np.arange(len(label))

plt.bar(index, x, width=0.5)
plt.ylim([0, 1])  # y-axis range: [ymin, ymax]
plt.xticks(index, label, fontsize=15, rotation=90)  # name each bar, text rotated 90 degrees
plt.ylabel('%', fontsize=18)
plt.title('evaluation index', fontsize=20)
plt.show()

**Features importance**

In [None]:
# Read the scores from the estimator itself rather than from `xgb`, so this
# cell does not depend on the module name having been rebound to the fitted
# model by an earlier cell.
booster = xgboost_model.get_booster()
fi_vals = booster.get_score(importance_type='weight')

# Look each score up by column name first, then by the positional 'f<i>' key
# (presumably what the booster uses when it has no feature names - confirm).
# Features missing from the scores get 0.
fi_dict = {
    column: float(fi_vals.get(column, fi_vals.get('f' + str(position), 0.)))
    for position, column in enumerate(X_train.columns)
}

# Most important features first.
feature_importance_ = sorted(fi_dict.items(), key=operator.itemgetter(1), reverse=True)
feature_importance_result = OrderedDict(feature_importance_)

importance = pd.DataFrame(feature_importance_, columns=['feature', 'weight'])
importance.head(10)

# Submission

In [None]:
# The model is trained and early-stopped on AUC, a ranking metric, so predict
# positive-class probabilities for the test set instead of hard 0/1 labels.
xgboost_prediction = xgboost_model.predict_proba(test_df)[:, 1]

In [None]:
# Pair each test row's `id` with its prediction, then preview the first rows.
submission_columns = {
    'id': test['id'],
    'target': xgboost_prediction,
}
submission_XGBoost = pd.DataFrame(submission_columns)
submission_XGBoost.head()

In [None]:
# Write the submission table to the working directory; `index=False` leaves
# the DataFrame index out of the file.
submission_XGBoost.to_csv('./submission.csv', index=False)