# CV 기반 스태킹

In [16]:
import pandas as pd

data = pd.read_csv('./FIFA2.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18159 entries, 0 to 18158
Data columns (total 83 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      18159 non-null  object 
 1   Age                       18159 non-null  int64  
 2   Nationality               18159 non-null  object 
 3   Overall                   18159 non-null  int64  
 4   Potential                 18159 non-null  int64  
 5   Club                      17918 non-null  object 
 6   Value                     18159 non-null  int64  
 7   Wage                      18159 non-null  int64  
 8   Special                   18159 non-null  int64  
 9   Preferred Foot            18159 non-null  object 
 10  International Reputation  18159 non-null  float64
 11  Weak Foot                 18159 non-null  float64
 12  Skill Moves               18159 non-null  float64
 13  Work Rate                 18159 non-null  object 
 14  Body T

In [2]:
data.index

RangeIndex(start=0, stop=18159, step=1)

In [3]:
map_position = {'FW':0, 'MD':1, 'DF':2, 'GK':3}
col = ['Position simplified']
data[col] = data[col].applymap(map_position.get)
data['Position simplified']

0        0
1        0
2        0
3        3
4        1
        ..
18154    1
18155    0
18156    0
18157    0
18158    1
Name: Position simplified, Length: 18159, dtype: int64

In [4]:
X = data.loc[:, 'Crossing':'GKReflexes']
XX = X.drop('Strength', axis=1)
y = data.loc[:, 'Position simplified']
xy = pd.concat([X, y], axis=1)
xxy = pd.concat([XX, y], axis=1)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [6]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
def get_model_train_eval(model, X_train=None, X_test=None, y_train=None, y_test=None):
    model.fit(X_train, y_train)
    print('{} Test Accuracy: {}%'.format(model, round(model.score(X_test, y_test)*100, 2)))


    pred_model = model.predict(X_test)
    print('{} report:{}\n'.format(model.__class__.__name__, classification_report(y_test, pred_model)))

In [7]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import numpy as np


rf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 6, min_samples_split= 8, n_estimators= 500)
xgb = XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      eval_metric='mlogloss',
                                                      gamma=0.3, gpu_id=-1,
                                                      importance_type='gain',
                                                      interaction_constraints='',
                                                      learning_rate=0.05,
                                                      max_delta_step=0,
                                                      max_depth=9,
                                                      min_child_weight=1,
                                                      missing=np.nan,
                                                      monotone_constraints='()',
                                                      n_estimators=1000,
                                                      n_jobs=8,
                                                      num_parallel_tree=1,
                                                      objective='multi:softprob',
                                                      random_state=0,
                                                      reg_alpha=0, reg_lambda=1,
                                                      scale_pos_weight=None,
                                                      subsample=1,
                                                      tree_method='exact',
                                                      validate_parameters=1,
                                                      verbosity=None)

lgbm = LGBMClassifier(learning_rate= 0.04, n_estimators= 1000, min_child_weight=30, objective='multiclass', num_iterations=150)
gbm = GradientBoostingClassifier()
svm = SVC(kernel='rbf', C=10.0, gamma=0.01, random_state=0)

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    train_fold_pred = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__ , 'model 시작')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t 폴드 세트: ', folder_counter, '시작')
        X_tr = X_train_n.iloc[train_index]
        y_tr = y_train_n.iloc[train_index]
        X_te = X_train_n.iloc[valid_index]

        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [9]:
rf_train, rf_test = get_stacking_base_datasets(rf, X_train, y_train, X_test, 3)
xgb_train, xgb_test = get_stacking_base_datasets(xgb, X_train, y_train, X_test, 3)
lgbm_train, lgbm_test = get_stacking_base_datasets(lgbm,  X_train, y_train, X_test, 3)
gbm_train, gbm_test = get_stacking_base_datasets(gbm, X_train, y_train, X_test, 3)
svm_train, svm_test = get_stacking_base_datasets(svm, X_train, y_train, X_test, 3)

RandomForestClassifier model 시작
	 폴드 세트:  0 시작
	 폴드 세트:  1 시작
	 폴드 세트:  2 시작
XGBClassifier model 시작
	 폴드 세트:  0 시작
	 폴드 세트:  1 시작
	 폴드 세트:  2 시작
LGBMClassifier model 시작
	 폴드 세트:  0 시작
	 폴드 세트:  1 시작
	 폴드 세트:  2 시작
GradientBoostingClassifier model 시작
	 폴드 세트:  0 시작
	 폴드 세트:  1 시작
	 폴드 세트:  2 시작
SVC model 시작
	 폴드 세트:  0 시작
	 폴드 세트:  1 시작
	 폴드 세트:  2 시작


In [10]:
Stack1_final_X_train = np.concatenate((xgb_train, lgbm_train, gbm_train, svm_train), axis=1)
Stack2_final_X_train = np.concatenate((rf_train, lgbm_train, gbm_train, svm_train),  axis=1)
Stack3_final_X_train = np.concatenate((rf_train, xgb_train, gbm_train, svm_train),  axis=1)
Stack4_final_X_train = np.concatenate((rf_train, xgb_train, lgbm_train, svm_train),  axis=1)
Stack5_final_X_train = np.concatenate((rf_train, xgb_train, lgbm_train, gbm_train),  axis=1)

In [11]:
Stack1_final_X_test = np.concatenate((xgb_test, lgbm_test, gbm_test, svm_test), axis=1)
Stack2_final_X_test = np.concatenate((rf_test, lgbm_test, gbm_test, svm_test), axis=1)
Stack3_final_X_test = np.concatenate((rf_test, xgb_test, gbm_test, svm_test), axis=1)
Stack4_final_X_test = np.concatenate((rf_test, xgb_test, lgbm_test, svm_test), axis=1)
Stack5_final_X_test = np.concatenate((rf_test, xgb_test, lgbm_test, gbm_test), axis=1)

In [15]:
from sklearn.metrics import accuracy_score

rf.fit(Stack1_final_X_train, y_train)
xgb.fit(Stack2_final_X_train, y_train)
lgbm.fit(Stack3_final_X_train, y_train)
gbm.fit(Stack4_final_X_train, y_train)
svm.fit(Stack5_final_X_train, y_train)

stack1_final = rf.predict(Stack1_final_X_test)
stack2_final = xgb.predict(Stack2_final_X_test)
stack3_final = lgbm.predict(Stack3_final_X_test)
stack4_final = gbm.predict(Stack4_final_X_test)
stack5_final = svm.predict(Stack5_final_X_test)

#랜덤포레스트
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack1_final)))
#XGBoost
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack2_final)))
#LGBM
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack3_final)))
#GBM
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack4_final)))
#SVM
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack5_final)))

최종 메타 모델의 예측 정확도: 0.8835
최종 메타 모델의 예측 정확도: 0.8852
최종 메타 모델의 예측 정확도: 0.8791
최종 메타 모델의 예측 정확도: 0.8841
최종 메타 모델의 예측 정확도: 0.8844


#   배깅

## 랜덤포레스트

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

base_model_rf= make_pipeline(
    StandardScaler(),
    RandomForestClassifier(max_depth= 20, min_samples_leaf= 6, min_samples_split= 8, n_estimators= 500)
)

bagging_model = BaggingClassifier(base_model_rf, n_estimators=1000, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model_rf,
    X = X,
    y = y,
    cv= 5
)

print(round(cross_val['test_score'].mean(), 4))

0.865


## XGB

In [19]:
base_model_xgb= make_pipeline(
    StandardScaler(),
    XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      eval_metric='mlogloss',
                                                      gamma=0.3, gpu_id=-1,
                                                      importance_type='gain',
                                                      interaction_constraints='',
                                                      learning_rate=0.05,
                                                      max_delta_step=0,
                                                      max_depth=9,
                                                      min_child_weight=1,
                                                      missing=np.nan,
                                                      monotone_constraints='()',
                                                      n_estimators=1000,
                                                      n_jobs=8,
                                                      num_parallel_tree=1,
                                                      objective='multi:softprob',
                                                      random_state=0,
                                                      reg_alpha=0, reg_lambda=1,
                                                      scale_pos_weight=None,
                                                      subsample=1,
                                                      tree_method='exact',
                                                      validate_parameters=1,
                                                      verbosity=None)
)

bagging_model = BaggingClassifier(base_model_xgb, n_estimators=1000, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model_xgb,
    X = X,
    y = y,
    cv= 5
)

print(round(cross_val['test_score'].mean(), 4))

0.8657


## LGBM

In [20]:
base_model_lgbm= make_pipeline(
    StandardScaler(),
    LGBMClassifier(learning_rate= 0.04, n_estimators= 1000, min_child_weight=30, objective='multiclass', num_iterations=150)
)

bagging_model = BaggingClassifier(base_model_lgbm, n_estimators=1000, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model_lgbm,
    X = X,
    y = y,
    cv= 5
)

print(round(cross_val['test_score'].mean(), 4))

0.8676


## GBM

In [21]:
base_model_gbm = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier()
)

bagging_model = BaggingClassifier(base_model_gbm, n_estimators=1000, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model_gbm,
    X = X,
    y = y,
    cv= 5
)

print(round(cross_val['test_score'].mean(), 4))

0.859


## SVM

In [22]:
base_model_svm = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=10.0, gamma=0.01, random_state=0)
)

bagging_model = BaggingClassifier(base_model_svm, n_estimators=1000, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model_svm,
    X = X,
    y = y,
    cv= 5
)

print(round(cross_val['test_score'].mean(), 4))

0.8806
