In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [13]:
data = pd.read_csv('./FIFA2.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18159 entries, 0 to 18158
Data columns (total 83 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      18159 non-null  object 
 1   Age                       18159 non-null  int64  
 2   Nationality               18159 non-null  object 
 3   Overall                   18159 non-null  int64  
 4   Potential                 18159 non-null  int64  
 5   Club                      17918 non-null  object 
 6   Value                     18159 non-null  int64  
 7   Wage                      18159 non-null  int64  
 8   Special                   18159 non-null  int64  
 9   Preferred Foot            18159 non-null  object 
 10  International Reputation  18159 non-null  float64
 11  Weak Foot                 18159 non-null  float64
 12  Skill Moves               18159 non-null  float64
 13  Work Rate                 18159 non-null  object 
 14  Body T

In [14]:
map_position = {'FW':0, 'MD':1, 'DF':2, 'GK':3}
col = ['Position simplified']
data[col] = data[col].applymap(map_position.get)
data['Position simplified']

X = data.loc[:, 'Crossing':'GKReflexes']
X.drop(['Reactions', 'Jumping'], axis=1)
y = data.loc[:, 'Position simplified']
xy = pd.concat([X, y], axis=1)

In [11]:
xy

Unnamed: 0,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Position simplified
0,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,0
1,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,0
2,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,0
3,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,3
4,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18154,34.0,38.0,40.0,49.0,25.0,42.0,30.0,34.0,45.0,43.0,...,45.0,40.0,48.0,47.0,10.0,13.0,7.0,8.0,9.0,1
18155,23.0,52.0,52.0,43.0,36.0,39.0,32.0,20.0,25.0,40.0,...,42.0,22.0,15.0,19.0,10.0,9.0,9.0,5.0,12.0,0
18156,25.0,40.0,46.0,38.0,38.0,45.0,38.0,27.0,28.0,44.0,...,41.0,32.0,13.0,11.0,6.0,5.0,10.0,6.0,13.0,0
18157,44.0,50.0,39.0,42.0,40.0,51.0,34.0,32.0,32.0,52.0,...,46.0,20.0,25.0,27.0,14.0,6.0,14.0,8.0,9.0,0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

In [6]:
def get_model_train_eval(model, model_name, X_train=None, X_test=None, y_train=None, y_test=None):
    model.fit(X_train, y_train)
    print('{} Test Accuracy: {}%'.format(model, round(model.score(X_test, y_test)*100, 2)))


    pred_model = model.predict(X_test)
    print('{} report:{}\n'.format(model_name, classification_report(y_test, pred_model)))

In [7]:
from sklearn.svm import SVC

linear_svc = SVC(kernel='linear')
polynoimal_svc = SVC(kernel='poly')
rbf_svc = SVC(kernel='rbf')

get_model_train_eval(linear_svc, 'Linear SVC', X_train, X_test, y_train, y_test)
get_model_train_eval(polynoimal_svc, 'Ploy SVC', X_train, X_test, y_train, y_test)
get_model_train_eval(rbf_svc, 'RBF SVC', X_train, X_test, y_train, y_test)

SVC(kernel='linear') Test Accuracy: 87.13%
Linear SVC report:              precision    recall  f1-score   support

           0       0.85      0.76      0.80      1025
           1       0.81      0.86      0.84      2052
           2       0.92      0.90      0.91      1760
           3       0.99      1.00      0.99       611

    accuracy                           0.87      5448
   macro avg       0.89      0.88      0.89      5448
weighted avg       0.87      0.87      0.87      5448


SVC(kernel='poly') Test Accuracy: 87.56%
Ploy SVC report:              precision    recall  f1-score   support

           0       0.85      0.73      0.78      1025
           1       0.81      0.88      0.84      2052
           2       0.93      0.92      0.92      1760
           3       1.00      1.00      1.00       611

    accuracy                           0.88      5448
   macro avg       0.90      0.88      0.89      5448
weighted avg       0.88      0.88      0.87      5448


SVC() Test

In [17]:
import multiprocessing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV


pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='rbf'))
])

param_grid = [{
    'model__gamma':['scale', 'auto'],
    'model__C':[1.0, 0.1, 0.01]
}]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('model',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.0

In [18]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [19]:
model = gs.best_estimator_
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [20]:
get_model_train_eval(model, "rbf", X_train, X_test, y_train, y_test)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False) Test Accuracy: 87.98%
rbf report:              precision    recall  f1-score   support

           0       0.87      0.73      0.79      1025
           1       0.81      0.89      0.85      2052
           2       0.93      0.92      0.92      1760
           3       1.00      1.00      1.00       611

    accuracy                           0.88      5448
   macro avg       0.90      0.88      0.89      5448
weighted avg       0.88      0.88      0.88      5448




In [9]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)
get_model_train_eval(lgbm, "LGMB", X_train, X_test, y_train, y_test)

LGBMClassifier(boost_from_average=False, n_estimators=1000, num_leaves=64) Test Accuracy: 88.18%
LGMB report:              precision    recall  f1-score   support

           0       0.85      0.77      0.80      1025
           1       0.83      0.87      0.85      2052
           2       0.93      0.92      0.92      1760
           3       1.00      1.00      1.00       611

    accuracy                           0.88      5448
   macro avg       0.90      0.89      0.89      5448
weighted avg       0.88      0.88      0.88      5448




In [10]:
from pycaret.classification import *

data = pd.read_csv('./FIFA2.csv')
XX = data.loc[:, 'Crossing':'GKReflexes']
XX.drop(['Reactions', 'Jumping'], axis=1, inplace=True)
yy = data.loc[:, 'Position simplified']

data_auto = pd.concat([XX, yy], axis=1)
clf = setup(data=data_auto, target='Position simplified', train_size=0.7, session_id=10)

Unnamed: 0,Description,Value
0,session_id,10
1,Target,Position simplified
2,Target Type,Multiclass
3,Label Encoded,"DF: 0, FW: 1, GK: 2, MD: 3"
4,Original Data,"(18159, 33)"
5,Missing Values,False
6,Numeric Features,32
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [11]:
best4models = compare_models(sort='Accuracy', n_select=4, fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8834,0.9705,0.8902,0.8833,0.883,0.8335,0.8338,1.07
catboost,CatBoost Classifier,0.8808,0.9715,0.8878,0.8807,0.8803,0.8298,0.8301,19.4433
xgboost,Extreme Gradient Boosting,0.8796,0.9701,0.8869,0.8794,0.8791,0.828,0.8283,4.24
gbc,Gradient Boosting Classifier,0.8793,0.9705,0.8881,0.8792,0.879,0.8278,0.828,7.9467
et,Extra Trees Classifier,0.8788,0.9695,0.8856,0.8788,0.8782,0.8267,0.8271,0.7967
rf,Random Forest Classifier,0.878,0.969,0.8857,0.8777,0.8773,0.8257,0.826,0.7033
lr,Logistic Regression,0.8759,0.9691,0.8862,0.8761,0.8757,0.823,0.8232,4.02
lda,Linear Discriminant Analysis,0.8662,0.9645,0.8812,0.8663,0.8662,0.8096,0.8096,0.06
ridge,Ridge Classifier,0.8611,0.0,0.8735,0.8612,0.8608,0.8017,0.8019,0.0333
knn,K Neighbors Classifier,0.8609,0.9484,0.8723,0.8618,0.8609,0.8014,0.8017,2.8533


In [12]:
stracker = stack_models(estimator_list=best4models[1:], meta_model=best4models[0], fold=3)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8832,0.9709,0.8876,0.8831,0.8824,0.8329,0.8334
1,0.8794,0.9698,0.884,0.8795,0.8785,0.8274,0.828
2,0.8756,0.9688,0.8806,0.8761,0.875,0.822,0.8226
Mean,0.8794,0.9698,0.8841,0.8796,0.8786,0.8274,0.828
SD,0.0031,0.0009,0.0029,0.0029,0.003,0.0044,0.0044


In [None]:
xy.info()

In [None]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LGBMClassifier())
])

param_grid = [{
    'model__n_estimators':[300, 400, 500],
    'model__learning_rate':[0.3, 0.4, 0.5],
    'model__maxdepth':[3,4,5,6]
}]

grid_model_lgb = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lgb.fit(X_train, y_train)
grid_model_lgb.best_params_
get_model_train_eval(grid_model_lgb, X_train, X_test, y_train, y_test)

In [None]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC())
])

grid_model_svm = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_svm.fit(X, y)

In [None]:
grid_model_svm.best_estimator

# CV 세트 기반 스태킹

In [3]:
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

In [16]:
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train)



LGBMClassifier()

In [17]:
rf_pred = rf.predict(X_test)
dt_pred = dt.predict(X_test)
xgb_pred = xgb.predict(X_test)
lgbm_pred = lgbm.predict(X_test)

In [19]:
import numpy as np

pred = np.array([dt_pred, rf_pred, xgb_pred, lgbm_pred])
print(pred.shape)

pred = np.transpose(pred)
print(pred.shape)

(4, 5448)
(5448, 4)


In [21]:
from sklearn.metrics import accuracy_score


rbf_svc_final = SVC(kernel='rbf')
rbf_svc_final.fit(pred, y_test)
final = rbf_svc_final.predict(pred)

print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, final)))

최종 메타 모델의 예측 정확도: 0.8811


In [23]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    train_fold_pred = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__ , 'model 시작')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t 폴드 세트: ', folder_counter, '시작')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [25]:
dt_train, dt_test = get_stacking_base_datasets(dt, X_train, y_train, X_test, 7)
rf_train, rf_test = get_stacking_base_datasets(rf, X_train, y_train, X_test, 7)
xgb_train, xgb_test = get_stacking_base_datasets(xgb, X_train, y_train, X_test, 7)
lgbm_train, lgbm_test = get_stacking_base_datasets(lgbm,  X_train, y_train, X_test, 7)


DecisionTreeClassifier model 시작
	 폴드 세트:  0 시작


KeyError: "None of [Int64Index([ 1816,  1817,  1818,  1819,  1820,  1821,  1822,  1823,  1824,\n             1825,\n            ...\n            12701, 12702, 12703, 12704, 12705, 12706, 12707, 12708, 12709,\n            12710],\n           dtype='int64', length=10895)] are in the [columns]"

In [None]:
Stack_final_X_train = np.concatenate((rf_train, xgb_train, lgbm_train, , dt_train), axis=1)
Stack_final_X_test = np.concatenate((rf_test,  xgb_test, lgbm_test, dt_test), axis=1)
print('원본 학습 피처 데이터 Shape: ', X_train.shape, '원본 테스트 피처 Shape: ', X_test.shape)
print('스태킹 학습 피터 데이터 SHape: ', Stack_final_X_train.shape, '스태킹 테스트 피터 데이터: ', Stack_final_X_train.shape)

In [None]:
svm_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)

print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack_final)))

# 배깅

In [34]:

from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

base_model = make_pipeline(
    StandardScaler(),
    LGBMClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5, max_features=0.5)

In [39]:
from sklearn.model_selection import cross_validate

cross_val = cross_validate(
    estimator = base_model,
    X = X,
    y = y,
    cv= 5
)

print(cross_val['test_score'].mean())

0.8673938442462543


In [40]:
base_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf')
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5, max_features=0.5)

cross_val = cross_validate(
    estimator = base_model,
    X = X,
    y = y,
    cv= 5
)

print(cross_val['test_score'].mean())

0.8798942082192379
