In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, f1_score,
    cohen_kappa_score,classification_report,
)

In [45]:
# Read the spreadsheet and drop the columns that are excluded from modelling
# (the `pos4` column is kept; it is used as the target in the next cell).
data_path = 'imputed_data.xlsx'
df = pd.read_excel(data_path).drop(
    columns=["player", "nation", "squad", "comp", "born", "pos"]
)

In [None]:
# Separate the label column from the predictor columns.
target = 'pos4'
y = df[target].astype(str)      # labels, forced to strings
X = df.drop(columns=target)     # every column except the label
print(len(X.columns))           # number of predictor columns


In [38]:
# Encode the string labels as integers.
# The encoder is fitted on the original label column rather than on the
# current value of `y`, which makes this cell safe to re-run: fitting on the
# already-encoded integers would replace `le.classes_` with numbers and break
# the `target_names=le.classes_` arguments used by the classification reports.
le = LabelEncoder()
y = le.fit_transform(df[target].astype(str))

In [39]:
# Hold out 20% of the rows for testing; stratifying on the encoded label keeps
# the class proportions the same in both splits, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [40]:

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [41]:
# Wrap the scaled arrays back into DataFrames that carry the original column
# names and row index, so columns can later be selected by name.
feature_names = X_train.columns
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=feature_names, index=unscaled.index)
    for scaled, unscaled in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)

MULTINOMIAL LOGISTIC REGRESSION

In [42]:
# Hyper-parameter search for the logistic-regression model.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and for the 'lbfgs' and 'saga' solvers searched
# here the default already fits a multinomial model on a multi-class target.
logreg = LogisticRegression(random_state=42)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],   # inverse regularisation strength
    'solver': ['lbfgs', 'saga'],
    'max_iter': [500]
}

# 5-fold cross-validated search, ranked by accuracy, using all CPU cores.
grid_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)


In [43]:
# Run the search and keep the winning parameter combination for later reuse.
best_logreg_params = grid_logreg.fit(X_train_scaled_df, y_train).best_params_
print("En iyi parametreler:", best_logreg_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
En iyi parametreler: {'C': 0.01, 'max_iter': 500, 'solver': 'lbfgs'}




In [44]:
# Predict with the refitted best model on both splits.
y_tr_pred = grid_logreg.predict(X_train_scaled_df)
y_te_pred = grid_logreg.predict(X_test_scaled_df)

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
metrics = {
    'train_acc': accuracy_score(y_train, y_tr_pred),
    'test_acc': accuracy_score(y_test, y_te_pred),
    'train_f1': f1_score(y_train, y_tr_pred, average='weighted'),
    'test_f1': f1_score(y_test, y_te_pred, average='weighted'),
    'train_kappa': cohen_kappa_score(y_train, y_tr_pred),
    'test_kappa': cohen_kappa_score(y_test, y_te_pred),
}

print("\n--- Logistic Regression (GridSearch) Performans ---")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))
print("\nClassification Report (test set):")
# Per-class report, labelled with the original class names.
print(classification_report(y_test, y_te_pred, target_names=le.classes_))


--- Logistic Regression (GridSearch) Performans ---
train_acc: 0.8033
test_acc: 0.7732
train_f1: 0.8033
test_f1: 0.7741
train_kappa: 0.7176
test_kappa: 0.6743

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.84      0.82      0.83       193
          FW       0.76      0.73      0.74       136
          GK       1.00      1.00      1.00        33
          MF       0.68      0.72      0.70       176

    accuracy                           0.77       538
   macro avg       0.82      0.82      0.82       538
weighted avg       0.78      0.77      0.77       538



MULTINOMIAL LOGISTIC REGRESSION RFECV

In [32]:
# Recursive feature elimination with cross-validation for logistic regression.
# One feature is removed per step, never going below 10 features, and each
# subset is scored by weighted F1 on shuffled stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv_logreg = RFECV(
    # `multi_class` is omitted: it is deprecated, and 'lbfgs' already fits a
    # multinomial model by default for a multi-class target.
    # NOTE(review): C is left at its default here while the final model uses
    # the tuned `best_logreg_params` - confirm this difference is intended.
    estimator=LogisticRegression(max_iter=500, solver='lbfgs', random_state=42),
    step=1,
    cv=cv,
    scoring="f1_weighted",
    min_features_to_select=10,
    n_jobs=-1
)
rfecv_logreg.fit(X_train_scaled_df, y_train)

print(f"\nRFECV ile optimum özellik sayısı: {rfecv_logreg.n_features_}")
# Boolean support mask -> names of the retained columns.
selected_logreg = X_train_scaled_df.columns[rfecv_logreg.support_]
print("RFECV seçtiği değişkenler:", list(selected_logreg))


RFECV ile optimum özellik sayısı: 36
RFECV seçtiği değişkenler: ['age', 'PlayingContribution', 'ShotEfficiencyIndex', 'PenaltyImpactScore', 'ShotTypePreference', 'FreeKickProfile', 'TotalPassingVolume', 'ShortvsLongPassProfile', 'PassingAccuracy', 'PassProfileVariation', 'OffensivePlaymakerScore', 'ChainImpact', 'OpenPlayvsPieceCreativity', 'DribbleandFoulCreation', 'CounterAttackInitiationScore', 'DribbleActivityIndex', 'DribbleSuccessScore', 'BallProgressionIndex', 'RiskCarryingProfile', 'CarryEfficiencyScore', 'OverallTouchActivity', 'TouchLocationIndex', 'FinalThirdPresence', 'DefensiveActivityIndex', 'DefensiveZoneRecoveryProfile', 'DefensiveBlockingProfile', 'onetooneDefensiveVulnerability', 'PressingDefenderIndex', 'DefensiveReliabilityIndex', 'DisciplinaryAgressionIndex', 'FouledandOffsideTendency', 'PenaltyLiabilityIndex', 'OffsidevsPenaltyOutcome', 'FairPlayProfile', 'AerialDuelActivity', 'AerialSuccessIndex']




In [33]:
# Retrain logistic regression with the tuned parameters on the RFECV-selected
# columns only. `multi_class` is omitted because it is deprecated; the solvers
# in `best_logreg_params` ('lbfgs' or 'saga') fit a multinomial model by
# default for a multi-class target.
logreg_sel = LogisticRegression(
    random_state=42,
    **best_logreg_params
)
logreg_sel.fit(X_train_scaled_df[selected_logreg], y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,500


In [34]:
# Predictions of the reduced-feature model on both splits.
y_tr = logreg_sel.predict(X_train_scaled_df[selected_logreg])
y_te = logreg_sel.predict(X_test_scaled_df[selected_logreg])

# Header plus accuracy / weighted F1 / Cohen's kappa, one item per line.
print(
    "\n--- Logistic Regression (seçilmiş özelliklerle) Performans ---",
    f"train_acc : {accuracy_score(y_train, y_tr):.4f}",
    f"test_acc  : {accuracy_score(y_test,  y_te):.4f}",
    f"train_f1  : {f1_score(y_train, y_tr, average='weighted'):.4f}",
    f"test_f1   : {f1_score(y_test,  y_te, average='weighted'):.4f}",
    f"train_kappa: {cohen_kappa_score(y_train, y_tr):.4f}",
    f"test_kappa : {cohen_kappa_score(y_test,  y_te):.4f}\n",
    sep="\n",
)

# Per-class report, labelled with the original class names.
print(
    "Classification Report (test):",
    classification_report(y_test, y_te, target_names=le.classes_),
    sep="\n",
)


--- Logistic Regression (seçilmiş özelliklerle) Performans ---
train_acc : 0.8020
test_acc  : 0.7714
train_f1  : 0.8018
test_f1   : 0.7721
train_kappa: 0.7156
test_kappa : 0.6716

Classification Report (test):
              precision    recall  f1-score   support

          DF       0.84      0.82      0.83       193
          FW       0.76      0.73      0.74       136
          GK       1.00      1.00      1.00        33
          MF       0.68      0.71      0.69       176

    accuracy                           0.77       538
   macro avg       0.82      0.81      0.82       538
weighted avg       0.77      0.77      0.77       538



ANN

In [8]:
# Hyper-parameter search for the multi-layer perceptron.
mlp = MLPClassifier(max_iter=1000, random_state=42)

param_grid = {
    'solver': ['adam', 'lbfgs'],
    'alpha': [1e-4, 1e-3, 1e-2],                       # L2 penalty strength
    'hidden_layer_sizes': [(50,), (100,), (50,50)],    # one or two hidden layers
}
# 5-fold cross-validated search, ranked by accuracy, using all CPU cores.
grid_mlp = GridSearchCV(
    mlp,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)


In [9]:
# Run the search on the scaled training array and keep the refitted winner.
best_mlp = grid_mlp.fit(X_train_scaled, y_train).best_estimator_
print("En iyi parametreler:", grid_mlp.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
En iyi parametreler: {'alpha': 0.01, 'hidden_layer_sizes': (100,), 'solver': 'adam'}


In [10]:
# Predict with the best MLP on both splits.
y_tr_pred = best_mlp.predict(X_train_scaled)
y_te_pred = best_mlp.predict(X_test_scaled)

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
metrics = {
    'train_acc': accuracy_score(y_train, y_tr_pred),
    'test_acc': accuracy_score(y_test, y_te_pred),
    'train_f1': f1_score(y_train, y_tr_pred, average='weighted'),
    'test_f1': f1_score(y_test, y_te_pred, average='weighted'),
    'train_kappa': cohen_kappa_score(y_train, y_tr_pred),
    'test_kappa': cohen_kappa_score(y_test, y_te_pred),
}

print("\n--- ANN Performans ---")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

print("\nClassification Report (test set):")
# Per-class report, labelled with the original class names.
print(classification_report(y_test, y_te_pred, target_names=le.classes_))


--- ANN Performans ---
train_acc: 1.0000
test_acc: 0.7621
train_f1: 1.0000
test_f1: 0.7615
train_kappa: 1.0000
test_kappa: 0.6588

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.84      0.83      0.84       193
          FW       0.74      0.68      0.71       136
          GK       0.89      1.00      0.94        33
          MF       0.67      0.70      0.69       176

    accuracy                           0.76       538
   macro avg       0.79      0.80      0.79       538
weighted avg       0.76      0.76      0.76       538



ANN RFECV

In [11]:
def mlp_feature_importance(estimator):
    """Score each input feature of a fitted MLP by its first-layer weights.

    Parameters
    ----------
    estimator : object exposing ``coefs_``
        ``coefs_[0]`` is read as the weight matrix of the first layer, with one
        row per input feature.

    Returns
    -------
    numpy.ndarray
        One value per row of ``coefs_[0]``: the sum of the absolute weights in
        that row.
    """
    first_layer_weights = estimator.coefs_[0]
    return np.abs(first_layer_weights).sum(axis=1)
# Recursive feature elimination for the tuned MLP. Features are ranked with
# `mlp_feature_importance`; one feature is dropped per step, never going below
# 5, and each subset is scored by weighted F1 on stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv = RFECV(
    best_mlp,
    importance_getter=mlp_feature_importance,
    min_features_to_select=5,
    step=1,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=-1,
).fit(X_train_scaled_df, y_train)
print("\nRFECV optimum özellik sayısı:", rfecv.n_features_)
# Boolean support mask -> names of the retained columns.
selected_feats = feature_names[rfecv.support_]
print("Seçilen özellikler:", list(selected_feats))


RFECV optimum özellik sayısı: 33
Seçilen özellikler: ['age', 'PlayingContribution', 'ShotEfficiencyIndex', 'PenaltyImpactScore', 'ShotTypePreference', 'FreeKickProfile', 'TotalPassingVolume', 'ShortvsLongPassProfile', 'PassingAccuracy', 'PassProfileVariation', 'OffensivePlaymakerScore', 'ChainImpact', 'OpenPlayvsPieceCreativity', 'DribbleandFoulCreation', 'IndividualCreativity', 'DribbleActivityIndex', 'DribbleSuccessScore', 'BallProgressionIndex', 'RiskCarryingProfile', 'CarryEfficiencyScore', 'DefensiveActivityIndex', 'DefensiveZoneRecoveryProfile', 'DefensiveBlockingProfile', 'onetooneDefensiveVulnerability', 'PressingDefenderIndex', 'DefensiveReliabilityIndex', 'DisciplinaryAgressionIndex', 'FouledandOffsideTendency', 'PenaltyLiabilityIndex', 'OwnGoalPropensity', 'FairPlayProfile', 'AerialDuelActivity', 'AerialSuccessIndex']


In [12]:
# Retrain an MLP with the tuned parameters on the selected columns only.
# NOTE(review): early stopping (10% validation split, patience of 10 epochs) is
# enabled here but was not part of the configuration explored by `grid_mlp`,
# so this model differs from the tuned one in more than its inputs - confirm
# this is intended.
mlp_sel = MLPClassifier(
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    n_iter_no_change=10,
    validation_fraction=0.1,
    **grid_mlp.best_params_,
)
mlp_sel.fit(X_train_scaled_df[selected_feats], y_train)

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.01
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,1000
,shuffle,True


In [13]:
# Predictions of the reduced-feature MLP on both splits.
y_tr_sel = mlp_sel.predict(X_train_scaled_df[selected_feats])
y_te_sel = mlp_sel.predict(X_test_scaled_df[selected_feats])

# Header plus accuracy / weighted F1 / Cohen's kappa, one item per line.
print(
    "\n--- MLP (seçilmiş özelliklerle) Performans ---",
    f"Train Acc  : {accuracy_score(y_train, y_tr_sel):.4f}",
    f"Test Acc   : {accuracy_score(y_test,  y_te_sel):.4f}",
    f"Train F1   : {f1_score(y_train, y_tr_sel, average='weighted'):.4f}",
    f"Test F1    : {f1_score(y_test,  y_te_sel,  average='weighted'):.4f}",
    f"Train Kappa: {cohen_kappa_score(y_train, y_tr_sel):.4f}",
    f"Test Kappa : {cohen_kappa_score(y_test,  y_te_sel):.4f}",
    sep="\n",
)
# Per-class report, labelled with the original class names.
print(
    "\nClassification Report (test set):",
    classification_report(y_test, y_te_sel, target_names=le.classes_),
    sep="\n",
)


--- MLP (seçilmiş özelliklerle) Performans ---
Train Acc  : 0.8373
Test Acc   : 0.7900
Train F1   : 0.8376
Test F1    : 0.7916
Train Kappa: 0.7665
Test Kappa : 0.6992

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.91      0.83      0.87       193
          FW       0.72      0.71      0.72       136
          GK       0.94      1.00      0.97        33
          MF       0.70      0.77      0.73       176

    accuracy                           0.79       538
   macro avg       0.82      0.83      0.82       538
weighted avg       0.80      0.79      0.79       538



SVM

In [14]:
# Hyper-parameter search for the support-vector classifier.
svc = SVC(random_state=42, probability=True)
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma':  ['scale', 'auto'],
    'C':      [0.1, 1, 10, 100],
}
# 5-fold cross-validated search, ranked by accuracy, using all CPU cores.
grid_svc = GridSearchCV(
    svc,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [15]:
# Run the search on the scaled training array and keep the refitted winner.
best_svc = grid_svc.fit(X_train_scaled, y_train).best_estimator_
print("En iyi parametreler:", grid_svc.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
En iyi parametreler: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [16]:
# Predict with the best SVM on both splits.
y_tr_pred = best_svc.predict(X_train_scaled)
y_te_pred = best_svc.predict(X_test_scaled)

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
metrics = {
    'train_acc': accuracy_score(y_train, y_tr_pred),
    'test_acc': accuracy_score(y_test, y_te_pred),
    'train_f1': f1_score(y_train, y_tr_pred, average='weighted'),
    'test_f1': f1_score(y_test, y_te_pred, average='weighted'),
    'train_kappa': cohen_kappa_score(y_train, y_tr_pred),
    'test_kappa': cohen_kappa_score(y_test, y_te_pred),
}
print("\n--- SVM Performans ---")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

print("\nClassification Report (test set):")
# Per-class report, labelled with the original class names.
print(classification_report(y_test, y_te_pred, target_names=le.classes_))


--- SVM Performans ---
train_acc: 0.8987
test_acc: 0.8030
train_f1: 0.8989
test_f1: 0.8048
train_kappa: 0.8545
test_kappa: 0.7166

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.92      0.83      0.88       193
          FW       0.78      0.68      0.73       136
          GK       1.00      1.00      1.00        33
          MF       0.69      0.82      0.75       176

    accuracy                           0.80       538
   macro avg       0.85      0.84      0.84       538
weighted avg       0.81      0.80      0.80       538



SVM RFECV

In [17]:
best_svc = grid_svc.best_estimator_
print("En iyi parametreler:", grid_svc.best_params_)

# Choose the estimator that will rank features during elimination:
#   * linear kernel  -> the tuned SVC itself can be used directly (via coef_);
#   * rbf / poly     -> an L1-penalised LinearSVC with the tuned C acts as a
#                       proxy for the selection step.
selector_est = (
    best_svc
    if best_svc.kernel == "linear"
    else LinearSVC(
        penalty="l1",
        dual=False,
        C=grid_svc.best_params_["C"],
        max_iter=5000,
        random_state=42,
    )
)

En iyi parametreler: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [18]:
# Recursive feature elimination driven by `selector_est`: one feature is
# dropped per step, never going below 5, and each subset is scored by weighted
# F1 on shuffled stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv = RFECV(
    selector_est,
    min_features_to_select=5,
    step=1,
    scoring="f1_weighted",
    cv=cv,
    n_jobs=-1,
).fit(X_train_scaled_df, y_train)

print(f"RFECV optimum özellik sayısı: {rfecv.n_features_}")
# Boolean support mask -> plain list of retained column names.
selected_feats = list(X_train_scaled_df.columns[rfecv.support_])
print("Seçilen özellikler:", selected_feats)

RFECV optimum özellik sayısı: 35
Seçilen özellikler: ['age', 'PlayingContribution', 'ShotEfficiencyIndex', 'PenaltyImpactScore', 'ShotTypePreference', 'FreeKickProfile', 'TotalPassingVolume', 'ShortvsLongPassProfile', 'PassingAccuracy', 'PassProfileVariation', 'OffensivePlaymakerScore', 'ChainImpact', 'DribbleandFoulCreation', 'CounterAttackInitiationScore', 'DribbleActivityIndex', 'DribbleSuccessScore', 'BallProgressionIndex', 'RiskCarryingProfile', 'CarryEfficiencyScore', 'OverallTouchActivity', 'TouchLocationIndex', 'FinalThirdPresence', 'DefensiveActivityIndex', 'DefensiveZoneRecoveryProfile', 'DefensiveBlockingProfile', 'onetooneDefensiveVulnerability', 'PressingDefenderIndex', 'DefensiveReliabilityIndex', 'DisciplinaryAgressionIndex', 'FouledandOffsideTendency', 'PenaltyLiabilityIndex', 'OwnGoalPropensity', 'FairPlayProfile', 'AerialDuelActivity', 'AerialSuccessIndex']


In [19]:
# Retrain the tuned SVM configuration on the RFECV-selected columns.
# A fresh estimator is built from the base settings of `svc` plus the tuned
# parameters instead of calling `best_svc.fit(...)`: refitting `best_svc`
# would overwrite `grid_svc.best_estimator_` in place, so the full-feature
# model would be lost and re-running the earlier full-feature evaluation cell
# would fail with a feature-count mismatch.
svc_sel = SVC(probability=True, random_state=42, **grid_svc.best_params_)
svc_sel.fit(X_train_scaled_df[selected_feats], y_train)
y_tr = svc_sel.predict(X_train_scaled_df[selected_feats])
y_te = svc_sel.predict(X_test_scaled_df[selected_feats])

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
print("\n--- SVM (seçilmiş özelliklerle) Performans ---")
print(f"train_acc  : {accuracy_score(y_train, y_tr):.4f}")
print(f"test_acc   : {accuracy_score(y_test,  y_te):.4f}")
print(f"train_f1   : {f1_score(y_train, y_tr, average='weighted'):.4f}")
print(f"test_f1    : {f1_score(y_test,  y_te,  average='weighted'):.4f}")
print(f"train_kappa: {cohen_kappa_score(y_train, y_tr):.4f}")
print(f"test_kappa : {cohen_kappa_score(y_test,  y_te):.4f}\n")

# Per-class report, labelled with the original class names.
print("Classification Report (test set):")
print(classification_report(y_test, y_te, target_names=le.classes_))


--- SVM (seçilmiş özelliklerle) Performans ---
train_acc  : 0.8996
test_acc   : 0.8104
train_f1   : 0.8998
test_f1    : 0.8119
train_kappa: 0.8559
test_kappa : 0.7275

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.92      0.84      0.88       193
          FW       0.78      0.71      0.74       136
          GK       1.00      1.00      1.00        33
          MF       0.71      0.82      0.76       176

    accuracy                           0.81       538
   macro avg       0.85      0.84      0.84       538
weighted avg       0.82      0.81      0.81       538



RANDOM FOREST

In [20]:
# Hyper-parameter search for the random forest.
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'max_depth':    [None, 10, 20, 30],
    'n_estimators': [100, 200, 500],
}
# 5-fold cross-validated search, ranked by accuracy, using all CPU cores.
grid_rf = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [21]:
# Run the search on the scaled training array and keep the refitted winner.
best_rf = grid_rf.fit(X_train_scaled, y_train).best_estimator_
print("En iyi parametreler:", grid_rf.best_params_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
En iyi parametreler: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 500}


In [22]:
# Predict with the best random forest on both splits.
y_tr_pred = best_rf.predict(X_train_scaled)
y_te_pred = best_rf.predict(X_test_scaled)

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
metrics = {
    'train_acc': accuracy_score(y_train, y_tr_pred),
    'test_acc': accuracy_score(y_test, y_te_pred),
    'train_f1': f1_score(y_train, y_tr_pred, average='weighted'),
    'test_f1': f1_score(y_test, y_te_pred, average='weighted'),
    'train_kappa': cohen_kappa_score(y_train, y_tr_pred),
    'test_kappa': cohen_kappa_score(y_test, y_te_pred),
}
print("\n--- Random Forest Performans ---")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

print("\nClassification Report (test set):")
# Per-class report, labelled with the original class names.
print(classification_report(y_test, y_te_pred, target_names=le.classes_))


--- Random Forest Performans ---
train_acc: 0.9981
test_acc: 0.7695
train_f1: 0.9981
test_f1: 0.7722
train_kappa: 0.9973
test_kappa: 0.6687

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.86      0.79      0.82       193
          FW       0.80      0.71      0.75       136
          GK       1.00      1.00      1.00        33
          MF       0.64      0.75      0.69       176

    accuracy                           0.77       538
   macro avg       0.82      0.81      0.82       538
weighted avg       0.78      0.77      0.77       538



RANDOM FOREST RFECV

In [None]:
# Recursive feature elimination for the random forest: 5 features are dropped
# per step, never going below 10, and each subset is scored by weighted F1 on
# shuffled stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv_rf = RFECV(
    # NOTE(review): `rf` is the untuned base forest, not a forest built from
    # `grid_rf.best_params_` - confirm that selecting with defaults is intended.
    estimator=rf,
    step=5,
    cv=cv,
    scoring="f1_weighted",
    min_features_to_select=10,
    n_jobs=-1
)
# Fit on the scaled frame, the same input used by every other selection step
# and by the retraining cell that consumes `selected_rf`. The column names are
# the same as in `X_train`, so the names stored in `selected_rf` are unchanged
# in kind.
rfecv_rf.fit(X_train_scaled_df, y_train)

print(f"RFECV ile optimum özellik sayısı: {rfecv_rf.n_features_}")
# Boolean support mask -> names of the retained columns.
selected_rf = X_train_scaled_df.columns[rfecv_rf.support_]
print("RFECV seçtiği değişkenler:", list(selected_rf))

RFECV ile optimum özellik sayısı: 38
RFECV seçtiği değişkenler: ['age', 'PlayingContribution', 'ShotEfficiencyIndex', 'PenaltyImpactScore', 'ShotTypePreference', 'FreeKickProfile', 'TotalPassingVolume', 'ShortvsLongPassProfile', 'PassingAccuracy', 'PassProfileVariation', 'OffensivePlaymakerScore', 'ChainImpact', 'OpenPlayvsPieceCreativity', 'DribbleandFoulCreation', 'IndividualCreativity', 'CounterAttackInitiationScore', 'DribbleActivityIndex', 'DribbleSuccessScore', 'BallProgressionIndex', 'RiskCarryingProfile', 'CarryEfficiencyScore', 'OverallTouchActivity', 'TouchLocationIndex', 'FinalThirdPresence', 'DefensiveActivityIndex', 'DefensiveZoneRecoveryProfile', 'DefensiveBlockingProfile', 'onetooneDefensiveVulnerability', 'PressingDefenderIndex', 'DefensiveReliabilityIndex', 'DisciplinaryAgressionIndex', 'FouledandOffsideTendency', 'PenaltyLiabilityIndex', 'OffsidevsPenaltyOutcome', 'OwnGoalPropensity', 'FairPlayProfile', 'AerialDuelActivity', 'AerialSuccessIndex']


Yeniden eğitelim

In [26]:
# Retrain a forest with the tuned parameters on the selected columns only.
rf_sel = RandomForestClassifier(random_state=42, **grid_rf.best_params_).fit(
    X_train_scaled_df[selected_rf], y_train
)

y_tr = rf_sel.predict(X_train_scaled_df[selected_rf])
y_te = rf_sel.predict(X_test_scaled_df[selected_rf])

# Header plus accuracy / weighted F1 / Cohen's kappa, one item per line.
print(
    "--- RF (seçilmiş özelliklerle) Performans ---",
    f"train_acc : {accuracy_score(y_train, y_tr):.4f}",
    f"test_acc  : {accuracy_score(y_test,  y_te):.4f}",
    f"train_f1  : {f1_score(y_train, y_tr, average='weighted'):.4f}",
    f"test_f1   : {f1_score(y_test,  y_te, average='weighted'):.4f}",
    f"train_kappa: {cohen_kappa_score(y_train, y_tr):.4f}",
    f"test_kappa : {cohen_kappa_score(y_test,  y_te):.4f}\n",
    sep="\n",
)

# Per-class report, labelled with the original class names.
print(
    "Classification Report (test):",
    classification_report(y_test, y_te, target_names=le.classes_),
    sep="\n",
)

--- RF (seçilmiş özelliklerle) Performans ---
train_acc : 0.9981
test_acc  : 0.7695
train_f1  : 0.9981
test_f1   : 0.7722
train_kappa: 0.9973
test_kappa : 0.6687

Classification Report (test):
              precision    recall  f1-score   support

          DF       0.86      0.79      0.82       193
          FW       0.80      0.71      0.75       136
          GK       1.00      1.00      1.00        33
          MF       0.64      0.75      0.69       176

    accuracy                           0.77       538
   macro avg       0.82      0.81      0.82       538
weighted avg       0.78      0.77      0.77       538



XGBOOST

In [27]:
# Hyper-parameter search for XGBoost.
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# emits a 'Parameters: { "use_label_encoder" } are not used' warning on every
# fit. The evaluation metric is the multi-class log loss, matching the XGBoost
# models built in the feature-selection cells.
xgb_base = XGBClassifier(
    eval_metric='mlogloss',
    random_state=42
)
param_grid = {
    'n_estimators':    [100, 200, 300],
    'max_depth':       [3, 5, 7],
    'learning_rate':   [0.01, 0.1, 0.2],
    'subsample':       [0.6, 0.8, 1.0],   # row sampling ratio per tree
    'colsample_bytree':[0.6, 0.8, 1.0]    # column sampling ratio per tree
}
# 5-fold cross-validated search, ranked by accuracy, using all CPU cores.
grid_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

In [28]:
# Run the search on the scaled training array and keep the refitted winner.
best_xgb = grid_xgb.fit(X_train_scaled, y_train).best_estimator_
print("En iyi parametreler:", grid_xgb.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


En iyi parametreler: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


In [29]:
# Predict with the best XGBoost model on both splits.
y_tr_pred = best_xgb.predict(X_train_scaled)
y_te_pred = best_xgb.predict(X_test_scaled)

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits;
# the values are printed in the next cell.
metrics = {
    'train_acc': accuracy_score(y_train, y_tr_pred),
    'test_acc': accuracy_score(y_test, y_te_pred),
    'train_f1': f1_score(y_train, y_tr_pred, average='weighted'),
    'test_f1': f1_score(y_test, y_te_pred, average='weighted'),
    'train_kappa': cohen_kappa_score(y_train, y_tr_pred),
    'test_kappa': cohen_kappa_score(y_test, y_te_pred),
}


In [56]:
# Print the metrics computed in the previous cell, one per line.
print("\n--- XGBoost Performans ---")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

# Per-class report, labelled with the original class names.
print(
    "\nClassification Report (test set):",
    classification_report(y_test, y_te_pred, target_names=le.classes_),
    sep="\n",
)


--- XGBoost Performans ---
train_acc: 0.9768
test_acc: 0.7974
train_f1: 0.9767
test_f1: 0.7982
train_kappa: 0.9666
test_kappa: 0.7088

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.89      0.85      0.87       193
          FW       0.75      0.70      0.73       136
          GK       1.00      1.00      1.00        33
          MF       0.70      0.77      0.74       176

    accuracy                           0.80       538
   macro avg       0.84      0.83      0.83       538
weighted avg       0.80      0.80      0.80       538



XGBOOST FEATURE SELECTION

In [30]:
# Label the fitted model's feature importances with the column names and keep
# the 15 largest.
importances = pd.Series(data=best_xgb.feature_importances_, index=feature_names)
top15 = importances.sort_values(ascending=False).iloc[:15]
print("XGBoost — En önemli 15 değişken:\n", top15, "\n")

XGBoost — En önemli 15 değişken:
 RiskCarryingProfile             0.084608
DefensiveZoneRecoveryProfile    0.079517
TotalPassingVolume              0.068193
OverallTouchActivity            0.062915
ShortvsLongPassProfile          0.050410
DefensiveBlockingProfile        0.040968
DefensiveActivityIndex          0.040631
FairPlayProfile                 0.037741
DefensiveReliabilityIndex       0.031169
FouledandOffsideTendency        0.026921
ShotEfficiencyIndex             0.026820
DribbleActivityIndex            0.022565
CarryEfficiencyScore            0.022543
PassingAccuracy                 0.022103
OffensivePlaymakerScore         0.022042
dtype: float32 



In [31]:
# Recursive feature elimination for XGBoost with the tuned parameters: one
# feature is dropped per step, never going below 5, and each subset is scored
# by weighted F1 on shuffled stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv_xgb = RFECV(
    # `use_label_encoder` is omitted: the installed XGBoost ignores it and logs
    # a "Parameters ... are not used" warning on every fit.
    estimator= XGBClassifier(
        **grid_xgb.best_params_,
        random_state=42,
        eval_metric="mlogloss",
        n_jobs=-1
    ),
    step=1,
    cv=cv,
    scoring="f1_weighted",
    min_features_to_select=5,
    n_jobs=-1
)
rfecv_xgb.fit(X_train_scaled_df, y_train)

print(f"RFECV optimum özellik sayısı: {rfecv_xgb.n_features_}")
# Boolean support mask -> names of the retained columns.
selected_xgb = X_train_scaled_df.columns[rfecv_xgb.support_]
print("RFECV seçtiği değişkenler:\n", list(selected_xgb), "\n")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RFECV optimum özellik sayısı: 31
RFECV seçtiği değişkenler:
 ['PlayingContribution', 'ShotEfficiencyIndex', 'ShotTypePreference', 'FreeKickProfile', 'TotalPassingVolume', 'ShortvsLongPassProfile', 'PassingAccuracy', 'PassProfileVariation', 'OffensivePlaymakerScore', 'ChainImpact', 'DribbleandFoulCreation', 'IndividualCreativity', 'CounterAttackInitiationScore', 'DribbleActivityIndex', 'BallProgressionIndex', 'RiskCarryingProfile', 'CarryEfficiencyScore', 'OverallTouchActivity', 'FinalThirdPresence', 'DefensiveActivityIndex', 'DefensiveZoneRecoveryProfile', 'DefensiveBlockingProfile', 'onetooneDefensiveVulnerability', 'PressingDefenderIndex', 'DefensiveReliabilityIndex', 'DisciplinaryAgressionIndex', 'FouledandOffsideTendency', 'PenaltyLiabilityIndex', 'FairPlayProfile', 'AerialDuelActivity', 'AerialSuccessIndex'] 



In [62]:
# Retrain XGBoost with the tuned parameters on the RFECV-selected columns.
# `use_label_encoder` is omitted: the installed XGBoost ignores it and logs a
# "Parameters ... are not used" warning on fit.
xgb_sel = XGBClassifier(
    **grid_xgb.best_params_,
    random_state=42,
    eval_metric="mlogloss",
    n_jobs=-1
)
xgb_sel.fit(X_train_scaled_df[selected_xgb], y_train)

y_tr_pred = xgb_sel.predict(X_train_scaled_df[selected_xgb])
y_te_pred = xgb_sel.predict(X_test_scaled_df[selected_xgb])

# Accuracy, weighted F1 and Cohen's kappa for the train and test splits.
print("--- XGBoost (seçilmiş özellikler) Performans ---")
print(f"train_acc  : {accuracy_score(y_train, y_tr_pred):.4f}")
print(f"test_acc   : {accuracy_score(y_test,  y_te_pred):.4f}")
print(f"train_f1   : {f1_score(y_train, y_tr_pred, average='weighted'):.4f}")
print(f"test_f1    : {f1_score(y_test,  y_te_pred, average='weighted'):.4f}")
print(f"train_kappa: {cohen_kappa_score(y_train, y_tr_pred):.4f}")
print(f"test_kappa : {cohen_kappa_score(y_test,  y_te_pred):.4f}\n")

# Per-class report, labelled with the original class names.
print("Classification Report (test set):")
print(classification_report(y_test, y_te_pred, target_names=le.classes_))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost (seçilmiş özellikler) Performans ---
train_acc  : 0.9735
test_acc   : 0.7918
train_f1   : 0.9735
test_f1    : 0.7922
train_kappa: 0.9620
test_kappa : 0.7007

Classification Report (test set):
              precision    recall  f1-score   support

          DF       0.86      0.85      0.86       193
          FW       0.76      0.71      0.73       136
          GK       1.00      1.00      1.00        33
          MF       0.71      0.76      0.73       176

    accuracy                           0.79       538
   macro avg       0.83      0.83      0.83       538
weighted avg       0.79      0.79      0.79       538

