# 0. Libraries and importing the data
In this case, we will only import the training data, as this step in the machine learning pipeline involves feature selection and hyperparameter tuning.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Hyperparameter tuning and cross-validation
from sklearn.model_selection import RandomizedSearchCV

# For creating a pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For preprocessing
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Extracting scores for all classes
from sklearn.metrics import classification_report

# Utilities
from scipy.stats import uniform
from scipy.stats import randint

# We filter out FutureWarnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

## Preprocessing version 1

In [4]:
dfv1 = pd.read_csv('2_preprocessed_datasets/v1_train.csv')
dfv1

Unnamed: 0,id,short_name,overall,potential,value_eur,wage_eur,height_cm,weight_kg,club_name,league_level,...,real_face_Yes,work_rate_High/High,work_rate_High/Low,work_rate_High/Medium,work_rate_Low/High,work_rate_Low/Low,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,work_rate_Medium/Medium
0,216302,E. García,71,71,1400000.0,10000,176,73,Club Atlético de San Luis,1,...,False,False,False,False,False,False,False,True,False,False
1,237867,D. Cancola,65,71,1000000.0,2000,183,73,Ross County FC,1,...,False,False,False,False,False,False,False,False,False,True
2,253472,E. Kahl,65,77,1600000.0,2000,178,69,Aarhus GF,1,...,False,False,False,True,False,False,False,False,False,False
3,223994,S. Mugoša,72,72,2300000.0,5000,188,81,Incheon United FC,1,...,False,False,False,True,False,False,False,False,False,False
4,251635,A. Țigănașu,65,65,525000.0,3000,179,74,FC Botoşani,1,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6890,242007,C. Whelan,59,66,425000.0,2000,180,65,Carlisle United,4,...,False,False,False,False,False,False,False,False,False,True
6891,187961,Paulinho,83,83,28500000.0,61000,183,80,Al Ahli,1,...,True,True,False,False,False,False,False,False,False,False
6892,257234,Y. Hamache,70,80,3400000.0,6000,177,73,Boavista FC,1,...,False,False,False,True,False,False,False,False,False,False
6893,232511,S. Sasaki,71,71,1300000.0,7000,176,70,Sanfrecce Hiroshima,1,...,False,False,False,False,False,False,False,True,False,False


In [8]:
dfv1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6895 entries, 0 to 6894
Data columns (total 70 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           6895 non-null   int64  
 1   short_name                   6895 non-null   object 
 2   overall                      6895 non-null   int64  
 3   potential                    6895 non-null   int64  
 4   value_eur                    6893 non-null   float64
 5   wage_eur                     6895 non-null   int64  
 6   height_cm                    6895 non-null   int64  
 7   weight_kg                    6895 non-null   int64  
 8   club_name                    6895 non-null   object 
 9   league_level                 6895 non-null   int64  
 10  club_jersey_number           6895 non-null   int64  
 11  club_contract_valid_until    6895 non-null   int64  
 12  nationality_name             6895 non-null   object 
 13  weak_foot         

In [9]:
# Count of null values per column
dfv1_na = dfv1.isnull().sum()

# Filter only columns with missing values
missing_values_tr = dfv1_na[dfv1_na > 0]

# Compute percentage of missing values
missing_values_percentage = (missing_values_tr / len(dfv1)) * 100

# Display the result
print("Columns with missing values:")
print(missing_values_tr)
print("\nPercentage of missing values:")
print(missing_values_percentage)

Columns with missing values:
value_eur               2
release_clause_eur    353
pace                  631
shooting              631
passing               631
dribbling             631
defending             631
physic                631
dtype: int64

Percentage of missing values:
value_eur             0.029007
release_clause_eur    5.119652
pace                  9.151559
shooting              9.151559
passing               9.151559
dribbling             9.151559
defending             9.151559
physic                9.151559
dtype: float64


## Preliminary considerations: class imbalance
First of all, for evaluating the model it is worth noticing, again, that the dataset is highly imbalanced. The plots below show this characteristic of the dataset, where the first plot shows the relative frequencies of the positions (labels) with all of the possible positions and the second one shows the relative frequencies once we take out from the dataset goalkeepers (which will not be included in the model training, as it is possible to make a perfect prediction for that class *GK* with any of the variables *pace, shooting, passing, dribbling, defending* or *physic*, where missing values here correspond to goalkeepers).

### Imbalancing in the training dataset

In [20]:
# We save the relative frequencies of the 'position_grouped' column
relative_freq = dfv1['position_grouped'].value_counts(normalize=True).reset_index()

# We change the naming of the columns
relative_freq.columns = ['Position', 'Relative Frequency']

# Create a bar plot with Plotly
fig = px.bar(relative_freq, x='Position', y='Relative Frequency',
                title=f'Relative Frequency Barplot of the variable Position')

# Show the plot
fig.show()

In [21]:
# Now, we leave out the goalkeeper class
dfv1_nogk = dfv1[dfv1['position'] != 'GK']

# We save the relative frequencies of the 'position_grouped' column
relative_freq = dfv1_nogk['position_grouped'].value_counts(normalize=True).reset_index()

# We change the naming of the columns
relative_freq.columns = ['Position', 'Relative Frequency']

# Create a bar plot with Plotly
fig = px.bar(relative_freq, x='Position', y='Relative Frequency',
                title=f'Relative Frequency Barplot of the variable Position (leaving out goalkeepers)')

# Show the plot
fig.show()

In [34]:
# Count of null values per column
dfv1_nogk_na = dfv1_nogk.isnull().sum()

# Filter only columns with missing values
missing_values_tr = dfv1_nogk_na[dfv1_nogk_na > 0]

# Compute percentage of missing values
missing_values_percentage = (missing_values_tr / len(dfv1_nogk)) * 100

# Display the result
print("Columns with missing values:")
print(missing_values_tr)
print("\nPercentage of missing values:")
print(missing_values_percentage)

Columns with missing values:
value_eur               2
release_clause_eur    320
dtype: int64

Percentage of missing values:
value_eur             0.031928
release_clause_eur    5.108557
dtype: float64


### Insights

It can be observed that there are some classes that are highly overrepresented (especially LCB and RCB, which add up to 20% of the observations in the training dataset). In this context, then, some considerations must be made for the training, evaluation and prediction with this type of dataset:
- **Metric trap**: "Relying on simplistic metrics like accuracy_score can lead to misleading results. In datasets where classes are highly imbalanced, if a classifier consistently predicts the most prevalent class without considering any feature analysis, it can achieve a high accuracy rate [or any other performance measure] that is clearly deceptive."
- **Solutions for addressing the problem of imbalanced datasets**:
    - *Possible solution to metric trap: confusion matrix*. Given that we have to predict for different classes, maybe it would also be possible to compute the performance metric (such as the F1-score) for each position (or groups of positions), in order to have a more complete picture of the predictive power of the model.
    - *Resampling methodologies*. Mainly, oversampling the underrepresented classes and/or undersampling the majority classes through random resampling ("often with replacement"). See application in notebook 9 from class. Limitations:
        - "Over-sampling, for instance, can lead to overfitting when the minority class is duplicated indiscriminately."
        - "Under-sampling, which involves randomly removing records from the majority class, can result in information loss."
        - Alternative: SMOTE (Synthetic Minority Over-Sampling Technique). "It helps balance class distribution and reduce the impact of class imbalance on classification models." The limitation, of course, is that synthetic observations may not be accurate representations of their real counterparts. See section 1.3.2. of the notebook 9 from class.
- Finally, the models that are implemented below should apply **multiclass algorithms** (as, in this case, we do not have a binary classification problem). 

# 3. Feature selection and evaluation
Things to do in this section:
1. **Split** the data.
2. Impute **missing values** (if there is no listwise deletion).
3. Clean **outliers** (imputation or deletion).
4. **Standardization** of numerical variables (needed by KNN and for feature selection with logistic regression). Apply same standardization to validation data. No further transformations will be needed if categorical variables have been OHE or encoded in some other way.
5. Deal with **class imbalancing** for the training data (from `imblearn` - `over_sampling`, `under_sampling`, import `RandomUnderSampler`, `RandomOverSampler` and `SMOTE`).
6. **Hyperparameter tuning** (grid search or randomized search - `RandomizedSearchCV`).
7. Evaluate model through **cross-validation** (considering class imbalancing, stratified cross-validation is probably the best option).
8. Rank models with **evaluation metrics** (mainly: precision, recall, F1-score and AUC).
9. After finding the best-performing model (in this case, guided by the best F1-score), train the best model with the whole training dataset and the optimal hyperparameters.

How and when to apply resampling methodologies:
- Resampling methodologies should be applied after the train-validation split.
- Explanation:
    - Avoid information leakage: Resampling before splitting can cause synthetic data (oversampling) or missing data (undersampling) from the same original observations to appear in both train and validation sets. This results in overly optimistic validation performance.
    - True model evaluation: Resampling alters the class distribution, which should only affect the training data. The validation set should reflect the original distribution for an accurate evaluation of how the model would perform in the real world.
- Process:
    - Split the data into training and validation sets.
    - Preprocessing and feature engineering (so cleaning missing values, outliers and standardizing come BEFORE the application of resampling techniques, which can be affected if these transformations are not applied before). Concretely, the application of SMOTE would be biased if numerical features are not scaled beforehand.
    - Apply resampling techniques (oversampling, undersampling, or SMOTE) only on the training set.
    - Train the model on the resampled training set.
    - Evaluate the model on the original, untouched validation set.

Stratified cross-validation and resampling. Does it make sense to apply both of them?
- Without Resampling: Stratified cross-validation is ideal for imbalanced datasets because it ensures that each fold maintains the same class distribution as the original dataset.
- With Resampling: Stratification is less relevant because the class distribution in the training folds will be altered by resampling. However, you can still use stratified cross-validation on the unresampled validation set to evaluate the model. The goal here is to maintain the real-world class distribution in the validation set while resampling the training data within each fold.
- Suggested Strategy:
    - Use stratified splits for train-validation splitting.
    - Apply resampling within the training folds during each iteration of cross-validation.

## Baseline models
Below, we train and evaluate some baseline models, with and without resampling methodologies, in order to get an initial grasp of which model may have a higher predictive power and whether dealing with class imbalancing is worth the effort or not considering the performance metric that matters the most in this case (the F1-score).

In the models below, we:
1. **Split** the data.
2. Impute **missing values** with a simple imputer (median).
3. Don't do anything to **outliers**.
4. **Standardization** numerical variables for KNN and logistic regression. Apply same standardization to validation data.
5. (Not applied for the first baseline models, applied in the second) Deal with **class imbalancing** for the training data (from `imblearn` - `over_sampling`, `under_sampling`, import `RandomUnderSampler`, `RandomOverSampler` and `SMOTE`).
6. **Hyperparameter tuning** (randomized search - `RandomizedSearchCV`).
7. Evaluate model through **stratified cross-validation**.
8. Rank models with **evaluation metrics** (mainly: F1-score).

All of the models below have been trained with the same features, where we discard those which we have seen that do not have much predictive power or that are highly correlated with other features (so that they are not expected to add much predictive power). The main reasons for this choice are the correlation matrix from the EDA, the boxplots from EDA and preprocessing and the heatmaps of deviations for categorical variables, which have been used here as **filtering methods**. Specifically, we have excluded:

### Feature selection (lists of features)
**NUMERICAL VARIABLES**:
- *overall, potential*, as they do not have much predictive power.
- *wage_eur* is not expected to have a high predictive power. On the other hand, *release_clause_eur* has a linear correlation of 1 with *value_eur*, so the first is discarded.
- Attacking variables (2/5 removed):
    - *attacking_short_passing* is highly correlated with *attacking_crossing* (0.78), and the former has less predictive power than the latter. Thus, the first will not be included.
    - By the same reason, *attacking_volleys* (with a correlation of 0.89 with *attacking_finishing*) won't be included either.
- Skill variables (3/5 removed):
    - *skill_dribbling, skill_fk_accuracy* and *skill_ball_control* are removed due to their high correlation with *skill_curve* and because they have similar variability across positions.
    - *skill_long_passing* shows different variability, so it is kept.
- Movement variables (4/5 removed):
    - *movement_sprint_speed*, *movement_agility* and *movement_balance* are removed, due to their very high correlation with *movement_acceleration* and similar variability (though *movement_balance* has a correlation of "only" 0.71).
    - *movement_reactions* is also removed due to an apparently low predictive power.
- Power variables (3/5 removed):
    - *power_shot_power* is removed, due to the high correlation (0.8) and similar variability to *power_long_shots* (which seems to have higher variability).
    - *power_stamina* is also removed, since it does not have significant variability across non-GK positions.
    - *power_jumping* exhibits very similar variability across positions to *power_strenght*, but the latter seems more discriminatory. Therefore, the former is dropped.
- Mentality variables (3/5 removed):
    - *mentality_aggression* is highly correlated with *mentality_interceptions* and shows similar variability across positions (but with less apparent discriminatory power), so the first is dropped. 
    - *mentality_positioning* and *mentality_vision* are highly correlated with *mentality_penalties* and have similar variability across positions, so the first two are removed.
    - *mentality_composture* does not exhibit significant variability, so it is removed.
- Defending features (2/3 removed): all defending features are very highly correlated (> 0.9), so we only keep the one with the highest apparent discriminatory power, *defending_marking_awareness*.
- *pace*, *shooting*, *passing*, *dribbling* and *defending* are all kept.

**CATEGORICAL VARIABLES**:
- Excluded all features which are not expected to have a high predictive power: *club_contract_valid_until*, *league_level*, *club_jersey_number*, *short_name*, *club_name*, *nationality_name*, *league_country*, *real_face*.
- It is also important to remember to drop one dummy for KNN and logistic regression (but it is better to keep it for the decision tree).

In [17]:
# List of numerical features (22 in total)
numerical_features = [
    'height_cm', 'weight_kg',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'skill_curve', 'skill_long_passing','movement_acceleration', 'power_strength', 'power_long_shots',
    'mentality_interceptions', 'mentality_penalties', 'defending_marking_awareness', 
    'value_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 
    'physic', 'age', 'height_weight_ratio'
]

# List of categorical ordinal features (3 in total)
categorical_ordinal_features = [
    'weak_foot', 'skill_moves', 'international_reputation'
]

# List of categorical nominal features (11 in total)
categorical_nominal_features_all_dummies = [
    'preferred_foot_Right',	
    'preferred_foot_Left', 'work_rate_High/High', 'work_rate_High/Low', 'work_rate_High/Medium',
    'work_rate_Low/High', 'work_rate_Low/Low', 'work_rate_Low/Medium',
    'work_rate_Medium/High', 'work_rate_Medium/Low',
    'work_rate_Medium/Medium'
]

# List of categorical nominal features (9 in total)
categorical_nominal_features = [
    'preferred_foot_Right',	'work_rate_High/Low', 'work_rate_High/Medium',
    'work_rate_Low/High', 'work_rate_Low/Low', 'work_rate_Low/Medium',
    'work_rate_Medium/High', 'work_rate_Medium/Low',
    'work_rate_Medium/Medium'
]

# Complete list of categorical features (with all dummies, for decision trees) (36 features)
categorical_features_all_dummies = categorical_nominal_features_all_dummies + categorical_ordinal_features

# Complete list of categorical features (with k-1 dummies, for KNN and logistic regresion) (34 features)
categorical_features = categorical_nominal_features + categorical_ordinal_features

### General training function

In [15]:
def baseline_models(
        df: pd.DataFrame, numerical_features: list[str], categorical_features: list[str], 
        model, target: str, n_hyper_comb: int = 10, n_folds: int = 5, num_imputer: str = "median",
        goalkeeper_label: str = "GK"
):
    
    """
    Function to evaluate baseline machine learning models (Logistic Regression, K-Nearest Neighbors, 
    Decision Trees) for a classification task, with hyperparameter tuning using RandomizedSearchCV. 
    It preprocesses numerical and categorical features, performs model fitting, and evaluates 
    performance on the entire dataset. Goalkeeper (GK) players are handled separately with special 
    assumptions based on the missing 'pace' feature.

    Parameters:
    -----------
    df : pd.DataFrame
        The input dataset containing both numerical and categorical features as well as the target variable.
        
    numerical_features : list[str]
        List of column names representing numerical features in the dataset.

    categorical_features : list[str]
        List of column names representing categorical features in the dataset.
        
    model : class
        The model class to be used for training and evaluation (LogisticRegression, KNeighborsClassifier, 
        or DecisionTreeClassifier).
        
    target : str
        The name of the target variable (i.e., the variable to be predicted).
        
    n_hyper_comb : int, default=10
        The number of hyperparameter combinations to sample for RandomizedSearchCV.

    n_folds : int, default=5
        The number of folds to use in cross-validation for RandomizedSearchCV.
        
    num_imputer : str, default="median"
        The strategy to use for imputing missing values in numerical features (e.g., "mean", "median").
        
    goalkeeper_label : str, default="GK"
        The label to assign to goalkeepers (assumed to have missing 'pace' feature values).

    Returns:
    --------
    df_combined_report : pd.DataFrame
        DataFrame containing the classification report with metrics for all classes, including goalkeepers.
        
    optimal_hyperparameters : dict
        A dictionary of the optimal hyperparameters found by RandomizedSearchCV.
    """

    # We check if the model is one of the allowed model classes
    allowed_models = (LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier)
    
    if model not in allowed_models:
        raise ValueError(f"Invalid model type. Allowed models are: {allowed_models}")

    ###########################################################################

    # First, we create the pipeline for preprocessing numerical and categorical transformers

    # For numerical variables, we do median imputation
    num_prepr = [
        ("imputer", SimpleImputer(strategy = num_imputer))
    ]

    # We add standardization for certain models that benefit from it
    if model == LogisticRegression or model == KNeighborsClassifier:
        num_prepr.append(("standardize", StandardScaler()))

    # For categorical variables, for now, we don't do anything (pass through)
    cat_prepr = [
        ("passthrough", "passthrough")
    ]

    # Create the ColumnTransformer to apply different transformations to numerical
    # and categorical variables
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(num_prepr), numerical_features),
            ('cat', Pipeline(cat_prepr), categorical_features),
        ]
    )

    # Create steps of the pipeline, which begins with the preprocessor. 
    steps = [('preprocessor', preprocessor)]

    ###########################################################################

    # Second, depending on the model that has been chosen, we append one model
    # or another, and define the set of hyperparameters accordingly

    if model == LogisticRegression:
        steps.append(('logreg', 
                                OneVsRestClassifier(model( # With OVR, we make each class a binary problem (one-versus-the-rest)
                                fit_intercept = True,
                                max_iter = 1000,
                                penalty = 'l2', # Ridge regularization
                                ))))
        distributions = {
        'logreg__estimator__C': uniform(loc=0.01, scale=10),
        'logreg__estimator__class_weight': [None, 'balanced']
        }

    elif model == KNeighborsClassifier:
        steps.append(('knn', model()))
        distributions = {
        'knn__n_neighbors': randint(low = 1, high = 25),
        'knn__weights': ['uniform', 'distance']
        }
    
    elif model == DecisionTreeClassifier:
        steps.append(('dtc', 
                                model(
                                random_state = 17, # Ridge regularization
                                )))
        distributions = {
        'dtc__max_depth': randint(low = 1, high = 100),
        'dtc__max_features': randint(low = 1, high = (len(df[numerical_features + categorical_features].columns))),
        'dtc__max_leaf_nodes': randint(low = 1, high = 5000),
        'dtc__min_samples_split': uniform(loc=0.001, scale=0.199),
        'dtc__min_samples_leaf': uniform(loc=0.001, scale=0.199),
        'dtc__class_weight': [None, 'balanced']
        }
    
    ###########################################################################

    # Third, we create the pipeline with the preprocessing and the model
    pipeline = Pipeline(steps)

    ###########################################################################

    # Fourth, we split the dataset into goalkeepers and non-goalkeepers: those
    # with missing values in the "pace" variable are assumed to be GKs (which
    # is true according to what was found in the EDA)

    df_gk = df[df['pace'].isna()].copy()
    df_nongk = df[~df['pace'].isna()].copy()

    # We divide the data frames depending on the variables

    X_nongk = df_nongk[numerical_features + categorical_features]
    y_nongk = df_nongk[target]

    y_gk = df_gk[target]

    ###########################################################################

    # Fifth, we do the randomized search with the model (only with the data w/o GK)
    
    clf = RandomizedSearchCV(
        estimator = pipeline, 
        param_distributions = distributions,
        n_iter = n_hyper_comb, # Default is 10
        scoring = ['f1_weighted', 'roc_auc_ovr_weighted'],
        n_jobs = -1, # We use all available processors
        cv = n_folds, # We use stratified cross-validation with 5 folds
        verbose = 1,
        refit = 'f1_weighted', # Refit an estimator using the best found parameters on the whole dataset with the best 'f1_weighted' found
        random_state = 17, 
        error_score = np.nan, 
        return_train_score = False)

    search = clf.fit(X_nongk, y_nongk)

    ###########################################################################

    # Sixth, we save the main results from the search

    # Best f1_weighted score from the search
    best_f1_weighted_score = search.best_score_

    # Best hyperparameters from the search
    optimal_hyperparameters = search.best_params_

    ###########################################################################

    # Seventh, we run the predictions training with the optimal hyperparameters 
    # the whole dataset (note that this can lead to misleading results with higher
    # evaluation metrics than what we would get with the test set, as the
    # prediction is made with the same training dataset; in any case it can be useful
    # for indicating the labels which are missclassified more often)

    # Predictions for non-goalkeepers
    y_pred_nongk = search.best_estimator_.predict(X_nongk)

    # Predictions for goalkeepers: all observations here are predicted as GK
    y_pred_gk = [goalkeeper_label] * len(df_gk)

    # Combine predictions into a single dataframe
    y_combined = pd.concat([y_nongk, y_gk], axis=0)
    y_pred_combined = pd.concat([pd.Series(y_pred_nongk), pd.Series(y_pred_gk)], axis=0, ignore_index = True)

    ###########################################################################

    # Eigth, we evaluate combined metrics (note that we set that metrics with warnings
    # will be set to 0, for instance if we get a precision = 0 for some class)
    combined_report_dict = classification_report(y_combined, y_pred_combined, output_dict = True, zero_division = 0)
    df_combined_report = pd.DataFrame(combined_report_dict)

    ###########################################################################

    # Ninth, we return the results (metrics for classes computed for the training
    # set), including the best hyperparameters found - note that the results from 
    # KNN in df_combined_report will be entirely misleading, as the prediction 
    # is made with the same training data (so in that case, we just consider the 
    # best parameters and best score from the randomized search)

    if model == KNeighborsClassifier:
        return optimal_hyperparameters, best_f1_weighted_score
    else:
        return df_combined_report, optimal_hyperparameters, best_f1_weighted_score

Useful documentation:
- Randomized Search: https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
- Scoring: https://scikit-learn.org/1.5/modules/model_evaluation.html#scoring-parameter
- Pipeline: https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
- Models:
    - One vs rest classifier (for multiclass classification): https://scikit-learn.org/1.5/modules/generated/sklearn.multiclass.OneVsRestClassifier.html
    - Logistic Regression: https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
    - Decision trees in sklearn can handle directly multiclass classification: "DecisionTreeClassifier is capable of both binary (where the labels are [-1, 1]) classification and multiclass (where the labels are [0, …, K-1]) classification." (https://scikit-learn.org/1.5/modules/tree.html). Also, see https://scikit-learn.org/1.5/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    - KNN Classifier: https://scikit-learn.org/1.5/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

### Without considering class imbalancing

#### Logistic regression

In [100]:
df_report_logreg, best_params_logreg, f1_w_logreg = baseline_models(
    df = dfv1, numerical_features = numerical_features, categorical_features = categorical_features, 
    model = LogisticRegression, target = 'position_grouped', n_hyper_comb = 20, n_folds = 5, num_imputer = "median",
    goalkeeper_label = "G_GK"
    )

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [101]:
df_report_logreg

Unnamed: 0,A_LF,A_LS,A_LW,A_RF,A_RS,A_RW,A_ST,D_CB,D_LB,D_LCB,...,M_CM,M_LCM,M_LDM,M_LM,M_RCM,M_RDM,M_RM,accuracy,macro avg,weighted avg
precision,0.5,0.238095,0.342105,0.0,0.142857,0.486486,0.482846,1.0,0.596273,0.660622,...,0.0,0.291971,0.285714,0.282878,0.292035,0.34,0.288177,0.48702,0.394028,0.472567
recall,0.03125,0.049751,0.078313,0.0,0.00995,0.108434,0.88785,0.011976,0.829374,0.40412,...,0.0,0.378251,0.079602,0.308943,0.468085,0.084577,0.317073,0.48702,0.300556,0.48702
f1-score,0.058824,0.082305,0.127451,0.0,0.018605,0.17734,0.625514,0.023669,0.693767,0.501475,...,0.0,0.329557,0.124514,0.295337,0.359673,0.135458,0.301935,0.48702,0.276898,0.43264
support,32.0,201.0,166.0,32.0,201.0,166.0,428.0,167.0,463.0,631.0,...,75.0,423.0,201.0,369.0,423.0,201.0,369.0,0.48702,6895.0,6895.0


In [102]:
best_params_logreg

{'logreg__estimator__C': 6.534186154656548,
 'logreg__estimator__class_weight': None}

In [103]:
f1_w_logreg

0.3493697288467855

#### K-Nearest Neighbors

In [18]:
best_params_knn, f1_w_knn = baseline_models(
    df = dfv1, numerical_features = numerical_features, categorical_features = categorical_features, 
    model = KNeighborsClassifier, target = 'position_grouped', n_hyper_comb = 20, n_folds = 5, num_imputer = "median",
    goalkeeper_label = "G_GK"
    )

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [19]:
best_params_knn

{'knn__n_neighbors': 17, 'knn__weights': 'distance'}

In [20]:
f1_w_knn

0.30547197916462554

#### Decision Tree

In [21]:
df_report_dt, best_params_dt, f1_w_dt = baseline_models(
    df = dfv1, numerical_features = numerical_features, categorical_features = categorical_features_all_dummies, 
    model = DecisionTreeClassifier, target = 'position_grouped', n_hyper_comb = 100, n_folds = 5, num_imputer = "median",
    goalkeeper_label = "G_GK"
    )

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [108]:
df_report_dt

Unnamed: 0,A_LF,A_LS,A_LW,A_RF,A_RS,A_RW,A_ST,D_CB,D_LB,D_LCB,...,M_CM,M_LCM,M_LDM,M_LM,M_RCM,M_RDM,M_RM,accuracy,macro avg,weighted avg
precision,0.0,0.363095,0.275862,0.0,0.34375,0.2125,0.542714,0.0,0.570552,0.56006,...,0.0,0.337545,0.235669,0.327014,0.353635,0.245455,0.386423,0.498912,0.298748,0.439437
recall,0.0,0.303483,0.144578,0.0,0.164179,0.10241,0.757009,0.0,0.803456,0.591125,...,0.0,0.44208,0.18408,0.373984,0.425532,0.134328,0.401084,0.498912,0.317802,0.498912
f1-score,0.0,0.330623,0.189723,0.0,0.222222,0.138211,0.632195,0.0,0.667265,0.575173,...,0.0,0.382805,0.206704,0.348925,0.386266,0.173633,0.393617,0.498912,0.299714,0.459349
support,32.0,201.0,166.0,32.0,201.0,166.0,428.0,167.0,463.0,631.0,...,75.0,423.0,201.0,369.0,423.0,201.0,369.0,0.498912,6895.0,6895.0


In [22]:
best_params_dt

{'dtc__class_weight': None,
 'dtc__max_depth': 40,
 'dtc__max_features': 24,
 'dtc__max_leaf_nodes': 1244,
 'dtc__min_samples_leaf': 0.0013015285199047416,
 'dtc__min_samples_split': 0.010912693744184222}

In [23]:
f1_w_dt

0.3014515619237356

#### Insights
Overall, *logistic regression* is the best performer in this case, while a single decision tree and KNN exhibit a similar predictive power.

### Considering class imbalancing

### Extending the number of features (without considering class imbalancing)
Now, we keep all of the available features (except those that have not been preprocessed yet). Below, we only exclude:
- Excluded all features which are not expected to have a high predictive power: *club_contract_valid_until*, *short_name*, *club_name*, *nationality_name*, *league_country*, *real_face*.
- It is also important to remember to drop one dummy for KNN and logistic regression (but it is better to keep it for the decision tree).

In [6]:
# List of numerical features
numerical_features_ext = [
    'overall', 'potential', 'wage_eur', 'height_cm', 'weight_kg',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance',
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure',
    'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle',
    'value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 
    'physic', 'age', 'height_weight_ratio'
]

# List of categorical ordinal features 
categorical_ordinal_features_ext = [
    'league_level', 'weak_foot', 'skill_moves', 'international_reputation'
]

# List of categorical nominal features
categorical_nominal_features_ext_all_dummies = [
    'club_jersey_number', 'preferred_foot_Right',	
    'preferred_foot_Left',
    'work_rate_High/High', 'work_rate_High/Low', 'work_rate_High/Medium',
    'work_rate_Low/High', 'work_rate_Low/Low', 'work_rate_Low/Medium',
    'work_rate_Medium/High', 'work_rate_Medium/Low',
    'work_rate_Medium/Medium',
]

categorical_nominal_features_ext = [
    'club_jersey_number',
    'preferred_foot_Left',
    'work_rate_High/Low', 'work_rate_High/Medium',
    'work_rate_Low/High', 'work_rate_Low/Low', 'work_rate_Low/Medium',
    'work_rate_Medium/High', 'work_rate_Medium/Low',
    'work_rate_Medium/Medium',
]


# Complete list of categorical features
categorical_features_all_dummies_ext = categorical_nominal_features_ext_all_dummies + categorical_ordinal_features_ext
categorical_features_ext = categorical_nominal_features_ext + categorical_ordinal_features_ext

#### Logistic regression

In [7]:
df_report_logreg_ext, best_params_logreg_ext, f1_w_logreg_ext = baseline_models(
    df = dfv1, numerical_features = numerical_features_ext, categorical_features = categorical_features_ext, 
    model = LogisticRegression, target = 'position_grouped', n_hyper_comb = 20, n_folds = 5, num_imputer = "median",
    goalkeeper_label = "G_GK"
    )

Fitting 5 folds for each of 20 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
df_report_logreg_ext

Unnamed: 0,A_LF,A_LS,A_LW,A_RF,A_RS,A_RW,A_ST,D_CB,D_LB,D_LCB,...,M_CM,M_LCM,M_LDM,M_LM,M_RCM,M_RDM,M_RM,accuracy,macro avg,weighted avg
precision,0.5,0.325301,0.339286,1.0,0.37931,0.397059,0.508844,0.5,0.642738,0.655012,...,0.2,0.323017,0.386667,0.319149,0.327181,0.306667,0.33515,0.5095,0.461432,0.497698
recall,0.09375,0.134328,0.114458,0.03125,0.109453,0.162651,0.873832,0.017964,0.831533,0.445325,...,0.013333,0.394799,0.144279,0.325203,0.460993,0.114428,0.333333,0.5095,0.334228,0.5095
f1-score,0.157895,0.190141,0.171171,0.060606,0.169884,0.230769,0.643164,0.034682,0.725047,0.530189,...,0.025,0.355319,0.210145,0.322148,0.382728,0.166667,0.334239,0.5095,0.326442,0.4676
support,32.0,201.0,166.0,32.0,201.0,166.0,428.0,167.0,463.0,631.0,...,75.0,423.0,201.0,369.0,423.0,201.0,369.0,0.5095,6895.0,6895.0


In [10]:
best_params_logreg_ext

{'logreg__estimator__C': 8.65042103546211,
 'logreg__estimator__class_weight': None}

In [9]:
f1_w_logreg_ext

0.36366735083524143

Logistic regression has performed better with all of the features.

#### Decision Tree

In [11]:
df_report_dt_ext, best_params_dt_ext, f1_w_dt_ext = baseline_models(
    df = dfv1, numerical_features = numerical_features_ext, categorical_features = categorical_features_ext, 
    model = DecisionTreeClassifier, target = 'position_grouped', n_hyper_comb = 100, n_folds = 5, num_imputer = "median",
    goalkeeper_label = "G_GK"
    )

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [12]:
df_report_dt_ext

Unnamed: 0,A_LF,A_LS,A_LW,A_RF,A_RS,A_RW,A_ST,D_CB,D_LB,D_LCB,...,M_CM,M_LCM,M_LDM,M_LM,M_RCM,M_RDM,M_RM,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,0.497674,0.0,0.407972,0.572254,...,0.0,0.236559,0.0,0.230263,0.226607,0.0,0.188259,0.406091,0.181915,0.326516
recall,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.75162,0.313788,...,0.0,0.104019,0.0,0.284553,0.475177,0.0,0.252033,0.406091,0.234804,0.406091
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.598322,0.0,0.528875,0.405322,...,0.0,0.144499,0.0,0.254545,0.30687,0.0,0.215527,0.406091,0.194631,0.3436
support,32.0,201.0,166.0,32.0,201.0,166.0,428.0,167.0,463.0,631.0,...,75.0,423.0,201.0,369.0,423.0,201.0,369.0,0.406091,6895.0,6895.0


In [13]:
best_params_dt_ext

{'dtc__class_weight': None,
 'dtc__max_depth': 13,
 'dtc__max_features': 55,
 'dtc__max_leaf_nodes': 1000,
 'dtc__min_samples_leaf': 0.027908784118733383,
 'dtc__min_samples_split': 0.05608362352155855}

In [14]:
f1_w_dt_ext

0.2651632261081637