In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.feature_selection import RFE
import joblib


### 0. Preparing DateTypes 

In [4]:
df = pd.read_csv('bundesliga_matches_final_show.csv')

In [7]:
pd.set_option('display.max_rows', None)
df.dtypes

date                                 object
time                                 object
comp                                 object
round                                object
day                                  object
venue                                object
result                               object
gf                                    int64
ga                                    int64
xg                                  float64
xga                                 float64
poss                                  int64
opponent_gf                           int64
opponent_ga                           int64
opponent_xg                         float64
opponent_xga                        float64
opponent_poss                         int64
attendance                            int64
captain                              object
opponent_captain                     object
home_team_formation                  object
away_team_formation                  object
referee                         

In [4]:
# Import necessary libraries
import pandas as pd

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

# Combine 'date' and 'time' columns into a single datetime feature
df['datetime'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['time'])

# Drop the 'time' column if not needed anymore
df = df.drop(columns=['time'])
# Making new column 'Hour' 
df['hour'] = df['datetime'].dt.hour

# Convert categorical columns to category dtype for memory efficiency
categorical_columns = ['comp', 'round', 'day', 'venue', 'result', 'captain', 'opponent_captain',
                       'home_team_formation', 'away_team_formation', 'referee']

df[categorical_columns] = df[categorical_columns].astype('category')





In [5]:
df['home_team_formation'].head(3)

0    04.03.2003
1       4-2-3-1
2    05.03.2002
Name: home_team_formation, dtype: category
Categories (19, object): ['03.04.2003', '03.05.2002', '04.03.2003', '04.04.2002', ..., '4-2-3-1', '4-3-1-2', '4-3-2-1', '4-4-1-1']

In [6]:
import re

# Function to fix date-like strings back into football formations
def fix_formation(entry):
    # Check if the entry is a date-like format (xx.xx.xxxx)
    if re.match(r'^\d{2}\.\d{2}\.\d{4}$', str(entry)):
        # Extract relevant digits
        first_digit = int(entry[:2].lstrip('0'))  # First formation number (before the first dot)
        second_digit = int(entry[3:5].lstrip('0'))  # Second formation number (after the first dot)
        third_digit = int(entry[-1])   # Third formation number

        # Return the corrected formation in the form of x-x-x
        return f"{first_digit}-{second_digit}-{third_digit}"
    
    # Return the original entry if it's not a date-like string
    return entry

# Apply the function to fix 'home_team_formation' and 'away_team_formation'
df['home_team_formation'] = df['home_team_formation'].apply(fix_formation)
df['away_team_formation'] = df['away_team_formation'].apply(fix_formation)

# Check the cleaned columns
df[['home_team_formation', 'away_team_formation']].head()


Unnamed: 0,home_team_formation,away_team_formation
0,4-3-3,4-2-3-1
1,4-2-3-1,4-2-3-1
2,5-3-2,4-2-3-1
3,4-1-4-1,4-3-3
4,4-4-2,3-4-3


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 73 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   date                              1260 non-null   datetime64[ns]
 1   comp                              1260 non-null   category      
 2   round                             1260 non-null   category      
 3   day                               1260 non-null   category      
 4   venue                             1260 non-null   category      
 5   result                            1260 non-null   category      
 6   gf                                1260 non-null   int64         
 7   ga                                1260 non-null   int64         
 8   xg                                1259 non-null   float64       
 9   xga                               1259 non-null   float64       
 10  poss                              1260 non-null 

### 1. Building initial model
Without imputation, and with just encoding the target first to W/L/D  and then W/L&D

#### 1.0 First lets impute few missing values

In [8]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()

# Filter to show only columns with missing values
missing_values = missing_values[missing_values > 0]

# Display the columns with missing values and their count
missing_values

xg               1
xga              1
opponent_xg      1
opponent_xga     1
referee          9
dist             3
fk               1
opponent_dist    1
opponent_fk      1
dtype: int64

Since there are only few will use simple imputer and dont bother much about them

In [9]:


# Impute missing values for numerical columns using the mean
numerical_columns = ['xg', 'xga', 'opponent_xg', 'opponent_xga', 'dist', 'fk', 'opponent_dist', 'opponent_fk']
num_imputer = SimpleImputer(strategy='mean')
df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

# Impute missing values for categorical columns using the most frequent value
categorical_columns = ['referee']
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

# Check if missing values are handled
missing_values_after = df.isnull().sum()
print(missing_values_after)


date                                0
comp                                0
round                               0
day                                 0
venue                               0
result                              0
gf                                  0
ga                                  0
xg                                  0
xga                                 0
poss                                0
opponent_gf                         0
opponent_ga                         0
opponent_xg                         0
opponent_xga                        0
opponent_poss                       0
attendance                          0
captain                             0
opponent_captain                    0
home_team_formation                 0
away_team_formation                 0
referee                             0
sh                                  0
sot                                 0
dist                                0
fk                                  0
pk          

#### 1.1 Lets see how model predicts if he got 3 target variables

In [10]:
rf = RandomForestClassifier(n_estimators=66, min_samples_split=10, random_state=42)

# Encode the result column into numerical values
label_encoder = LabelEncoder()
df['result_encoded'] = label_encoder.fit_transform(df['result'])

# Split the dataset into training and test sets
train = df[df["date"] < '2024-01-01']
test = df[df["date"] > '2024-01-01']

# Update the list of numerical predictors
predictors = [
    "team_overall", "team_attack", "team_midfield", "team_defense", 
    "opponent_overall", "opponent_attack", "opponent_midfield", "opponent_defense",
    "gf_last_4_games", "ga_last_4_games", "xg_last_4_games", "xga_last_4_games", 
    "avg_points_last_4_games", "sh_last_4_games", "sot_last_4_games", "poss_last_4_games", 
    "opponent_gf_last_4_games", "opponent_ga_last_4_games", "opponent_xga_last_4_games", 
    "opponent_avg_points_last_4_games", "team_salary", "opponent_team_salary", "hour"
]

# Fit the model on the training set using 'result_encoded' as the target
rf.fit(train[predictors], train['result_encoded'])

# Predict on the test set
predictions = rf.predict(test[predictors])

accuracy = accuracy_score(test["result_encoded"], predictions)

accuracy

0.5025125628140703

In [11]:
# Decoding predictions back to W/D/L for better interpretation (I just wanted to test that and see how it works)
predictions_decoded = label_encoder.inverse_transform(predictions)
actual_results_decoded = label_encoder.inverse_transform(test['result_encoded'])

# Confusion matrix
conf_matrix = confusion_matrix(actual_results_decoded, predictions_decoded, labels=['W', 'D', 'L'])
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, F1-score)
class_report = classification_report(actual_results_decoded, predictions_decoded, labels=['W', 'D', 'L'])
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[63  4 13]
 [37  6 11]
 [29  5 31]]
Classification Report:
              precision    recall  f1-score   support

           W       0.49      0.79      0.60        80
           D       0.40      0.11      0.17        54
           L       0.56      0.48      0.52        65

    accuracy                           0.50       199
   macro avg       0.48      0.46      0.43       199
weighted avg       0.49      0.50      0.46       199



#### 1.2 Now only when we trying to predict the Win

In [12]:

# Convert 'result' to binary: 1 for Win, 0 for Lose/Draw
df['win_binary'] = df['result'].apply(lambda x: 1 if x == 'W' else 0)

# Split the dataset into training and test sets
train = df[df["date"] < '2024-01-01']
test = df[df["date"] > '2024-01-01']

# List of predictors (same as before, using numerical features)
predictors = [
    "team_overall", "team_attack", "team_midfield", "team_defense", 
    "opponent_overall", "opponent_attack", "opponent_midfield", "opponent_defense",
    "gf_last_4_games", "ga_last_4_games", "xg_last_4_games", "xga_last_4_games", 
    "avg_points_last_4_games", "sh_last_4_games", "sot_last_4_games", "poss_last_4_games", 
    "opponent_gf_last_4_games", "opponent_ga_last_4_games", "opponent_xga_last_4_games", 
    "opponent_avg_points_last_4_games", "team_salary", "opponent_team_salary", "hour"
]

# Initialize the Random Forest model
rf = RandomForestClassifier(n_estimators=66, min_samples_split=10, random_state=42)

# Fit the model on the training set with the new binary target
rf.fit(train[predictors], train['win_binary'])

# Predict on the test set
predictions = rf.predict(test[predictors])

# Evaluate the model
accuracy = accuracy_score(test['win_binary'], predictions)
conf_matrix = confusion_matrix(test['win_binary'], predictions)
class_report = classification_report(test['win_binary'], predictions)

# Display results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Accuracy: 0.6532663316582915
Confusion Matrix:
[[82 37]
 [32 48]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.69      0.70       119
           1       0.56      0.60      0.58        80

    accuracy                           0.65       199
   macro avg       0.64      0.64      0.64       199
weighted avg       0.66      0.65      0.65       199



### 2. Preparing the pipeline with encoding for Models

In [48]:
# Define numerical and categorical columns
numerical_features = ["team_overall", "team_attack", "team_midfield", "team_defense", 
                        "opponent_overall", "opponent_attack", "opponent_midfield", "opponent_defense",
                        "gf_last_4_games", "ga_last_4_games", "xg_last_4_games", "xga_last_4_games", 
                        "avg_points_last_4_games", "sh_last_4_games", "sot_last_4_games", "poss_last_4_games", 
                        "opponent_gf_last_4_games", "opponent_ga_last_4_games", "opponent_xga_last_4_games", 
                        "opponent_avg_points_last_4_games", "team_salary", "opponent_team_salary", "hour"]

categorical_features = ["venue", "day", "home_team_formation", "away_team_formation", "referee"]

# Define the preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values for numerical features
                ('scaler', StandardScaler())  # Scale numerical features
            ]), numerical_features),
            
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical features
                ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
            ]), categorical_features)
        ])

# Split the dataset based on date
train = df[df["date"] < '2023-09-01']
test = df[df["date"] >= '2023-09-01']

# Extract the features (X) and target (y)
X_train = train[numerical_features + categorical_features]
y_train = train['result_encoded']
X_test = test[numerical_features + categorical_features]
y_test = test['result_encoded']

# Pipeline ready for model integration
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', None)  # Placeholder for the classifier
])

### 3. Testing Models (All parameters were little bit tuned before so its not the whole procces and i focused on time constrain of my project and capability of my PC)

In [14]:
df.head(5)

Unnamed: 0,date,comp,round,day,venue,result,gf,ga,xg,xga,...,opponent_dist_last_4_games,opponent_fk_last_4_games,opponent_pk_last_4_games,opponent_pkatt_last_4_games,opponent_xg_last_4_games,opponent_poss_last_4_games,datetime,hour,result_encoded,win_binary
0,2020-09-26,Bundesliga,Matchweek 2,Sat,Home,W,1,0,0.2,1.4,...,12.0,0.0,0.0,0.0,2.7,49.0,2020-09-26 15:30:00,15,2,1
1,2020-10-17,Bundesliga,Matchweek 4,Sat,Home,L,1,4,1.4,3.1,...,16.966667,1.333333,0.666667,0.666667,2.966667,66.666667,2020-10-17 18:30:00,18,1,0
2,2020-10-31,Bundesliga,Matchweek 6,Sat,Home,L,0,2,0.3,2.4,...,16.2,0.5,0.0,0.0,1.925,68.5,2020-10-31 15:30:00,15,1,0
3,2020-11-21,Bundesliga,Matchweek 8,Sat,Home,L,1,2,0.0,1.2,...,19.775,0.0,0.25,0.25,1.35,61.5,2020-11-21 15:30:00,15,1,0
4,2020-12-05,Bundesliga,Matchweek 10,Sat,Home,W,2,1,1.2,2.8,...,17.25,0.25,0.5,0.5,2.3,37.0,2020-12-05 15:30:00,15,2,1


#### 3.1 XGB

In [15]:


# Parameters
param_grid = {
    'classifier__n_estimators': [100],  # Fix based on the best result
    'classifier__learning_rate': [0.01, 0.02],  # Slightly explore around the best result
    'classifier__max_depth': [3, 4, 5],  # Explore depths around the best result
    'classifier__subsample': [0.7],  # Fix based on the best result
    'classifier__colsample_bytree': [0.7], # Fix based on the best result  
    'classifier__gamma': [0.3]  # Fix based on the best result
}

# Replace the classifier in the pipeline with XGBoost configured to use GPU
pipeline.steps[-1] = ('classifier', XGBClassifier(device='cuda', n_jobs=-1))

# Set up GridSearchCV with the updated parameter grid
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV model
grid_search.fit(X_train, y_train)

# Get predictions on the test set
y_pred = grid_search.predict(X_test)

# Confusion matrix using the encoded values for Draw (0), Loss (1), Win (2)
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

# Calculate per-class accuracy (Draw, Loss, Win)
per_class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)
draw_accuracy = per_class_accuracy[0]  # Accuracy for 'Draw' (0)
loss_accuracy = per_class_accuracy[1]  # Accuracy for 'Loss' (1)
win_accuracy = per_class_accuracy[2]  # Accuracy for 'Win' (2)

# Best parameters and score from GridSearchCV
best_params_grid = grid_search.best_params_
print(f"Best parameters from GridSearchCV: {best_params_grid}")
print(f"Best score from GridSearchCV: {grid_search.best_score_:.4f}")

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test set accuracy: {test_accuracy:.4f}")

# Store best results including accuracy for Win, Draw, Loss, and test accuracy
results_df = pd.DataFrame(columns=["Model", "Best Score", "Best Params", "Win Accuracy", "Draw Accuracy", "Loss Accuracy", "Test Accuracy"])

# Use pd.concat instead of append
new_row = pd.DataFrame({
    "Model": ["XGBoost"],
    "Best Score": [grid_search.best_score_],
    "Best Params": [best_params_grid],
    "Win Accuracy": [win_accuracy],
    "Draw Accuracy": [draw_accuracy],
    "Loss Accuracy": [loss_accuracy],
    "Test Accuracy": [test_accuracy]
})

# Concatenate the new row with the results DataFrame
results_df = pd.concat([results_df, new_row], ignore_index=True)

# Save the results to a CSV
results_df.to_csv("xgboost_tuning_results_with_accuracy.csv", index=False)

# Display the final results DataFrame
results_df


Best parameters from GridSearchCV: {'classifier__colsample_bytree': 0.7, 'classifier__gamma': 0.3, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.7}
Best score from GridSearchCV: 0.5021
Test set accuracy: 0.5525


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  results_df = pd.concat([results_df, new_row], ignore_index=True)


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Draw Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost,0.502137,"{'classifier__colsample_bytree': 0.7, 'classif...",0.864286,0.047059,0.545455,0.552469


Im having low draw accuracy

### 3.2 Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import pandas as pd

# Define the broad parameter grid for Random Forest
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],  
    'classifier__max_depth': [10, 20, None],  
    'classifier__min_samples_split': [2, 5, 10],  
    'classifier__min_samples_leaf': [1, 2, 4],  
    'classifier__bootstrap': [True, False] 
}

# Replace the classifier in the pipeline with Random Forest
pipeline.steps[-1] = ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))

# Set up GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(pipeline, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV model for Random Forest
grid_search_rf.fit(X_train, y_train)

# Get predictions on the test set for Random Forest
y_pred_rf = grid_search_rf.predict(X_test)

# Confusion matrix using the encoded values for Draw (0), Loss (1), Win (2)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1, 2])

# Calculate per-class accuracy (Draw, Loss, Win)
per_class_accuracy_rf = conf_matrix_rf.diagonal() / conf_matrix_rf.sum(axis=1)
draw_accuracy_rf = per_class_accuracy_rf[0]  # Accuracy for 'Draw' (0)
loss_accuracy_rf = per_class_accuracy_rf[1]  # Accuracy for 'Loss' (1)
win_accuracy_rf = per_class_accuracy_rf[2]  # Accuracy for 'Win' (2)

# Best parameters and score from GridSearchCV for Random Forest
best_params_rf = grid_search_rf.best_params_
print(f"Best parameters from Random Forest GridSearchCV: {best_params_rf}")
print(f"Best score from Random Forest GridSearchCV: {grid_search_rf.best_score_:.4f}")

# Evaluate the model on the test set
test_accuracy_rf = grid_search_rf.score(X_test, y_test)
print(f"Test set accuracy for Random Forest: {test_accuracy_rf:.4f}")

# Append Random Forest results to the existing results_df
new_row_rf = pd.DataFrame({
    "Model": ["RandomForest"],
    "Best Score": [grid_search_rf.best_score_],
    "Best Params": [best_params_rf],
    "Win Accuracy": [win_accuracy_rf],
    "Draw Accuracy": [draw_accuracy_rf],
    "Loss Accuracy": [loss_accuracy_rf],
    "Test Accuracy": [test_accuracy_rf]
})

# Concatenate the new row with the existing results DataFrame
results_df = pd.concat([results_df, new_row_rf], ignore_index=True)

# Save the updated results to a CSV
results_df.to_csv("tuning_results_with_accuracy.csv", index=False)

# Display the final results DataFrame with Random Forest results included
results_df


Best parameters from Random Forest GridSearchCV: {'classifier__bootstrap': True, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Best score from Random Forest GridSearchCV: 0.5064
Test set accuracy for Random Forest: 0.5556


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Draw Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost,0.502137,"{'classifier__colsample_bytree': 0.7, 'classif...",0.864286,0.047059,0.545455,0.552469
1,RandomForest,0.50641,"{'classifier__bootstrap': True, 'classifier__m...",0.885714,0.023529,0.545455,0.555556


Since i found out that models struggle with predicting draws i decided to investigate it

### 3.2.1 Why Draws Are Hard to Predict: (I elaborate a little bit on that on my presentation)

1. **Low Frequency**:
   - Draws occur much less frequently than wins and losses, making it difficult for models to learn patterns associated with draws. In football leagues, draws typically represent only around 25-27% of outcomes. This class imbalance biases models toward predicting wins or losses.

2. **Unpredictable Factors**:
   - Draws are often the result of unpredictable factors such as team tactics, missed opportunities, or random events during the game. These small, nuanced details are hard for models to capture effectively, as draws are influenced by both defensive and offensive strategies.

3. **Model Limitations**:
   - Traditional machine learning models, as well as statistical approaches, have struggled to predict draws accurately. They tend to focus on more distinct patterns related to wins and losses, resulting in lower accuracy for draws. The presence of multiple potential outcomes (win, draw, loss) further complicates predictions.

### Why Shift to **Win/Not Win** Predictions:

1. **Simplified Problem**:
   - By combining draws and losses into a single class (Not Win), the model's task becomes simpler: it only needs to predict whether a team wins or doesn’t win. This removes the complexity of distinguishing between draws and losses, leading to better predictive performance.

2. **Addressing Class Imbalance**:
   - Grouping draws with losses creates a more balanced dataset, reducing the issue of class imbalance. This allows the model to learn patterns related to both outcomes more effectively, without being skewed by the lower occurrence of draws.

3. **Improved Accuracy**:
   - Focusing on Win/Not Win predictions enables the model to provide clearer and more actionable insights, as predicting wins is often the key outcome in real-world scenarios (e.g., betting or match strategy). The simplified classification task leads to better overall accuracy.

#### Conclusion:
Due to the inherent difficulties of predicting draws and the advantages of a simplified binary classification task, shifting to **Win/Not Win** predictions will improve model performance and provide more meaningful predictions.





In [51]:
df.head(15)

Unnamed: 0,date,comp,round,day,venue,result,gf,ga,xg,xga,...,opponent_dist_last_4_games,opponent_fk_last_4_games,opponent_pk_last_4_games,opponent_pkatt_last_4_games,opponent_xg_last_4_games,opponent_poss_last_4_games,datetime,hour,result_encoded,win_binary
0,2020-09-26,Bundesliga,Matchweek 2,Sat,Home,W,1,0,0.2,1.4,...,12.0,0.0,0.0,0.0,2.7,49.0,2020-09-26 15:30:00,15,2,1
1,2020-10-17,Bundesliga,Matchweek 4,Sat,Home,L,1,4,1.4,3.1,...,16.966667,1.333333,0.666667,0.666667,2.966667,66.666667,2020-10-17 18:30:00,18,1,0
2,2020-10-31,Bundesliga,Matchweek 6,Sat,Home,L,0,2,0.3,2.4,...,16.2,0.5,0.0,0.0,1.925,68.5,2020-10-31 15:30:00,15,1,0
3,2020-11-21,Bundesliga,Matchweek 8,Sat,Home,L,1,2,0.0,1.2,...,19.775,0.0,0.25,0.25,1.35,61.5,2020-11-21 15:30:00,15,1,0
4,2020-12-05,Bundesliga,Matchweek 10,Sat,Home,W,2,1,1.2,2.8,...,17.25,0.25,0.5,0.5,2.3,37.0,2020-12-05 15:30:00,15,2,1
5,2020-12-16,Bundesliga,Matchweek 12,Wed,Home,L,0,1,0.6,0.4,...,18.75,0.25,0.0,0.0,0.9,48.0,2020-12-16 20:30:00,20,1,0
6,2021-01-02,Bundesliga,Matchweek 14,Sat,Home,L,0,1,0.9,2.5,...,19.9,0.75,0.5,0.5,1.625,56.0,2021-01-02 15:30:00,15,1,0
7,2021-01-10,Bundesliga,Matchweek 15,Sun,Home,W,1,0,0.7,0.6,...,17.325,0.25,0.0,0.0,1.3,57.0,2021-01-10 18:00:00,18,2,1
8,2021-01-20,Bundesliga,Matchweek 17,Wed,Home,W,3,0,1.8,1.4,...,15.775,0.0,0.5,0.5,1.9,53.5,2021-01-20 20:30:00,20,2,1
9,2021-01-23,Bundesliga,Matchweek 18,Sat,Home,L,1,5,0.5,2.4,...,17.1,0.5,0.5,0.5,1.775,55.5,2021-01-23 15:30:00,15,1,0


### 4. Testing Models just on binary classification Win/Not Win

In [49]:

# Extract the features (X) and target (y) for binary classification
X_train_binary = train[numerical_features + categorical_features]
y_train_binary = train['win_binary']  # Ensure 'win_binary' column exists in your DataFrame
X_test_binary = test[numerical_features + categorical_features]
y_test_binary = test['win_binary']

#### 4.1 XGB

In [55]:
param_grid_xgb = {
    'classifier__n_estimators': [100, 150],  # Number of trees
    'classifier__max_depth': [3, 6],  # Maximum depth of trees
    'classifier__learning_rate': [0.01, 0.1],  # Learning rate
    'classifier__subsample': [0.8, 1.0],  # Subsample ratio
    'classifier__colsample_bytree': [0.8, 1.0],  # Subsample ratio of columns for each tree
    'classifier__gamma': [0, 0.1],  # Minimum loss reduction to make a split
    'classifier__min_child_weight': [1, 3],  # Minimum sum of instance weight needed in a child
    'classifier__reg_alpha': [0, 0.1],  # L1 regularization term
    'classifier__reg_lambda': [0.1, 1],  # L2 regularization term
    }


# Replace the classifier in the pipeline with XGBoost
pipeline.steps[-1] = ('classifier', xgb.XGBClassifier(
    tree_method="hist", 
    use_label_encoder=False, 
    eval_metric='logloss',
    random_state=1, 
    scale_pos_weight=1.23  # Apply class weighting
))

# Set up GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(pipeline, param_grid_xgb, cv=10, scoring='accuracy', n_jobs=-1)

# Fit the XGBoost model
grid_search_xgb.fit(X_train_binary, y_train_binary)

# Save the best model to a file using joblib
joblib.dump(grid_search_xgb.best_estimator_, 'best_xgb_model.pkl')

# Get predictions on the test set
y_pred_xgb = grid_search_xgb.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_xgb = confusion_matrix(y_test_binary, y_pred_xgb, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_xgb = conf_matrix_xgb[0, 0] / conf_matrix_xgb[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_xgb = conf_matrix_xgb[1, 1] / conf_matrix_xgb[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_xgb = grid_search_xgb.best_params_
print(f"Best parameters from GridSearchCV (XGBoost): {best_params_grid_xgb}")
print(f"Best score from GridSearchCV (XGBoost): {grid_search_xgb.best_score_:.4f}")

# Evaluate the XGBoost model on the test set
test_accuracy_xgb = grid_search_xgb.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (XGBoost): {test_accuracy_xgb:.4f}")

# Initialize results_df_win_loss if it doesn't exist
if 'results_df_win_notwin' not in locals():
    results_df_win_notwin = pd.DataFrame(columns=["Model", "Best Score", "Best Params", "Win Accuracy", "Loss Accuracy", "Test Accuracy"])

# Check if the current XGBoost test accuracy is better than the previous best for XGBoost
if "XGBoost - Win/Loss" not in results_df_win_notwin["Model"].values or test_accuracy_xgb > results_df_win_notwin[results_df_win_notwin["Model"] == "XGBoost - Win/Loss"]["Test Accuracy"].values[0]:

    # Append the XGBoost results to the existing results DataFrame
    new_row_xgb = pd.DataFrame({
        "Model": ["XGBoost - Win/Loss"],
        "Best Score": [grid_search_xgb.best_score_],
        "Best Params": [best_params_grid_xgb],
        "Win Accuracy": [win_accuracy_xgb],
        "Loss Accuracy": [loss_accuracy_xgb],
        "Test Accuracy": [test_accuracy_xgb]
    })

    # Append the new row with XGBoost results to the existing DataFrame
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_xgb], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    results_df_win_notwin.to_csv("win_notwin_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin


Best parameters from GridSearchCV (XGBoost): {'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 6, 'classifier__min_child_weight': 3, 'classifier__n_estimators': 100, 'classifier__reg_alpha': 0, 'classifier__reg_lambda': 1, 'classifier__subsample': 0.8}
Best score from GridSearchCV (XGBoost): 0.5834
Test set accuracy (XGBoost): 0.6667


Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667


#### 4.2 Random Forest

In [56]:
# Define a broader parameter grid for Random Forest
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300, 400],  # Number of trees
    'classifier__max_depth': [10, 15, 20],  # Maximum depth of trees
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'classifier__max_features': ['sqrt', 'log2'],  # Only valid options for max_features
    'classifier__bootstrap': [True, False]  # Whether bootstrap samples are used
}

# Replace the classifier in the pipeline with RandomForestClassifier
pipeline.steps[-1] = ('classifier', RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=1))

# Set up GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(pipeline, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the Random Forest model
grid_search_rf.fit(X_train_binary, y_train_binary)

# Save the best Random Forest model to a file using joblib
joblib.dump(grid_search_rf.best_estimator_, 'best_rf_model.pkl')

# Get predictions on the test set
y_pred_rf = grid_search_rf.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_rf = confusion_matrix(y_test_binary, y_pred_rf, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_rf = conf_matrix_rf[0, 0] / conf_matrix_rf[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_rf = conf_matrix_rf[1, 1] / conf_matrix_rf[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_rf = grid_search_rf.best_params_
print(f"Best parameters from GridSearchCV (Random Forest): {best_params_grid_rf}")
print(f"Best score from GridSearchCV (Random Forest): {grid_search_rf.best_score_:.4f}")

# Evaluate the Random Forest model on the test set
test_accuracy_rf = grid_search_rf.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (Random Forest): {test_accuracy_rf:.4f}")

# Check if the current Random Forest test accuracy is better than the previous best for Random Forest
if "Random Forest - Win/Loss" not in results_df_win_notwin["Model"].values or test_accuracy_rf > results_df_win_notwin[results_df_win_notwin["Model"] == "Random Forest - Win/Loss"]["Test Accuracy"].values[0]:

    # Append the Random Forest results to the existing results DataFrame
    new_row_rf = pd.DataFrame({
        "Model": ["Random Forest - Win/Loss"],
        "Best Score": [grid_search_rf.best_score_],
        "Best Params": [best_params_grid_rf],
        "Win Accuracy": [win_accuracy_rf],
        "Loss Accuracy": [loss_accuracy_rf],
        "Test Accuracy": [test_accuracy_rf]
    })

    # Append the new row with Random Forest results to the existing DataFrame
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_rf], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    results_df_win_notwin.to_csv("win_loss_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin




Best parameters from GridSearchCV (Random Forest): {'classifier__bootstrap': True, 'classifier__max_depth': 20, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best score from GridSearchCV (Random Forest): 0.6410
Test set accuracy (Random Forest): 0.6481


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148


#### 4.3 Logistic Regression

In [57]:


param_grid_lr = {
    'classifier__C': [0.01],  
    'classifier__penalty': ['l2'],
    'classifier__solver': ['liblinear'],  
    'classifier__max_iter': [100], 
    'classifier__l1_ratio': [0.1, 0.5, 0.9], 
    'classifier__class_weight': [None, 'balanced']
}

# Replace the classifier in the pipeline with LogisticRegression
pipeline.steps[-1] = ('classifier', LogisticRegression(n_jobs=-1, random_state=1))

# Set up GridSearchCV for Logistic Regression
grid_search_lr = GridSearchCV(pipeline, param_grid_lr, cv=10, scoring='accuracy', n_jobs=-1, error_score='raise')

# Fit the Logistic Regression model
grid_search_lr.fit(X_train_binary, y_train_binary)

# Save the best Logistic Regression model to a file using joblib
joblib.dump(grid_search_lr.best_estimator_, 'best_lr_model.pkl')

# Get predictions on the test set
y_pred_lr = grid_search_lr.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_lr = confusion_matrix(y_test_binary, y_pred_lr, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_lr = conf_matrix_lr[0, 0] / conf_matrix_lr[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_lr = conf_matrix_lr[1, 1] / conf_matrix_lr[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_lr = grid_search_lr.best_params_
print(f"Best parameters from GridSearchCV (Logistic Regression): {best_params_grid_lr}")
print(f"Best score from GridSearchCV (Logistic Regression): {grid_search_lr.best_score_:.4f}")

# Evaluate the Logistic Regression model on the test set
test_accuracy_lr = grid_search_lr.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (Logistic Regression): {test_accuracy_lr:.4f}")

# Check if the current Logistic Regression test accuracy is better than the previous best for Logistic Regression
if "Logistic Regression - Win/Loss" not in results_df_win_notwin["Model"].values or test_accuracy_lr > results_df_win_notwin[results_df_win_notwin["Model"] == "Logistic Regression - Win/Loss"]["Test Accuracy"].values[0]:
    
    # Append the Logistic Regression results to the existing results DataFrame
    new_row_lr = pd.DataFrame({
        "Model": ["Logistic Regression - Win/Loss"],
        "Best Score": [grid_search_lr.best_score_],
        "Best Params": [best_params_grid_lr],
        "Win Accuracy": [win_accuracy_lr],
        "Loss Accuracy": [loss_accuracy_lr],
        "Test Accuracy": [test_accuracy_lr]
    })

    # Append the new row with Logistic Regression results to the existing DataFrame
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_lr], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    results_df_win_notwin.to_csv("win_loss_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin




Best parameters from GridSearchCV (Logistic Regression): {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best score from GridSearchCV (Logistic Regression): 0.6442
Test set accuracy (Logistic Regression): 0.6790




Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148
3,Logistic Regression - Win/Loss,0.644166,"{'classifier__C': 0.01, 'classifier__class_wei...",0.6,0.73913,0.679012


#### 4.4 K-nearest Neighbors

In [58]:



param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 15, 19, 25, 27],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan'],  
    'classifier__p': [1, 2]  
}

# Replace the classifier in the pipeline with KNeighborsClassifier
pipeline.steps[-1] = ('classifier', KNeighborsClassifier(n_jobs=-1))

# Set up GridSearchCV for K-Nearest Neighbors
grid_search_knn = GridSearchCV(pipeline, param_grid_knn, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the KNN model
grid_search_knn.fit(X_train_binary, y_train_binary)

# Save the best KNN model to a file using joblib
joblib.dump(grid_search_knn.best_estimator_, 'best_knn_model.pkl')

# Get predictions on the test set
y_pred_knn = grid_search_knn.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_knn = confusion_matrix(y_test_binary, y_pred_knn, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_knn = conf_matrix_knn[0, 0] / conf_matrix_knn[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_knn = conf_matrix_knn[1, 1] / conf_matrix_knn[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_knn = grid_search_knn.best_params_
print(f"Best parameters from GridSearchCV (KNN): {best_params_grid_knn}")
print(f"Best score from GridSearchCV (KNN): {grid_search_knn.best_score_:.4f}")

# Evaluate the KNN model on the test set
test_accuracy_knn = grid_search_knn.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (KNN): {test_accuracy_knn:.4f}")

# Check if KNN model results already exist and compare test accuracies
if "K-Nearest Neighbors - Win/Loss" not in results_df_win_notwin["Model"].values:
    # No previous KNN entry exists, so add it
    new_row_knn = pd.DataFrame({
        "Model": ["K-Nearest Neighbors - Win/Loss"],
        "Best Score": [grid_search_knn.best_score_],
        "Best Params": [best_params_grid_knn],
        "Win Accuracy": [win_accuracy_knn],
        "Loss Accuracy": [loss_accuracy_knn],
        "Test Accuracy": [test_accuracy_knn]
    })
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_knn], ignore_index=True)

else:
    # Previous KNN entry exists, compare and replace if new test accuracy is better
    previous_test_accuracy = results_df_win_notwin[results_df_win_notwin["Model"] == "K-Nearest Neighbors - Win/Loss"]["Test Accuracy"].values[0]
    
    if test_accuracy_knn > previous_test_accuracy:
        # Remove the old KNN entry
        results_df_win_notwin = results_df_win_notwin[results_df_win_notwin["Model"] != "K-Nearest Neighbors - Win/Loss"]
        
        # Append the new KNN results to the DataFrame
        new_row_knn = pd.DataFrame({
            "Model": ["K-Nearest Neighbors - Win/Loss"],
            "Best Score": [grid_search_knn.best_score_],
            "Best Params": [best_params_grid_knn],
            "Win Accuracy": [win_accuracy_knn],
            "Loss Accuracy": [loss_accuracy_knn],
            "Test Accuracy": [test_accuracy_knn]
        })
        results_df_win_loss = pd.concat([results_df_win_notwin, new_row_knn], ignore_index=True)

# Save the updated DataFrame to the CSV file
results_df_win_notwin.to_csv("win_loss_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin



Best parameters from GridSearchCV (KNN): {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 25, 'classifier__p': 1, 'classifier__weights': 'distance'}
Best score from GridSearchCV (KNN): 0.6282
Test set accuracy (KNN): 0.6389


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148
3,Logistic Regression - Win/Loss,0.644166,"{'classifier__C': 0.01, 'classifier__class_wei...",0.6,0.73913,0.679012
4,K-Nearest Neighbors - Win/Loss,0.628189,"{'classifier__metric': 'manhattan', 'classifie...",0.564286,0.695652,0.638889


#### 4.5 LGBM

In [59]:
# Further narrowed parameter grid for LGBM for faster execution (~a few minutes)
param_grid_lgbm = {
    'classifier__n_estimators': [150, 200],  # Close to 200
    'classifier__max_depth': [10, 15],  # Focus around 15
    'classifier__learning_rate': [0.01],  # Fix at 0.01 since it's already a good hit
    'classifier__num_leaves': [31],  # Fix at 31 
    'classifier__subsample': [0.8],  # Fix at 0.8
    'classifier__colsample_bytree': [0.8],  # Fix at 0.8
    'classifier__min_child_samples': [30],  # Fix at 30 
    'classifier__reg_alpha': [0.5],  # Fix at 0.5 
    'classifier__reg_lambda': [0.5]  # Fix at 0.5 
}

# Replace the classifier in the pipeline with LGBMClassifier
pipeline.steps[-1] = ('classifier', LGBMClassifier(n_jobs=-1, random_state=42))

# Set up GridSearchCV for LGBM
grid_search_lgbm = GridSearchCV(pipeline, param_grid_lgbm, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the LGBM model
grid_search_lgbm.fit(X_train_binary, y_train_binary)

# Save the best LGBM model to a file using joblib
joblib.dump(grid_search_lgbm.best_estimator_, 'best_lgbm_model.pkl')

# Get predictions on the test set
y_pred_lgbm = grid_search_lgbm.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_lgbm = confusion_matrix(y_test_binary, y_pred_lgbm, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_lgbm = conf_matrix_lgbm[0, 0] / conf_matrix_lgbm[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_lgbm = conf_matrix_lgbm[1, 1] / conf_matrix_lgbm[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_lgbm = grid_search_lgbm.best_params_
print(f"Best parameters from GridSearchCV (LGBM): {best_params_grid_lgbm}")
print(f"Best score from GridSearchCV (LGBM): {grid_search_lgbm.best_score_:.4f}")

# Evaluate the LGBM model on the test set
test_accuracy_lgbm = grid_search_lgbm.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (LGBM): {test_accuracy_lgbm:.4f}")

# Check if the current LGBM test accuracy is better than the previous best for LGBM
if "LGBM - Win/Loss" not in results_df_win_notwin["Model"].values:
    # No previous LGBM entry exists, so add it
    new_row_lgbm = pd.DataFrame({
        "Model": ["LGBM - Win/Loss"],
        "Best Score": [grid_search_lgbm.best_score_],
        "Best Params": [best_params_grid_lgbm],
        "Win Accuracy": [win_accuracy_lgbm],
        "Loss Accuracy": [loss_accuracy_lgbm],
        "Test Accuracy": [test_accuracy_lgbm]
    })
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_lgbm], ignore_index=True)

else:
    # Previous LGBM entry exists, compare and replace if new test accuracy is better
    previous_test_accuracy = results_df_win_notwin[results_df_win_notwin["Model"] == "LGBM - Win/Loss"]["Test Accuracy"].values[0]
    
    if test_accuracy_lgbm > previous_test_accuracy:
        # Remove the old LGBM entry
        results_df_win_notwin = results_df_win_notwin[results_df_win_notwin["Model"] != "LGBM - Win/Loss"]
        
        # Append the new LGBM results to the DataFrame
        new_row_lgbm = pd.DataFrame({
            "Model": ["LGBM - Win/Loss"],
            "Best Score": [grid_search_lgbm.best_score_],
            "Best Params": [best_params_grid_lgbm],
            "Win Accuracy": [win_accuracy_lgbm],
            "Loss Accuracy": [loss_accuracy_lgbm],
            "Test Accuracy": [test_accuracy_lgbm]
        })
        results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_lgbm], ignore_index=True)

# Save the updated DataFrame to the CSV file
results_df_win_notwin.to_csv("win_loss_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin


[LightGBM] [Info] Number of positive: 426, number of negative: 510
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 919
[LightGBM] [Info] Number of data points in the train set: 936, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.455128 -> initscore=-0.179971
[LightGBM] [Info] Start training from score -0.179971
Best parameters from GridSearchCV (LGBM): {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__min_child_samples': 30, 'classifier__n_estimators': 200, 'classifier__num_leaves': 31, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__subsample': 0.8}
Best score from GridSearchCV (LGBM): 0.6143
Test set accuracy (LGBM): 0.6605


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148
3,Logistic Regression - Win/Loss,0.644166,"{'classifier__C': 0.01, 'classifier__class_wei...",0.6,0.73913,0.679012
4,K-Nearest Neighbors - Win/Loss,0.628189,"{'classifier__metric': 'manhattan', 'classifie...",0.564286,0.695652,0.638889
5,LGBM - Win/Loss,0.614316,"{'classifier__colsample_bytree': 0.8, 'classif...",0.614286,0.695652,0.660494


#### 4.6 AdaBoost

In [60]:
import joblib

# Define a parameter grid for AdaBoost to run for ~30 minutes
param_grid_adaboost = {
    'classifier__n_estimators': [50, 100, 200, 500],  # Number of trees
    'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],  # Learning rate
    'classifier__algorithm': ['SAMME', 'SAMME.R']  # Boosting algorithms
}

# Replace the classifier in the pipeline with AdaBoostClassifier
pipeline.steps[-1] = ('classifier', AdaBoostClassifier(random_state=42))

# Set up GridSearchCV for AdaBoost
grid_search_adaboost = GridSearchCV(pipeline, param_grid_adaboost, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the AdaBoost model
grid_search_adaboost.fit(X_train_binary, y_train_binary)

# Save the best AdaBoost model to a file using joblib
joblib.dump(grid_search_adaboost.best_estimator_, 'best_adaboost_model.pkl')

# Get predictions on the test set
y_pred_adaboost = grid_search_adaboost.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_adaboost = confusion_matrix(y_test_binary, y_pred_adaboost, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_adaboost = conf_matrix_adaboost[0, 0] / conf_matrix_adaboost[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_adaboost = conf_matrix_adaboost[1, 1] / conf_matrix_adaboost[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_adaboost = grid_search_adaboost.best_params_
print(f"Best parameters from GridSearchCV (AdaBoost): {best_params_grid_adaboost}")
print(f"Best score from GridSearchCV (AdaBoost): {grid_search_adaboost.best_score_:.4f}")

# Evaluate the AdaBoost model on the test set
test_accuracy_adaboost = grid_search_adaboost.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (AdaBoost): {test_accuracy_adaboost:.4f}")

# Initialize results_df_win_notwin if it doesn't exist
if 'results_df_win_notwin' not in locals():
    results_df_win_notwin = pd.DataFrame(columns=["Model", "Best Score", "Best Params", "Win Accuracy", "Loss Accuracy", "Test Accuracy"])

# Check if the current AdaBoost test accuracy is better than the previous best for AdaBoost
if "AdaBoost - Win/Loss" not in results_df_win_notwin["Model"].values or test_accuracy_adaboost > results_df_win_notwin[results_df_win_notwin["Model"] == "AdaBoost - Win/Loss"]["Test Accuracy"].values[0]:

    # Append the AdaBoost results to the existing results DataFrame
    new_row_adaboost = pd.DataFrame({
        "Model": ["AdaBoost - Win/Loss"],
        "Best Score": [grid_search_adaboost.best_score_],
        "Best Params": [best_params_grid_adaboost],
        "Win Accuracy": [win_accuracy_adaboost],
        "Loss Accuracy": [loss_accuracy_adaboost],
        "Test Accuracy": [test_accuracy_adaboost]
    })

    # Append the new row with AdaBoost results to the existing DataFrame
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_adaboost], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    results_df_win_notwin.to_csv("win_loss_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin




Best parameters from GridSearchCV (AdaBoost): {'classifier__algorithm': 'SAMME.R', 'classifier__learning_rate': 0.01, 'classifier__n_estimators': 200}
Best score from GridSearchCV (AdaBoost): 0.6335
Test set accuracy (AdaBoost): 0.6759


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148
3,Logistic Regression - Win/Loss,0.644166,"{'classifier__C': 0.01, 'classifier__class_wei...",0.6,0.73913,0.679012
4,K-Nearest Neighbors - Win/Loss,0.628189,"{'classifier__metric': 'manhattan', 'classifie...",0.564286,0.695652,0.638889
5,LGBM - Win/Loss,0.614316,"{'classifier__colsample_bytree': 0.8, 'classif...",0.614286,0.695652,0.660494
6,AdaBoost - Win/Loss,0.633513,"{'classifier__algorithm': 'SAMME.R', 'classifi...",0.457143,0.842391,0.675926


#### 4.7 Gradient Boosting

In [66]:




param_grid_gb = {
    'classifier__n_estimators': [100, 150, 200],  # Number of boosting stages
    'classifier__max_depth': [3, 6, 8],  # Maximum depth of individual trees
    'classifier__learning_rate': [0.01, 0.1],  # Shrinks contribution of each tree
    'classifier__subsample': [0.8, 1.0],  # Fraction of samples to be used for fitting individual base learners
    'classifier__min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'classifier__min_samples_leaf': [1, 3],  # Minimum number of samples required to be at a leaf node
    'classifier__max_features': ['sqrt', 'log2'],  # The number of features to consider when looking for the best split
}

# Replace the classifier in the pipeline with GradientBoostingClassifier
pipeline.steps[-1] = ('classifier', GradientBoostingClassifier(random_state=1))

# Set up GridSearchCV for Gradient Boosting
grid_search_gb = GridSearchCV(pipeline, param_grid_gb, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the Gradient Boosting model
grid_search_gb.fit(X_train_binary, y_train_binary)

# Save the best model to a file using joblib
joblib.dump(grid_search_gb.best_estimator_, 'best_gb_model.pkl')

# Get predictions on the test set
y_pred_gb = grid_search_gb.predict(X_test_binary)

# Confusion matrix for binary classification (1 = Win, 0 = Loss)
conf_matrix_gb = confusion_matrix(y_test_binary, y_pred_gb, labels=[0, 1])

# Calculate accuracy for Win and Loss classes
loss_accuracy_gb = conf_matrix_gb[0, 0] / conf_matrix_gb[0].sum()  # Accuracy for 'Loss' (0)
win_accuracy_gb = conf_matrix_gb[1, 1] / conf_matrix_gb[1].sum()  # Accuracy for 'Win' (1)

# Best parameters and score from GridSearchCV
best_params_grid_gb = grid_search_gb.best_params_
print(f"Best parameters from GridSearchCV (Gradient Boosting): {best_params_grid_gb}")
print(f"Best score from GridSearchCV (Gradient Boosting): {grid_search_gb.best_score_:.4f}")

# Evaluate the Gradient Boosting model on the test set
test_accuracy_gb = grid_search_gb.score(X_test_binary, y_test_binary)
print(f"Test set accuracy (Gradient Boosting): {test_accuracy_gb:.4f}")

# Initialize results_df_win_loss if it doesn't exist
if 'results_df_win_notwin' not in locals():
    results_df_win_notwin = pd.DataFrame(columns=["Model", "Best Score", "Best Params", "Win Accuracy", "Loss Accuracy", "Test Accuracy"])

# Check if the current Gradient Boosting test accuracy is better than the previous best for Gradient Boosting
if "Gradient Boosting - Win/Loss" not in results_df_win_notwin["Model"].values or test_accuracy_gb > results_df_win_notwin[results_df_win_notwin["Model"] == "Gradient Boosting - Win/Loss"]["Test Accuracy"].values[0]:

    # Append the Gradient Boosting results to the existing results DataFrame
    new_row_gb = pd.DataFrame({
        "Model": ["Gradient Boosting - Win/Loss"],
        "Best Score": [grid_search_gb.best_score_],
        "Best Params": [best_params_grid_gb],
        "Win Accuracy": [win_accuracy_gb],
        "Loss Accuracy": [loss_accuracy_gb],
        "Test Accuracy": [test_accuracy_gb]
    })

    # Append the new row with Gradient Boosting results to the existing DataFrame
    results_df_win_notwin = pd.concat([results_df_win_notwin, new_row_gb], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    results_df_win_notwin.to_csv("win_notwin_tuning_results.csv", index=False)

# Display the updated results DataFrame
results_df_win_notwin


Best parameters from GridSearchCV (Gradient Boosting): {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
Best score from GridSearchCV (Gradient Boosting): 0.6378
Test set accuracy (Gradient Boosting): 0.6605


  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,Model,Best Score,Best Params,Win Accuracy,Loss Accuracy,Test Accuracy
0,XGBoost - Win/Loss,0.615349,"{'classifier__colsample_bytree': 0.8, 'classif...",0.642857,0.657609,0.651235
1,XGBoost - Win/Loss,0.583391,"{'classifier__colsample_bytree': 0.8, 'classif...",0.65,0.679348,0.666667
2,Random Forest - Win/Loss,0.640983,"{'classifier__bootstrap': True, 'classifier__m...",0.578571,0.701087,0.648148
3,Logistic Regression - Win/Loss,0.644166,"{'classifier__C': 0.01, 'classifier__class_wei...",0.6,0.73913,0.679012
4,K-Nearest Neighbors - Win/Loss,0.628189,"{'classifier__metric': 'manhattan', 'classifie...",0.564286,0.695652,0.638889
5,LGBM - Win/Loss,0.614316,"{'classifier__colsample_bytree': 0.8, 'classif...",0.614286,0.695652,0.660494
6,AdaBoost - Win/Loss,0.633513,"{'classifier__algorithm': 'SAMME.R', 'classifi...",0.457143,0.842391,0.675926
7,Gradient Boosting - Win/Loss,0.61113,"{'classifier__learning_rate': 0.01, 'classifie...",0.421429,0.842391,0.660494


### I've tested few models and i'll go with XGB or Logistic Regression for my GameApp idea.

### 5. Checking feature importance

#### 5.1 XGBoost

In [61]:
# Get the best XGBoost model from GridSearchCV
best_xgb_model = grid_search_xgb.best_estimator_

# Access feature importances from the XGBoost classifier
importances = best_xgb_model.named_steps['classifier'].feature_importances_

# Access the fitted preprocessor from the pipeline to get feature names
preprocessor = best_xgb_model.named_steps['preprocessor']

# Get the feature names after fitting the preprocessor
feature_names = preprocessor.get_feature_names_out()

# Combine feature names with their importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort features by importance (highest to lowest)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top features
feature_importance_df.head(30)  # Show the top 10 features


Unnamed: 0,Feature,Importance
0,num__team_overall,0.064634
4,num__opponent_overall,0.046376
3,num__team_defense,0.045909
1,num__team_attack,0.043411
50,cat__away_team_formation_3-1-4-2,0.040101
7,num__opponent_defense,0.032867
5,num__opponent_attack,0.027755
2,num__team_midfield,0.026232
36,cat__home_team_formation_3-5-2,0.024089
32,cat__home_team_formation_3-4-1-2,0.024059


#### 5.2 Logistic Regression

In [63]:
# Get the best Logistic Regression model from GridSearchCV
best_lr_model = grid_search_lr.best_estimator_

# Access coefficients from the Logistic Regression classifier
coefficients = best_lr_model.named_steps['classifier'].coef_[0]  # For binary classification, only one set of coefficients

# Access the fitted preprocessor from the pipeline to get feature names
preprocessor = best_lr_model.named_steps['preprocessor']

# Get the feature names after fitting the preprocessor
feature_names = preprocessor.get_feature_names_out()

# Combine feature names with their coefficients (importances)
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort features by absolute value of coefficients (highest to lowest)
feature_importance_df['Abs_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Abs_Coefficient', ascending=False)

# Display the top features
feature_importance_df.head(30)  # Show the top 10 most important features


Unnamed: 0,Feature,Coefficient,Abs_Coefficient
3,num__team_defense,0.152889,0.152889
1,num__team_attack,0.146626,0.146626
0,num__team_overall,0.113595,0.113595
4,num__opponent_overall,-0.099848,0.099848
7,num__opponent_defense,-0.088748,0.088748
18,num__opponent_xga_last_4_games,0.081919,0.081919
5,num__opponent_attack,-0.081898,0.081898
14,num__sot_last_4_games,-0.081675,0.081675
13,num__sh_last_4_games,0.080247,0.080247
6,num__opponent_midfield,-0.078596,0.078596
