https://www.kaggle.com/datasets/saife245/english-premier-league?resource=download&select=final_dataset.csv

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv('final_dataset.csv')

# Selecting and renaming columns
data = data[['FTR','HTGD','ATGD','HTP','ATP','DiffFormPts','DiffPts','HM1', 'HM2','HM3','HM4','HM5','AM1','AM2','AM3','AM4','AM5']]
data.columns = ['Result','HTGD','ATGD','HTP','ATP','DiffFormPts','DiffPts','H1', 'H2','H3','H4','H5','A1','A2','A3','A4','A5']

# Preparing target variable
y = pd.get_dummies(data["Result"])['H']

# Feature engineering
X = data.drop(['Result'], axis=1)
X = pd.get_dummies(X)  # Convert categorical variables into dummy/indicator variables

# Splitting the dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Initializing models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=0)
}

# Training and evaluating models
for name, model in models.items():
    # For XGBoost, performing a simple hyperparameter tuning
    if name == 'XGBoost':
        param_grid = {'n_estimators': [100], 'learning_rate': [0.1], 'max_depth': [3]}
        model = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    y_pred_val = model.predict(X_val_scaled)
    
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_val = accuracy_score(y_val, y_pred_val)
    
    loss_train = log_loss(y_train, y_pred_train)
    loss_val = log_loss(y_val, y_pred_val)
    
    print(f"{name} - Training Accuracy: {acc_train:.4f}, Validation Accuracy: {acc_val:.4f}")
    print(f"{name} - Training Loss: {loss_train:.4f}, Validation Loss: {loss_val:.4f}\n")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Random Forest - Training Accuracy: 0.9799, Validation Accuracy: 0.6067
Random Forest - Training Loss: 0.7246, Validation Loss: 14.1751

XGBoost - Training Accuracy: 0.6811, Validation Accuracy: 0.6374
XGBoost - Training Loss: 11.4942, Validation Loss: 13.0685

Gradient Boosting - Training Accuracy: 0.6901, Validation Accuracy: 0.6374
Gradient Boosting - Training Loss: 11.1714, Validation Loss: 13.0685

Logistic Regression - Training Accuracy: 0.6462, Validation Accuracy: 0.6520
Logistic Regression - Training Loss: 12.7523, Validation Loss: 12.5415

Decision Tree - Training Accuracy: 0.9799, Validation Accuracy: 0.5570
Decision Tree - Training Loss: 0.7246, Validation Loss: 15.9667

