THIS IS A MATCH PREDICTION MODEL DEPLOYED TO PREDICT BEFORE THE MATCH STARTS

In [1]:
import pandas as pd
import numpy as np
import glob
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support


In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

print("Initiating Hyperparameter Tuning for 'Draw-Finder'...")

# --- 1. Load and Prepare Data (as before) ---
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['is_draw'] = np.where(df_final['FTR'] == 'D', 1, 0)

form_feature_names = [col for col in df_final.columns if 'form' in col]
h2h_feature_names = [col for col in df_final.columns if 'H2H' in col]
odds_feature_names = [col for col in df_final.columns if 'Avg_Odds' in col]
final_feature_list = ['HomeTeam', 'AwayTeam','HTHG','HTAG','HS','AS','AST','HST','HC','AC','HY','AY','HR','AR','HF','AF'] + odds_feature_names + form_feature_names + h2h_feature_names

X = df_final[final_feature_list].copy()
y = df_final['is_draw']

X['HomeTeam'] = LabelEncoder().fit_transform(X['HomeTeam'])
X['AwayTeam'] = LabelEncoder().fit_transform(X['AwayTeam'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- 2. Apply SMOTE (as before) ---
print("Applying SMOTE to the training data...")
sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("SMOTE complete. New training set size:", Counter(y_train_res))

# --- 3. Hyperparameter Tuning with GridSearchCV ---
print("\nSetting up GridSearchCV for XGBoost...")
# =cooment
# Define the grid of hyperparameters to search
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2]
}

# We still use scale_pos_weight for imbalanced data
scale_pos_weight = np.sum(y_train_res == 0) / np.sum(y_train_res == 1)

# Create the XGBoost model instance
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

# Create the GridSearchCV object
# We use 'f1' as the scoring metric to optimize for a balance of precision and recall on the 'Draw' class
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=3, # 3-fold cross-validation
    verbose=1,
    n_jobs=-1 # Use all available CPU cores
)

print("Starting the hyperparameter search... This may take some time.")
grid_search.fit(X_train_res, y_train_res)

print("\nHyperparameter search complete.")
print("Best parameters found: ", grid_search.best_params_)

# --- 4. Evaluate the BEST Model ---
print("\nEvaluating the best model found by the search...")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n-------------------------------------------------")
print(f"Tuned Draw-Finder Model Test Accuracy: {accuracy*100:.2f}%")
print(f"-------------------------------------------------")
print("\nTuned Draw-Finder Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not-Draw (0)', 'Draw (1)']))

Initiating Hyperparameter Tuning for 'Draw-Finder'...
Applying SMOTE to the training data...
SMOTE complete. New training set size: Counter({1: 5572, 0: 5572})

Setting up GridSearchCV for XGBoost...
Starting the hyperparameter search... This may take some time.
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Hyperparameter search complete.
Best parameters found:  {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}

Evaluating the best model found by the search...

-------------------------------------------------
Tuned Draw-Finder Model Test Accuracy: 74.99%
-------------------------------------------------

Tuned Draw-Finder Classification Report:
              precision    recall  f1-score   support

Not-Draw (0)       0.77      0.96      0.85      1394
    Draw (1)       0.45      0.09      0.15       453

    accuracy                           0.75      1847
   macro avg       0.61      0.53      0.50      1847
weighted avg       0.69      0.75      0.68    

In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

print("Initiating full 'Draw Specialist' system evaluation...")

# --- 1. Load and Prepare the Full Enriched Dataset ---
df_final = pd.read_csv("full_feature_dataset1.csv")

# Create the explicit feature list
form_feature_names = [col for col in df_final.columns if 'form' in col]
h2h_feature_names = [col for col in df_final.columns if 'H2H' in col]
odds_feature_names = [col for col in df_final.columns if 'Avg_Odds' in col]
final_feature_list = ['HomeTeam', 'AwayTeam'] + odds_feature_names + form_feature_names + h2h_feature_names

X = df_final[final_feature_list].copy()
y_multi_class = df_final['FTR'] # The original 'A', 'D', 'H' labels

# Preprocess categorical features once
X['HomeTeam'] = LabelEncoder().fit_transform(X['HomeTeam'])
X['AwayTeam'] = LabelEncoder().fit_transform(X['AwayTeam'])

# --- 2. Prepare Data for Model A (Draw-Finder) ---
y_is_draw = np.where(y_multi_class == 'D', 1, 0)
X_train, X_test, y_train_is_draw, y_test_is_draw = train_test_split(X, y_is_draw, test_size=0.2, random_state=42, stratify=y_is_draw)

# --- 3. Train Model A (Draw-Finder) ---
print("\nTraining Model A (Draw-Finder)...")
scale_pos_weight_draw = np.sum(y_train_is_draw == 0) / np.sum(y_train_is_draw == 1)
draw_finder_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight_draw)
draw_finder_model.fit(X_train, y_train_is_draw)

# --- 4. Prepare Data for Model B (Winner-Picker) ---
# Filter the original dataset to only include non-draws
non_draw_df = df_final[df_final['FTR'] != 'D'].copy()
X_non_draw = non_draw_df[final_feature_list].copy()
y_non_draw = non_draw_df['FTR']

# Preprocess for the second model
X_non_draw['HomeTeam'] = LabelEncoder().fit_transform(X_non_draw['HomeTeam'])
X_non_draw['AwayTeam'] = LabelEncoder().fit_transform(X_non_draw['AwayTeam'])
y_non_draw_encoded = LabelEncoder().fit_transform(y_non_draw) # H=1, A=0

# --- 5. Train Model B (Winner-Picker) ---
print("\nTraining Model B (Winner-Picker)...")
winner_picker_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
winner_picker_model.fit(X_non_draw, y_non_draw_encoded)

# --- 6. Execute Full Pipeline on the Test Set ---
print("\nExecuting full prediction pipeline on the test set...")
# First, get predictions from the Draw-Finder
draw_preds = draw_finder_model.predict(X_test)

# Get predictions from the Winner-Picker for all test samples
# (We'll only use them if the Draw-Finder says "Not-Draw")
winner_preds_raw = winner_picker_model.predict(X_test)

# Reconstruct the final multi-class predictions
final_preds = []
for i in range(len(X_test)):
    if draw_preds[i] == 1:
        final_preds.append('D') # Draw-Finder predicts a Draw
    else:
        # Winner-Picker decides between Home and Away
        if winner_preds_raw[i] == 1: # Assuming H was encoded as 1
            final_preds.append('H')
        else:
            final_preds.append('A')

# --- 7. Final System Evaluation ---
# We need to compare our final_preds ('A','D','H') with the original multi-class test labels
y_test_multi_class = LabelEncoder().fit_transform(y_multi_class) # Ensure it's encoded for splitting
_, y_test_original_labels = train_test_split(y_multi_class, test_size=0.2, random_state=42, stratify=y_is_draw)


accuracy = accuracy_score(y_test_original_labels, final_preds)
print(f"\n-------------------------------------------------")
print(f"Final 'Draw Specialist' System Accuracy: {accuracy*100:.2f}%")
print(f"-------------------------------------------------")
print("\nFinal System Classification Report:")
print(classification_report(y_test_original_labels, final_preds))

Initiating full 'Draw Specialist' system evaluation...

Training Model A (Draw-Finder)...

Training Model B (Winner-Picker)...

Executing full prediction pipeline on the test set...

-------------------------------------------------
Final 'Draw Specialist' System Accuracy: 66.54%
-------------------------------------------------

Final System Classification Report:
              precision    recall  f1-score   support

           A       0.81      0.82      0.82       540
           D       0.29      0.25      0.27       453
           H       0.74      0.79      0.76       854

    accuracy                           0.67      1847
   macro avg       0.61      0.62      0.61      1847
weighted avg       0.65      0.67      0.66      1847

