In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle

print("Initiating Draw-Finder model training...")

df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['is_draw'] = np.where(df_final['FTR'] == 'D', 1, 0)


form_feature_names = [col for col in df_final.columns if 'form' in col]
h2h_feature_names = [col for col in df_final.columns if 'H2H' in col]
odds_feature_names = [col for col in df_final.columns if 'Avg_Odds' in col]
final_feature_list = ['HomeTeam', 'AwayTeam'] + odds_feature_names + form_feature_names + h2h_feature_names

X = df_final[final_feature_list].copy()
y = df_final['is_draw']

# --- 2. Fit and Save Encoders ---
home_team_encoder_draw = LabelEncoder()
away_team_encoder_draw = LabelEncoder()

X['HomeTeam'] = home_team_encoder_draw.fit_transform(X['HomeTeam'])
X['AwayTeam'] = away_team_encoder_draw.fit_transform(X['AwayTeam'])

print("\nSaving the label encoders for draw model...")
encoders_to_save_draw = {
    'home_team': home_team_encoder_draw,
    'away_team': away_team_encoder_draw,
}
with open('draw_encoders.pkl', 'wb') as file:
    pickle.dump(encoders_to_save_draw, file)
print("Encoders saved successfully to 'draw_encoders.pkl'")


# --- 3. Split and SMOTE ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Applying SMOTE to the training data...")
sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("SMOTE complete.")
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2]
}

scale_pos_weight = np.sum(y_train_res == 0) / np.sum(y_train_res == 1)

# Create the XGBoost model instance
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

# Create the GridSearchCV object
# We use 'f1' as the scoring metric to optimize for a balance of precision and recall on the 'Draw' class
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=3, # 3-fold cross-validation
    verbose=1,
    n_jobs=-1 # Use all available CPU cores
)

print("Starting the hyperparameter search... This may take some time.")
grid_search.fit(X_train_res, y_train_res)

print("\nHyperparameter search complete.")
print("Best parameters found: ", grid_search.best_params_)

# --- 4. Evaluate the BEST Model ---
print("\nEvaluating the best model found by the search...")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# --- 5. Save Model ---
print("\nSaving the Draw-Finder model...")
with open('draw_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
print("Model saved successfully as 'draw_model.pkl'")
print("\n--- Training and saving complete! ---")



Initiating Draw-Finder model training...

Saving the label encoders for draw model...
Encoders saved successfully to 'draw_encoders.pkl'
Applying SMOTE to the training data...
SMOTE complete.
Starting the hyperparameter search... This may take some time.
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Hyperparameter search complete.
Best parameters found:  {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}

Evaluating the best model found by the search...

Saving the Draw-Finder model...
Model saved successfully as 'draw_model.pkl'

--- Training and saving complete! ---


In [5]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

# --- 1. Prepare Data ---
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
form_feature_names = [col for col in df_final.columns if 'form' in col]
h2h_feature_names = [col for col in df_final.columns if 'H2H' in col]
odds_feature_names = [col for col in df_final.columns if 'Avg_Odds' in col]
final_feature_list = ['HomeTeam', 'AwayTeam'] + odds_feature_names + form_feature_names + h2h_feature_names

# Filter the original dataset to only include non-draws
non_draw_df = df_final[df_final['FTR'] != 'D'].copy()
X_non_draw = non_draw_df[final_feature_list].copy()
y_non_draw = non_draw_df['FTR']

# --- 2. Encode and Save Encoders ---
home_team_encoder_wl = LabelEncoder()
away_team_encoder_wl = LabelEncoder()
y_encoder_wl = LabelEncoder() # This is the crucial addition

X_non_draw['HomeTeam'] = home_team_encoder_wl.fit_transform(X_non_draw['HomeTeam'])
X_non_draw['AwayTeam'] = away_team_encoder_wl.fit_transform(X_non_draw['AwayTeam'])
y_non_draw_encoded = y_encoder_wl.fit_transform(y_non_draw) # H=1, A=0

# Save all encoders in a dictionary
win_loss_encoders = {
    'home_team': home_team_encoder_wl,
    'away_team': away_team_encoder_wl,
    'y_encoder_wl': y_encoder_wl  # THE FIX IS HERE!
}
with open('win_loss_encoders.pkl', 'wb') as file:
    pickle.dump(win_loss_encoders, file)
print("Saved win/loss team and target encoders to 'win_loss_encoders.pkl'")

# --- 3. Train the Model ---
print("\nTraining Winner-Picker model...")
winner_picker_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
winner_picker_model.fit(X_non_draw, y_non_draw_encoded)
print("Winner-Picker model training complete.")

# --- 4. Save the Model ---
with open('win_lose_model.pkl', 'wb') as file:
    pickle.dump(winner_picker_model, file)
print("Saved final Winner-Picker model to 'win_lose_model.pkl'")


Saved win/loss team and target encoders to 'win_loss_encoders.pkl'

Training Winner-Picker model...
Winner-Picker model training complete.
Saved final Winner-Picker model to 'win_lose_model.pkl'
