In [3]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import classification_report

print("Initiating HISTORIAN Training for 'Draw-Finder' (Context-Rich)...")

# --- 1. Load Data ---
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)
df_final = df_final.sort_values('Date')

# --- 2. Full Feature Engineering ---
print("Generating all features (Rank and Strength)...")
# [This section contains the code to generate both Temporal League Rank and Team Strength]
def get_season(date):
    if date.month >= 8: return f"{date.year}-{date.year + 1}"
    else: return f"{date.year - 1}-{date.year}"
df_final['Season'] = df_final['Date'].apply(get_season)
ranks_home, ranks_away = [], []
points_cache = defaultdict(lambda: defaultdict(int))
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    current_season = row['Season']; home_team, away_team = row['HomeTeam'], row['AwayTeam']
    season_points = points_cache[current_season]
    standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
    rank_map = {team: rank + 1 for rank, (team, points) in enumerate(standings)}
    ranks_home.append(rank_map.get(home_team, 20)); ranks_away.append(rank_map.get(away_team, 20))
    if row['FTR'] == 'H': points_cache[current_season][home_team] += 3
    elif row['FTR'] == 'A': points_cache[current_season][away_team] += 3
    else: points_cache[current_season][home_team] += 1; points_cache[current_season][away_team] += 1
df_final['HomeTeam_League_Rank'] = ranks_home; df_final['AwayTeam_League_Rank'] = ranks_away
strength_ratings = defaultdict(lambda: 1500)
K = 30; home_strengths, away_strengths = [], []
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    home_rating, away_rating = strength_ratings[home_team], strength_ratings[away_team]
    home_strengths.append(home_rating); away_strengths.append(away_rating)
    expected_home = 1 / (1 + 10**((away_rating - home_rating) / 400))
    if row['FTR'] == 'H': actual_home = 1
    elif row['FTR'] == 'A': actual_home = 0
    else: actual_home = 0.5
    new_home_rating = home_rating + K * (actual_home - expected_home)
    new_away_rating = away_rating + K * ((1 - actual_home) - (1 - expected_home))
    strength_ratings[home_team] = new_home_rating; strength_ratings[away_team] = new_away_rating
df_final['HomeTeam_Strength'] = home_strengths; df_final['AwayTeam_Strength'] = away_strengths

# --- 3. Prepare Data for HISTORIAN Model ---
df_final['is_draw'] = np.where(df_final['FTR'] == 'D', 1, 0)
form_features = [c for c in df_final.columns if 'form' in c]
h2h_features = [c for c in df_final.columns if 'H2H' in c]
odds_features = [c for c in df_final.columns if 'Avg_Odds' in c]
# The Historian uses ALL features, including team names
final_feature_list = ['HomeTeam', 'AwayTeam'] + odds_features + form_features + h2h_features + ['HomeTeam_League_Rank', 'AwayTeam_League_Rank', 'HomeTeam_Strength', 'AwayTeam_Strength']

X = df_final[final_feature_list].copy()
y = df_final['is_draw']

home_encoder = LabelEncoder(); away_encoder = LabelEncoder()
X['HomeTeam'] = home_encoder.fit_transform(X['HomeTeam'])
X['AwayTeam'] = away_encoder.fit_transform(X['AwayTeam'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sm = SMOTE(random_state=42); X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# --- 4. Train and Save HISTORIAN Model ---
print("\nTraining HISTORIAN Draw-Finder model...")
scale_pos_weight = np.sum(y_train_res == 0) / np.sum(y_train_res == 1)
draw_model_historian = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight, use_label_encoder=False)
draw_model_historian.fit(X_train_res, y_train_res)

print("\n--- Historian Model Evaluation ---")
y_pred = draw_model_historian.predict(X_test)
print(classification_report(y_test, y_pred))

with open('draw_model_historian.pkl', 'wb') as file:
    pickle.dump(draw_model_historian, file)
print("\nHISTORIAN 'Draw-Finder' model saved successfully!")


  df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)


Initiating HISTORIAN Training for 'Draw-Finder' (Context-Rich)...
Generating all features (Rank and Strength)...


100%|██████████| 9232/9232 [00:00<00:00, 23063.86it/s]
100%|██████████| 9232/9232 [00:00<00:00, 33553.76it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training HISTORIAN Draw-Finder model...

--- Historian Model Evaluation ---
              precision    recall  f1-score   support

           0       0.76      0.94      0.84      1394
           1       0.34      0.09      0.15       453

    accuracy                           0.73      1847
   macro avg       0.55      0.52      0.49      1847
weighted avg       0.66      0.73      0.67      1847


HISTORIAN 'Draw-Finder' model saved successfully!


In [4]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import classification_report

print("Initiating HISTORIAN Training for 'Winner-Picker' (Context-Rich)...")

# --- 1. Load Data & 2. Full Feature Engineering ---
# [This section is identical to the historian draw model script]
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)
df_final = df_final.sort_values('Date')
print("Generating all features (Rank and Strength)...")
def get_season(date):
    if date.month >= 8: return f"{date.year}-{date.year + 1}"
    else: return f"{date.year - 1}-{date.year}"
df_final['Season'] = df_final['Date'].apply(get_season)
ranks_home, ranks_away = [], []
points_cache = defaultdict(lambda: defaultdict(int))
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    current_season = row['Season']; home_team, away_team = row['HomeTeam'], row['AwayTeam']
    season_points = points_cache[current_season]
    standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
    rank_map = {team: rank + 1 for rank, (team, points) in enumerate(standings)}
    ranks_home.append(rank_map.get(home_team, 20)); ranks_away.append(rank_map.get(away_team, 20))
    if row['FTR'] == 'H': points_cache[current_season][home_team] += 3
    elif row['FTR'] == 'A': points_cache[current_season][away_team] += 3
    else: points_cache[current_season][home_team] += 1; points_cache[current_season][away_team] += 1
df_final['HomeTeam_League_Rank'] = ranks_home; df_final['AwayTeam_League_Rank'] = ranks_away
strength_ratings = defaultdict(lambda: 1500)
K = 30; home_strengths, away_strengths = [], []
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    home_rating, away_rating = strength_ratings[home_team], strength_ratings[away_team]
    home_strengths.append(home_rating); away_strengths.append(away_rating)
    expected_home = 1 / (1 + 10**((away_rating - home_rating) / 400))
    if row['FTR'] == 'H': actual_home = 1
    elif row['FTR'] == 'A': actual_home = 0
    else: actual_home = 0.5
    new_home_rating = home_rating + K * (actual_home - expected_home)
    new_away_rating = away_rating + K * ((1 - actual_home) - (1 - expected_home))
    strength_ratings[home_team] = new_home_rating; strength_ratings[away_team] = new_away_rating
df_final['HomeTeam_Strength'] = home_strengths; df_final['AwayTeam_Strength'] = away_strengths


# --- 3. Prepare Data for HISTORIAN Model ---
non_draw_df = df_final[df_final['FTR'] != 'D'].copy()
form_features = [c for c in non_draw_df.columns if 'form' in c]
h2h_features = [c for c in non_draw_df.columns if 'H2H' in c]
odds_features = [c for c in non_draw_df.columns if 'Avg_Odds' in c]
final_feature_list = ['HomeTeam', 'AwayTeam'] + odds_features + form_features + h2h_features + ['HomeTeam_League_Rank', 'AwayTeam_League_Rank', 'HomeTeam_Strength', 'AwayTeam_Strength']

X = non_draw_df[final_feature_list].copy()
y = non_draw_df['FTR']

home_encoder = LabelEncoder(); away_encoder = LabelEncoder(); y_encoder = LabelEncoder()
X['HomeTeam'] = home_encoder.fit_transform(X['HomeTeam'])
X['AwayTeam'] = away_encoder.fit_transform(X['AwayTeam'])
y_encoded = y_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# --- 4. Train and Save HISTORIAN Model ---
print("\nTraining HISTORIAN Winner-Picker model...")
win_loss_model_historian = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, use_label_encoder=False)
win_loss_model_historian.fit(X_train, y_train)

print("\n--- Historian Model Evaluation ---")
y_pred = win_loss_model_historian.predict(X_test)
print(classification_report(y_test, y_pred, target_names=y_encoder.classes_))

with open('win_lose_model_historian.pkl', 'wb') as file:
    pickle.dump(win_loss_model_historian, file)
print("\nHISTORIAN 'Winner-Picker' model saved successfully!")


  df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)


Initiating HISTORIAN Training for 'Winner-Picker' (Context-Rich)...
Generating all features (Rank and Strength)...


100%|██████████| 9232/9232 [00:00<00:00, 27943.42it/s]
100%|██████████| 9232/9232 [00:00<00:00, 34936.13it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training HISTORIAN Winner-Picker model...

--- Historian Model Evaluation ---
              precision    recall  f1-score   support

           A       0.64      0.57      0.60       549
           H       0.74      0.80      0.77       845

    accuracy                           0.71      1394
   macro avg       0.69      0.68      0.68      1394
weighted avg       0.70      0.71      0.70      1394


HISTORIAN 'Winner-Picker' model saved successfully!


In [5]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import classification_report

print("Initiating STRATEGIST Training for 'Draw-Finder' (Context-Agnostic)...")

# --- 1. Load Data & 2. Full Feature Engineering ---
# [This section is identical to the historian scripts]
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)
df_final = df_final.sort_values('Date')
print("Generating all features (Rank and Strength)...")
def get_season(date):
    if date.month >= 8: return f"{date.year}-{date.year + 1}"
    else: return f"{date.year - 1}-{date.year}"
df_final['Season'] = df_final['Date'].apply(get_season)
ranks_home, ranks_away = [], []
points_cache = defaultdict(lambda: defaultdict(int))
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    current_season = row['Season']; home_team, away_team = row['HomeTeam'], row['AwayTeam']
    season_points = points_cache[current_season]
    standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
    rank_map = {team: rank + 1 for rank, (team, points) in enumerate(standings)}
    ranks_home.append(rank_map.get(home_team, 20)); ranks_away.append(rank_map.get(away_team, 20))
    if row['FTR'] == 'H': points_cache[current_season][home_team] += 3
    elif row['FTR'] == 'A': points_cache[current_season][away_team] += 3
    else: points_cache[current_season][home_team] += 1; points_cache[current_season][away_team] += 1
df_final['HomeTeam_League_Rank'] = ranks_home; df_final['AwayTeam_League_Rank'] = ranks_away
strength_ratings = defaultdict(lambda: 1500)
K = 30; home_strengths, away_strengths = [], []
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    home_rating, away_rating = strength_ratings[home_team], strength_ratings[away_team]
    home_strengths.append(home_rating); away_strengths.append(away_rating)
    expected_home = 1 / (1 + 10**((away_rating - home_rating) / 400))
    if row['FTR'] == 'H': actual_home = 1
    elif row['FTR'] == 'A': actual_home = 0
    else: actual_home = 0.5
    new_home_rating = home_rating + K * (actual_home - expected_home)
    new_away_rating = away_rating + K * ((1 - actual_home) - (1 - expected_home))
    strength_ratings[home_team] = new_home_rating; strength_ratings[away_team] = new_away_rating
df_final['HomeTeam_Strength'] = home_strengths; df_final['AwayTeam_Strength'] = away_strengths


# --- 3. Prepare Data for STRATEGIST Model ---
df_final['is_draw'] = np.where(df_final['FTR'] == 'D', 1, 0)
# THE STRATEGIST IS BLIND TO IDENTITY AND HISTORY
# It only knows about a team's current strength
final_feature_list = ['HomeTeam_Strength', 'AwayTeam_Strength']

X = df_final[final_feature_list].copy()
y = df_final['is_draw']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sm = SMOTE(random_state=42); X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# --- 4. Train and Save STRATEGIST Model ---
print("\nTraining STRATEGIST Draw-Finder model...")
scale_pos_weight = np.sum(y_train_res == 0) / np.sum(y_train_res == 1)
draw_model_strategist = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight, use_label_encoder=False)
draw_model_strategist.fit(X_train_res, y_train_res)

print("\n--- Strategist Model Evaluation ---")
y_pred = draw_model_strategist.predict(X_test)
print(classification_report(y_test, y_pred))

with open('draw_model_strategist.pkl', 'wb') as file:
    pickle.dump(draw_model_strategist, file)
print("\nSTRATEGIST 'Draw-Finder' model saved successfully!")


  df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)


Initiating STRATEGIST Training for 'Draw-Finder' (Context-Agnostic)...
Generating all features (Rank and Strength)...


100%|██████████| 9232/9232 [00:00<00:00, 25989.52it/s]
100%|██████████| 9232/9232 [00:00<00:00, 31849.99it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training STRATEGIST Draw-Finder model...

--- Strategist Model Evaluation ---
              precision    recall  f1-score   support

           0       0.77      0.64      0.70      1394
           1       0.27      0.41      0.32       453

    accuracy                           0.58      1847
   macro avg       0.52      0.52      0.51      1847
weighted avg       0.64      0.58      0.60      1847


STRATEGIST 'Draw-Finder' model saved successfully!


In [6]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import classification_report

print("Initiating STRATEGIST Training for 'Winner-Picker' (Context-Agnostic)...")

# --- 1. Load Data & 2. Full Feature Engineering ---
# [This section is identical to the other scripts]
df_final = pd.read_csv("full_feature_dataset_expanded.csv")
df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)
df_final = df_final.sort_values('Date')
print("Generating all features (Rank and Strength)...")
def get_season(date):
    if date.month >= 8: return f"{date.year}-{date.year + 1}"
    else: return f"{date.year - 1}-{date.year}"
df_final['Season'] = df_final['Date'].apply(get_season)
ranks_home, ranks_away = [], []
points_cache = defaultdict(lambda: defaultdict(int))
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    current_season = row['Season']; home_team, away_team = row['HomeTeam'], row['AwayTeam']
    season_points = points_cache[current_season]
    standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
    rank_map = {team: rank + 1 for rank, (team, points) in enumerate(standings)}
    ranks_home.append(rank_map.get(home_team, 20)); ranks_away.append(rank_map.get(away_team, 20))
    if row['FTR'] == 'H': points_cache[current_season][home_team] += 3
    elif row['FTR'] == 'A': points_cache[current_season][away_team] += 3
    else: points_cache[current_season][home_team] += 1; points_cache[current_season][away_team] += 1
df_final['HomeTeam_League_Rank'] = ranks_home; df_final['AwayTeam_League_Rank'] = ranks_away
strength_ratings = defaultdict(lambda: 1500)
K = 30; home_strengths, away_strengths = [], []
for index, row in tqdm(df_final.iterrows(), total=df_final.shape[0]):
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    home_rating, away_rating = strength_ratings[home_team], strength_ratings[away_team]
    home_strengths.append(home_rating); away_strengths.append(away_rating)
    expected_home = 1 / (1 + 10**((away_rating - home_rating) / 400))
    if row['FTR'] == 'H': actual_home = 1
    elif row['FTR'] == 'A': actual_home = 0
    else: actual_home = 0.5
    new_home_rating = home_rating + K * (actual_home - expected_home)
    new_away_rating = away_rating + K * ((1 - actual_home) - (1 - expected_home))
    strength_ratings[home_team] = new_home_rating; strength_ratings[away_team] = new_away_rating
df_final['HomeTeam_Strength'] = home_strengths; df_final['AwayTeam_Strength'] = away_strengths


# --- 3. Prepare Data for STRATEGIST Model ---
non_draw_df = df_final[df_final['FTR'] != 'D'].copy()
# THE STRATEGIST IS BLIND TO IDENTITY AND HISTORY
final_feature_list = ['HomeTeam_Strength', 'AwayTeam_Strength']

X = non_draw_df[final_feature_list].copy()
y = non_draw_df['FTR']

y_encoder = LabelEncoder()
y_encoded = y_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# --- 4. Train and Save STRATEGIST Model ---
print("\nTraining STRATEGIST Winner-Picker model...")
win_loss_model_strategist = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, use_label_encoder=False)
win_loss_model_strategist.fit(X_train, y_train)

print("\n--- Strategist Model Evaluation ---")
y_pred = win_loss_model_strategist.predict(X_test)
print(classification_report(y_test, y_pred, target_names=y_encoder.classes_))

with open('win_lose_model_strategist.pkl', 'wb') as file:
    pickle.dump(win_loss_model_strategist, file)
    
print("\nSTRATEGIST 'Winner-Picker' model saved successfully!")


  df_final['Date'] = pd.to_datetime(df_final['Date'], dayfirst=True)


Initiating STRATEGIST Training for 'Winner-Picker' (Context-Agnostic)...
Generating all features (Rank and Strength)...


100%|██████████| 9232/9232 [00:00<00:00, 25851.25it/s]
100%|██████████| 9232/9232 [00:00<00:00, 33791.85it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training STRATEGIST Winner-Picker model...

--- Strategist Model Evaluation ---
              precision    recall  f1-score   support

           A       0.63      0.51      0.56       549
           H       0.72      0.80      0.76       845

    accuracy                           0.69      1394
   macro avg       0.67      0.66      0.66      1394
weighted avg       0.68      0.69      0.68      1394


STRATEGIST 'Winner-Picker' model saved successfully!
