In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [None]:
try:
    try:
        df = pd.read_csv("merged_dataset.csv", encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv("merged_dataset.csv", encoding='ISO-8859-1')

    print("Initiating the final dataset creation process...")
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
    df.dropna(subset=['Date', 'HomeTeam', 'AwayTeam', 'FTR'], inplace=True)
    df.sort_values(by='Date', inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Base dataset loaded and sorted by date.")
    
    # --- Betting Odds ---
    home_odds_cols = [col for col in ['B365H', 'WHH', 'LBH'] if col in df.columns]
    draw_odds_cols = [col for col in ['B365D', 'WHD', 'LBD'] if col in df.columns]
    away_odds_cols = [col for col in ['B365A', 'WHA', 'LBA'] if col in df.columns]
    df['Avg_Odds_H'] = df[home_odds_cols].mean(axis=1)
    df['Avg_Odds_D'] = df[draw_odds_cols].mean(axis=1)
    df['Avg_Odds_A'] = df[away_odds_cols].mean(axis=1)
    # Simple forward-fill for any remaining missing odds
    df[['Avg_Odds_H', 'Avg_Odds_D', 'Avg_Odds_A']] = df[['Avg_Odds_H', 'Avg_Odds_D', 'Avg_Odds_A']].fillna(method='ffill')

    # --- Team Form ---
    stats_cols = ['FTHG', 'FTAG', 'HTHG', 'HTAG' ,'HS', 'AS', 'HF', 'AF', 'HC', 'AC', 'HST', 'AST', 'HY', 'AY', 'HR', 'AR']
    form_feature_names = [f'H_form_{col}' for col in stats_cols] + [f'A_form_{col}' for col in stats_cols]
    
    # This is a more efficient way to calculate rolling averages for form
    for col in stats_cols:
        df[f'H_form_{col}'] = df.groupby('HomeTeam')[col].transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())
        df[f'A_form_{col}'] = df.groupby('AwayTeam')[col].transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())
    df.dropna(subset=form_feature_names, inplace=True) # Drop early matches with no form data

    # --- Head-to-Head (H2H) ---
    # This part requires a loop, as it's context-dependent for each match
    h2h_features = []
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Calculating H2H"):
        home_team, away_team, date = row['HomeTeam'], row['AwayTeam'], row['Date']
        h2h_df = df[((df['HomeTeam'] == home_team) & (df['AwayTeam'] == away_team)) | ((df['HomeTeam'] == away_team) & (df['AwayTeam'] == home_team))]
        past_h2h = h2h_df[h2h_df['Date'] < date]
        if len(past_h2h) == 0:
            h2h_features.append([0, 0, 0])
            continue
        hw = len(past_h2h[(past_h2h['HomeTeam'] == home_team) & (past_h2h['FTR'] == 'H')])
        aw = len(past_h2h[(past_h2h['AwayTeam'] == home_team) & (past_h2h['FTR'] == 'A')])
        home_wins = hw + aw
        draws = len(past_h2h[past_h2h['FTR'] == 'D'])
        total_games = len(past_h2h)
        h2h_features.append([(home_wins / total_games), ((total_games - home_wins - draws) / total_games), (draws / total_games)])
    
    h2h_df = pd.DataFrame(h2h_features, columns=['H_H2H_win_pct', 'A_H2H_win_pct', 'H2H_draw_pct'], index=df.index)
    df = pd.concat([df, h2h_df], axis=1)

    # --- 4. Feature Engineering: Advanced Temporal Features ---
    print("Generating Temporal League Rank for each match...")
    
    # --- Temporal League Rank ---
    def get_season(date):
        return f"{date.year}-{date.year + 1}" if date.month >= 8 else f"{date.year - 1}-{date.year}"
    df['Season'] = df['Date'].apply(get_season)
    ranks_home, ranks_away = [], []
    points_cache = defaultdict(lambda: defaultdict(int))
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Calculating Ranks"):
        season, home, away = row['Season'], row['HomeTeam'], row['AwayTeam']
        season_points = points_cache[season]
        standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
        rank_map = {team: r + 1 for r, (team, p) in enumerate(standings)}
        ranks_home.append(rank_map.get(home, 15))
        ranks_away.append(rank_map.get(away, 15))
        if row['FTR'] == 'H': points_cache[season][home] += 3
        elif row['FTR'] == 'A': points_cache[season][away] += 3
        else: points_cache[season][home] += 1; points_cache[season][away] += 1
    df['HomeTeam_League_Rank'] = ranks_home
    df['AwayTeam_League_Rank'] = ranks_away
    print("Temporal League Rank columns added.\nGenerating Temporal Team Strength for each match...")
    # --- Temporal Team Strength (Elo) ---
    strength_home, strength_away = [], []
    strength_cache = defaultdict(lambda: 1500)
    K = 30
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Calculating Strengths"):
        home, away = row['HomeTeam'], row['AwayTeam']
        r_h, r_a = strength_cache[home], strength_cache[away]
        strength_home.append(r_h); strength_away.append(r_a)
        e_h = 1 / (1 + 10**((r_a - r_h) / 400))
        e_a = 1 - e_h
        if row['FTR'] == 'H': s_h, s_a = 1, 0
        elif row['FTR'] == 'A': s_h, s_a = 0, 1
        else: s_h, s_a = 0.5, 0.5
        n_r_h = r_h + K * (s_h - e_h)
        n_r_a = r_a + K * (s_a - e_a)
        strength_cache[home], strength_cache[away] = n_r_h, n_r_a
    df['HomeTeam_Strength'] = strength_home
    df['AwayTeam_Strength'] = strength_away
    print("Temporal Team Strength columns added.")
    # --- 5. Final Save ---
    output_path = "full_feature_dataset_expanded.csv"
    df.to_csv(output_path, index=False)
    print("✅ Success! The final dataset has been saved as 'full_feature_dataset_expanded.csv'.\nYou can now run your training scripts again.")
except Exception as e:
    print(f"An error occurred: {e}")

  df_base['Date'] = pd.to_datetime(df_base['Date'], dayfirst=True)


Initiating the final dataset creation process...
Base dataset loaded and sorted by date.
Generating Temporal League Rank for each match...


Calculating Ranks: 100%|██████████| 9232/9232 [00:00<00:00, 21617.38it/s]


Temporal League Rank columns added.
Generating Temporal Team Strength for each match...


Calculating Strengths: 100%|██████████| 9232/9232 [00:00<00:00, 27465.94it/s]


Temporal Team Strength columns added.

✅ Success! The final dataset has been saved as 'full_feature_dataset_expanded.csv'.
You can now run your training scripts again.


In [2]:
# --- Step 1: Load and Split Data ---
print("\n[STEP 1/3] Loading data and creating a unified train/test split...")
current_dir = os.getcwd()
df_final = pd.read_csv(os.path.join(current_dir, "full_feature_dataset_expanded.csv"))
df_final = df_final.dropna()

feature_list = [
    'HomeTeam', 'AwayTeam', 'Season', 'HTHG', 'HTAG', 'HS', 'AS', 'AST', 'HST', 
    'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'HF', 'AF', 
    'HomeTeam_League_Rank', 'AwayTeam_League_Rank', 'HomeTeam_Strength', 'AwayTeam_Strength'
] + [col for col in df_final.columns if 'Avg_Odds' in col or 'form' in col or 'H2H' in col]

X = df_final[feature_list].copy()
y = df_final['FTR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Data split successfully.")


# --- Step 2: Train and Evaluate the Pure CatBoost Model ---
print("\n[STEP 2/3] Training and evaluating the Pure CatBoost model on the test set...")
cat_features = ['HomeTeam', 'AwayTeam', 'Season']
le_target = LabelEncoder()
y_train_encoded = le_target.fit_transform(y_train)
y_test_encoded = le_target.transform(y_test)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=7,
    loss_function='MultiClass',
    auto_class_weights='Balanced',
    early_stopping_rounds=50,
    verbose=100
)
model.fit(X_train, y_train_encoded, cat_features=cat_features, eval_set=(X_test, y_test_encoded))

y_pred_encoded = model.predict(X_test)
y_pred = le_target.inverse_transform(y_pred_encoded)



# --- Step 3: Save the Final Assets (Optional, if you choose this model) ---
print("\n[STEP 3/3] Assembling and saving the final asset toolkit for this model...")
final_assets = {
    'model': model,
    'target_encoder': le_target,
}
file_path = 'pure_catboost_assets.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(final_assets, file)

print(f"\n✅ Success! The Pure CatBoost model toolkit has been saved to '{file_path}'.")


[STEP 1/3] Loading data and creating a unified train/test split...
Data split successfully.

[STEP 2/3] Training and evaluating the Pure CatBoost model on the test set...
0:	learn: 1.0755762	test: 1.0757772	best: 1.0757772 (0)	total: 189ms	remaining: 3m 8s
100:	learn: 0.7523771	test: 0.7996716	best: 0.7995851 (99)	total: 4.52s	remaining: 40.2s
200:	learn: 0.6893679	test: 0.7899262	best: 0.7898549 (191)	total: 8.58s	remaining: 34.1s
300:	learn: 0.6234841	test: 0.7887683	best: 0.7880250 (273)	total: 12.6s	remaining: 29.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7880250326
bestIteration = 273

Shrink model to first 274 iterations.

[STEP 3/3] Assembling and saving the final asset toolkit for this model...

✅ Success! The Pure CatBoost model toolkit has been saved to 'pure_catboost_assets.pkl'.


  y = column_or_1d(y, warn=True)


In [3]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\n-------------------------------------------------")
print(f"Pure CatBoost System Accuracy: {accuracy*100:.2f}%")
print(f"-------------------------------------------------")
print("\nPure CatBoost System Classification Report:")
print(classification_report(y_test, y_pred))



-------------------------------------------------
Pure CatBoost System Accuracy: 64.59%
-------------------------------------------------

Pure CatBoost System Classification Report:
              precision    recall  f1-score   support

           A       0.72      0.69      0.70       549
           D       0.39      0.49      0.43       453
           H       0.79      0.70      0.75       845

    accuracy                           0.65      1847
   macro avg       0.63      0.63      0.63      1847
weighted avg       0.67      0.65      0.66      1847

