In [4]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import os

print("Initiating the final dataset creation process...")

# --- 1. Load the Base Dataset ---
# This should be the dataset that has the Form and H2H features, but not the Rank and Strength.
#script_dir = os.path.dirname(__file__)
try:
    #df_base = pd.read_csv(os.path.join(script_dir, "base_features.csv")) # Make sure you have this base file
    df_base = pd.read_csv("C:/Users/arnas/Desktop/Projects/Match_winner/AI_Match_winner_advanced/base_feature_dataset_expanded.csv")
except FileNotFoundError:
    print("Error: 'base_features.csv' not found. Please ensure your initial dataset is named correctly.")
    exit()

df_base['Date'] = pd.to_datetime(df_base['Date'], dayfirst=True)
df_base = df_base.sort_values('Date').reset_index(drop=True)
print("Base dataset loaded and sorted by date.")

# --- 2. Generate Temporal League Rank ---
print("Generating Temporal League Rank for each match...")

def get_season(date):
    if date.month >= 8:
        return f"{date.year}-{date.year + 1}"
    else:
        return f"{date.year - 1}-{date.year}"

df_base['Season'] = df_base['Date'].apply(get_season)

ranks_home, ranks_away = [], []
points_cache = defaultdict(lambda: defaultdict(int))

for index, row in tqdm(df_base.iterrows(), total=df_base.shape[0], desc="Calculating Ranks"):
    current_season = row['Season']
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    
    # Get ranks *before* the current match
    season_points = points_cache[current_season]
    standings = sorted(season_points.items(), key=lambda item: item[1], reverse=True)
    rank_map = {team: rank + 1 for rank, (team, points) in enumerate(standings)}
    
    ranks_home.append(rank_map.get(home_team, 15)) # Default to mid-table
    ranks_away.append(rank_map.get(away_team, 15))
    
    # Update points *after* the match
    if row['FTR'] == 'H':
        points_cache[current_season][home_team] += 3
    elif row['FTR'] == 'A':
        points_cache[current_season][away_team] += 3
    elif row['FTR'] == 'D':
        points_cache[current_season][home_team] += 1
        points_cache[current_season][away_team] += 1

df_base['HomeTeam_League_Rank'] = ranks_home
df_base['AwayTeam_League_Rank'] = ranks_away
print("Temporal League Rank columns added.")

# --- 3. Generate Temporal Team Strength (Elo-style) ---
print("Generating Temporal Team Strength for each match...")

strength_home, strength_away = [], []
strength_cache = defaultdict(lambda: 1500) # Start all teams at a default 1500 rating
K_FACTOR = 30

for index, row in tqdm(df_base.iterrows(), total=df_base.shape[0], desc="Calculating Strengths"):
    home_team, away_team = row['HomeTeam'], row['AwayTeam']
    
    # Get strength *before* the current match
    home_rating = strength_cache[home_team]
    away_rating = strength_cache[away_team]
    strength_home.append(home_rating)
    strength_away.append(away_rating)
    
    # Calculate expected outcomes
    expected_home = 1 / (1 + 10**((away_rating - home_rating) / 400))
    expected_away = 1 - expected_home
    
    # Get actual outcomes
    if row['FTR'] == 'H':
        actual_home, actual_away = 1, 0
    elif row['FTR'] == 'A':
        actual_home, actual_away = 0, 1
    else: # Draw
        actual_home, actual_away = 0.5, 0.5
        
    # Update ratings *after* the match
    new_home_rating = home_rating + K_FACTOR * (actual_home - expected_home)
    new_away_rating = away_rating + K_FACTOR * (actual_away - expected_away)
    strength_cache[home_team] = new_home_rating
    strength_cache[away_team] = new_away_rating

df_base['HomeTeam_Strength'] = strength_home
df_base['AwayTeam_Strength'] = strength_away
print("Temporal Team Strength columns added.")

# --- 4. Save the Final, Expanded Dataset ---
final_filename = 'full_feature_dataset_expanded.csv'
df_base.to_csv(final_filename, index=False)
print(f"\n✅ Success! The final dataset has been saved as '{final_filename}'.")
print("You can now run your training scripts again.")

  df_base['Date'] = pd.to_datetime(df_base['Date'], dayfirst=True)


Initiating the final dataset creation process...
Base dataset loaded and sorted by date.
Generating Temporal League Rank for each match...


Calculating Ranks: 100%|██████████| 9232/9232 [00:00<00:00, 21617.38it/s]


Temporal League Rank columns added.
Generating Temporal Team Strength for each match...


Calculating Strengths: 100%|██████████| 9232/9232 [00:00<00:00, 27465.94it/s]


Temporal Team Strength columns added.

✅ Success! The final dataset has been saved as 'full_feature_dataset_expanded.csv'.
You can now run your training scripts again.


In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pickle
import os

print("Initiating definitive evaluation for the PURE CATBOOST MODEL...")

# --- Step 1: Load and Split Data ---
print("\n[STEP 1/3] Loading data and creating a unified train/test split...")
current_dir = os.getcwd()
df_final = pd.read_csv(os.path.join(current_dir, "full_feature_dataset_expanded.csv"))
df_final = df_final.dropna()

feature_list = [
    'HomeTeam', 'AwayTeam', 'Season', 'HTHG', 'HTAG', 'HS', 'AS', 'AST', 'HST', 
    'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'HF', 'AF', 
    'HomeTeam_League_Rank', 'AwayTeam_League_Rank', 'HomeTeam_Strength', 'AwayTeam_Strength'
] + [col for col in df_final.columns if 'Avg_Odds' in col or 'form' in col or 'H2H' in col]

X = df_final[feature_list].copy()
y = df_final['FTR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Data split successfully.")


# --- Step 2: Train and Evaluate the Pure CatBoost Model ---
print("\n[STEP 2/3] Training and evaluating the Pure CatBoost model on the test set...")
cat_features = ['HomeTeam', 'AwayTeam', 'Season']
le_target = LabelEncoder()
y_train_encoded = le_target.fit_transform(y_train)
y_test_encoded = le_target.transform(y_test)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=7,
    loss_function='MultiClass',
    auto_class_weights='Balanced',
    early_stopping_rounds=50,
    verbose=100
)
model.fit(X_train, y_train_encoded, cat_features=cat_features, eval_set=(X_test, y_test_encoded))

y_pred_encoded = model.predict(X_test)
y_pred = le_target.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n-------------------------------------------------")
print(f"Pure CatBoost System Accuracy: {accuracy*100:.2f}%")
print(f"-------------------------------------------------")
print("\nPure CatBoost System Classification Report:")
print(classification_report(y_test, y_pred))


# --- Step 3: Save the Final Assets (Optional, if you choose this model) ---
print("\n[STEP 3/3] Assembling and saving the final asset toolkit for this model...")
final_assets = {
    'model': model,
    'target_encoder': le_target,
}
file_path = 'pure_catboost_assets.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(final_assets, file)

print(f"\n✅ Success! The Pure CatBoost model toolkit has been saved to '{file_path}'.")

Initiating definitive evaluation for the PURE CATBOOST MODEL...

[STEP 1/3] Loading data and creating a unified train/test split...
Data split successfully.

[STEP 2/3] Training and evaluating the Pure CatBoost model on the test set...
0:	learn: 1.0755762	test: 1.0757772	best: 1.0757772 (0)	total: 204ms	remaining: 3m 23s
100:	learn: 0.7523771	test: 0.7996716	best: 0.7995851 (99)	total: 5s	remaining: 44.5s
200:	learn: 0.6893679	test: 0.7899262	best: 0.7898549 (191)	total: 9.9s	remaining: 39.4s
300:	learn: 0.6234841	test: 0.7887683	best: 0.7880250 (273)	total: 15s	remaining: 34.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7880250326
bestIteration = 273

Shrink model to first 274 iterations.

-------------------------------------------------
Pure CatBoost System Accuracy: 64.59%
-------------------------------------------------

Pure CatBoost System Classification Report:
              precision    recall  f1-score   support

           A       0.72      0.69    

  y = column_or_1d(y, warn=True)
