# In [None]:
# 🧠 New Predictions Notebook: Clean Deployment-Ready Version

import datetime
from collections import defaultdict, deque
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# ============================================
# ✅ 1. Load Trained Artifacts
# ============================================
# Directory holding every artifact this script loads.
# It must be a raw string: in an ordinary literal the backslashes of this
# Windows path start escape sequences ("\t" -> tab, "\r" -> carriage return,
# and "\xgb..." is an invalid "\x" escape, i.e. a SyntaxError).
ARTIFACT_DIR = Path(
    r"E:\Projects\Master Projects (Core)\Tennis Match Prediction\Models, Features and Encoders"
)

# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code. Only point ARTIFACT_DIR at files you produced yourself.
model = joblib.load(ARTIFACT_DIR / "xgb_model_final.pkl")
trained_columns = joblib.load(ARTIFACT_DIR / "trained_columns.pkl")
encoders = joblib.load(ARTIFACT_DIR / "label_encoders.pkl")
global_elo_db = joblib.load(ARTIFACT_DIR / "global_elo_final.pkl")
surface_elo_db = joblib.load(ARTIFACT_DIR / "surface_elo_final.pkl")
h2h_db = joblib.load(ARTIFACT_DIR / "h2h_record_final.pkl")
form_db = joblib.load(ARTIFACT_DIR / "recent_results_final.pkl")
fatigue_db = joblib.load(ARTIFACT_DIR / "match_history_final.pkl")

# Rating used for any player that is missing from the Elo lookup tables.
BASE_ELO = 1500

# ============================================
# ✅ 2. Load and Prepare New Matches
# ============================================

# This is an example. You would typically load from a CSV:
# df_new = pd.read_csv("upcoming_matches.csv")
# Inline sample fixtures in column-oriented form: position i of every list
# describes match i.
new_matches_data = {
    'tourney_date': ['2025-06-26', '2025-06-27'],
    'surface': ['Grass', 'Hard'],
    'round': ['F', 'SF'],
    'tourney_level': ['G', 'M'],
    'draw_size': [128.0, 64.0],
    'best_of': [5, 3],
    'player1_id': [104925, 200678],
    'player2_id': [104745, 126207],
    'player1_name': ['Carlos Alcaraz', 'Jannik Sinner'],
    'player2_name': ['Novak Djokovic', 'Daniil Medvedev'],
    'p1_rank': [3.0, 1.0],
    'p2_rank': [2.0, 5.0],
    'p1_hand': ['R', 'R'],
    'p2_hand': ['R', 'R'],
    'p1_ht': [183.0, 188.0],
    'p2_ht': [188.0, 198.0],
    'p1_age': [22.1, 23.8],
    'p2_age': [38.1, 29.3],
}

# Build the frame and parse the ISO date strings into real timestamps in one
# expression; `assign` replaces the column in place, so column order is kept.
df_new = pd.DataFrame(new_matches_data).assign(
    tourney_date=lambda frame: pd.to_datetime(frame['tourney_date'])
)

# ============================================
# ✅ 3. Feature Engineering Function
# ============================================
def add_engineered_features(df):
    """Attach the Elo, head-to-head, form and fatigue features to ``df``.

    Lookups read the module-level tables ``global_elo_db``,
    ``surface_elo_db``, ``h2h_db``, ``form_db`` and ``fatigue_db``. A player
    missing from a table gets a neutral value: ``BASE_ELO`` for ratings, 0.5
    for rates, and zero recent matches for fatigue.

    Features are computed column-wise rather than with
    ``DataFrame.apply(axis=1)``, so a frame with zero rows is handled (it
    simply gains empty feature columns) and no per-row Series is built.

    Args:
        df: One row per match, with the columns ``player1_id``,
            ``player2_id``, ``surface`` and ``tourney_date``. The date values
            must support subtracting a ``datetime.timedelta`` and comparison
            with the dates stored in ``fatigue_db``.

    Returns:
        The same ``df`` object, modified in place, with the added columns
        ``p1_global_elo``, ``p2_global_elo``, ``elo_diff``,
        ``surface_elo_diff``, ``h2h_winrate``, ``form_diff`` and
        ``fatigue_diff``. Every ``*_diff`` is player 1 minus player 2.
    """
    def as_feature(values):
        # Align the per-row results with df's index. With zero rows there is
        # nothing to infer a dtype from, so pin float64 instead of object.
        return pd.Series(values, index=df.index, dtype=None if values else 'float64')

    def surface_elo(player_id, surface):
        return surface_elo_db.get(player_id, {}).get(surface, BASE_ELO)

    def h2h_winrate(p1_id, p2_id):
        # Records are keyed by the id pair in ascending order.
        pair = tuple(sorted((p1_id, p2_id)))
        recorded_wins, total_matches = h2h_db.get(pair, (0, 0))
        if total_matches == 0:
            return 0.5  # no shared history: neutral prior
        # The recorded win count is read as belonging to the lower-id player,
        # so it is flipped when player 1 is not that player.
        if p1_id < p2_id:
            return recorded_wins / total_matches
        return (total_matches - recorded_wins) / total_matches

    def get_form(player_id):
        hist = form_db.get(player_id, ())
        return sum(hist) / len(hist) if hist else 0.5

    def matches_since(player_id, window_start):
        return sum(1 for played_on in fatigue_db.get(player_id, []) if played_on > window_start)

    # Plain Python values per row (ints, strings, timestamps), used as lookup keys.
    p1_ids = df['player1_id'].tolist()
    p2_ids = df['player2_id'].tolist()
    surfaces = df['surface'].tolist()
    match_dates = df['tourney_date'].tolist()

    df['p1_global_elo'] = df['player1_id'].map(lambda pid: global_elo_db.get(pid, BASE_ELO))
    df['p2_global_elo'] = df['player2_id'].map(lambda pid: global_elo_db.get(pid, BASE_ELO))
    df['elo_diff'] = df['p1_global_elo'] - df['p2_global_elo']

    df['surface_elo_diff'] = as_feature([
        surface_elo(p1, surface) - surface_elo(p2, surface)
        for p1, p2, surface in zip(p1_ids, p2_ids, surfaces)
    ])

    df['h2h_winrate'] = as_feature([h2h_winrate(p1, p2) for p1, p2 in zip(p1_ids, p2_ids)])

    df['form_diff'] = df['player1_id'].map(get_form) - df['player2_id'].map(get_form)

    # Fatigue = number of recorded matches strictly after (match date - 30 days).
    window_starts = [match_date - datetime.timedelta(days=30) for match_date in match_dates]
    df['fatigue_diff'] = as_feature([
        matches_since(p1, start) - matches_since(p2, start)
        for p1, p2, start in zip(p1_ids, p2_ids, window_starts)
    ])

    return df

# ============================================
# ✅ 4. Apply Features + Encode + Predict
# ============================================
# Engineer features on a copy so df_new keeps only the raw match columns
# (plus the prediction columns added below).
df_pred = add_engineered_features(df_new.copy())

# Re-apply the loaded label encoders. The single 'hand' encoder is shared by
# both players' handedness columns; any other encoder is applied to the column
# carrying its own name and skipped when that column is absent.
for feature_name, label_encoder in encoders.items():
    if feature_name == 'hand':
        targets = ('p1_hand', 'p2_hand')
    elif feature_name in df_pred.columns:
        targets = (feature_name,)
    else:
        targets = ()
    for target in targets:
        df_pred[target] = label_encoder.transform(df_pred[target])

# Select the model inputs in the order listed in trained_columns.
X_pred = df_pred[trained_columns]
probs = model.predict_proba(X_pred)

# Column 1 of predict_proba is taken as player 1's win probability. Player 1
# is named the winner only when it is strictly above 0.5; otherwise player 2.
df_new['p1_win_probability'] = probs[:, 1]
p1_is_favourite = df_new['p1_win_probability'] > 0.5
df_new['predicted_winner_name'] = np.where(
    p1_is_favourite,
    df_new['player1_name'],
    df_new['player2_name'],
)

# ============================================
# ✅ 5. Final Output
# ============================================
# Print each match-up with the predicted winner and player 1's win
# probability; numeric columns are rounded to three decimals for display.
# The banner emoji was stored as mojibake (UTF-8 bytes decoded as cp1252) and
# is restored here.
print("\n\n🎾 Predicted Results:")
print(df_new[['player1_name', 'player2_name', 'predicted_winner_name', 'p1_win_probability']].round(3))
