In [21]:
#!/Users/jim/anaconda3/envs/sb/bin/python

import pandas as pd
import numpy as np
import duckdb as ddb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime as dt

In [4]:
# Directory holding the ATP match CSVs. Kept in one place so the notebook can
# be pointed at another checkout by editing a single line.
DATA_DIR = '/Users/jim/dev/sb/tennis_stats/tennis_atp'

# The two seasons loaded are derived from the wall clock, so re-running the
# notebook in a later calendar year reads different files.
this_year = dt.now().year
last_year = this_year - 1

# Files named atp_matches_<year>.csv.
this_year_tour = pd.read_csv(f'{DATA_DIR}/atp_matches_{this_year}.csv')
last_year_tour = pd.read_csv(f'{DATA_DIR}/atp_matches_{last_year}.csv')

# Files named atp_matches_qual_chall_<year>.csv.
this_year_chall = pd.read_csv(f'{DATA_DIR}/atp_matches_qual_chall_{this_year}.csv')
last_year_chall = pd.read_csv(f'{DATA_DIR}/atp_matches_qual_chall_{last_year}.csv')

In [113]:
# Stack the four season frames row-wise and parse the YYYYMMDD tourney_date
# values into proper datetimes.
season_frames = [this_year_tour, last_year_tour, this_year_chall, last_year_chall]
all_stats = pd.concat(season_frames, axis=0).assign(
    tourney_date=lambda frame: pd.to_datetime(frame['tourney_date'], format='%Y%m%d')
)

# Show the available column names as the cell output.
list(all_stats.columns)

['tourney_id',
 'tourney_name',
 'surface',
 'draw_size',
 'tourney_level',
 'tourney_date',
 'match_num',
 'winner_id',
 'winner_seed',
 'winner_entry',
 'winner_name',
 'winner_hand',
 'winner_ht',
 'winner_ioc',
 'winner_age',
 'loser_id',
 'loser_seed',
 'loser_entry',
 'loser_name',
 'loser_hand',
 'loser_ht',
 'loser_ioc',
 'loser_age',
 'score',
 'best_of',
 'round',
 'minutes',
 'w_ace',
 'w_df',
 'w_svpt',
 'w_1stIn',
 'w_1stWon',
 'w_2ndWon',
 'w_SvGms',
 'w_bpSaved',
 'w_bpFaced',
 'l_ace',
 'l_df',
 'l_svpt',
 'l_1stIn',
 'l_1stWon',
 'l_2ndWon',
 'l_SvGms',
 'l_bpSaved',
 'l_bpFaced',
 'winner_rank',
 'winner_rank_points',
 'loser_rank',
 'loser_rank_points']

In [116]:
# Head-to-head tally between each (winner, loser) pair.
#
# The query groups all_stats by winner, loser AND tourney_date, and counts the
# rows in each group whose tourney_level is not 'C' (presumably Challenger-level
# events -- confirm against the data dictionary). Because tourney_date is a
# grouping key, h2h_wins is a per-date count, not a running/career total;
# groups containing only 'C' rows still appear, with h2h_wins = 0.
#
# `all_stats` is referenced by name inside the SQL; this relies on DuckDB
# resolving it to the pandas DataFrame of that name in the notebook namespace.
h2h_q = """
select
  winner_id,
  winner_name,
  loser_id,
  loser_name,
  tourney_date,
  sum(case when tourney_level != 'C' then 1 else 0 end) as h2h_wins
from all_stats
group by 1,2,3,4,5
order by h2h_wins desc
"""
# Materialise the result as a pandas DataFrame and render it.
h2h = ddb.query(h2h_q).to_df()
display(h2h)

Unnamed: 0,winner_id,winner_name,loser_id,loser_name,tourney_date,h2h_wins
0,208010,Pablo Llamas Ruiz,122554,David Jorda Sanchis,2024-04-01,2.0
1,106298,Lucas Pouille,200514,Jurij Rodionov,2023-05-29,2.0
2,106220,Dimitar Kuzmanov,126409,Hugo Grenier,2023-04-03,2.0
3,132686,Nuno Borges,105449,Steve Johnson,2023-02-13,2.0
4,105777,Grigor Dimitrov,208029,Holger Rune,2024-01-01,1.0
...,...,...,...,...,...,...
19573,144645,Zdenek Kolar,207231,Mathys Erhard,2023-11-27,0.0
19574,200416,August Holmgren,132999,Kai Wehnelt,2023-11-27,0.0
19575,106227,Ji Sung Nam,106254,Calum Puttergill,2023-11-27,0.0
19576,208389,James Trotter,132054,Nathan Ponwith,2023-11-27,0.0


In [115]:
# Per-player serve statistics, split by result type.
#
# `data` CTE: two aggregations over all_stats glued together with UNION ALL.
#   * first branch  -- the winner's side of each match (w_* columns), type = 'wins'
#   * second branch -- the loser's side of each match (l_* columns), type = 'loss'
#   Each branch groups by player, surface, tourney_id, tourney_date and type,
#   and sums the match count and the serve counters.
# `players` CTE: re-aggregates `data` by the same six keys. Since `type` differs
#   between the two branches, this second pass does not merge wins with losses.
# Final select: every row of `players`, ordered by the first four columns only
#   (player_id, player_name, surface, tourney_id), so the relative order of the
#   'wins' and 'loss' rows for one tournament is not fixed.
#
# NOTE(review): despite the name `diffs`, no player-vs-player differences are
# computed here; the result is one row per player / tournament / result type.
diffs_q = """
with data as (
select 
  winner_id as player_id,
  winner_name as player_name,
  surface,
  tourney_id,
  tourney_date,
  'wins' as type,
  count(*) as matches,
  sum(w_ace) as aces,
  sum(w_df) as double_faults,
  sum(w_svpt) as serve_points,
  sum(w_1stIn) as first_in,
  sum(w_1stWon) as first_won,
  sum(w_2ndWon) as second_won,
  sum(w_SvGms) as serve_games,
  sum(w_bpSaved) as bp_saved,
  sum(w_bpFaced) as bp_faced
from all_stats
group by 1,2,3,4,5,6

union all

select
  loser_id as player_id,
  loser_name as player_name,
  surface,
  tourney_id,
  tourney_date,
  'loss' as type,
  count(*) as matches,
  sum(l_ace) as aces,
  sum(l_df) as double_faults,
  sum(l_svpt) as serve_points,
  sum(l_1stIn) as first_in,
  sum(l_1stWon) as first_won,
  sum(l_2ndWon) as second_won,
  sum(l_SvGms) as serve_games,
  sum(l_bpSaved) as bp_saved,
  sum(l_bpFaced) as bp_faced
from all_stats as p
group by 1,2,3,4,5,6
)
, players as (
select
  player_id,
  player_name,
  surface,
  tourney_id,
  tourney_date,
  type,
  sum(matches) as matches,
  sum(aces) as aces,
  sum(double_faults) as double_faults,
  sum(serve_points) as serve_points,
  sum(first_in) as first_in,
  sum(first_won) as first_won,
  sum(second_won) as second_won,
  sum(serve_games) as serve_games,
  sum(bp_saved) as bp_saved,
  sum(bp_faced) as bp_faced
from data as p
group by 1,2,3,4,5,6
)
select * from players
order by 1,2,3,4
"""
# Materialise the result as a pandas DataFrame and render it.
diffs = ddb.query(diffs_q).to_df()
display(diffs)

Unnamed: 0,player_id,player_name,surface,tourney_id,tourney_date,type,matches,aces,double_faults,serve_points,first_in,first_won,second_won,serve_games,bp_saved,bp_faced
0,100644,Alexander Zverev,Clay,2023-0308,2023-04-17,loss,1.0,9.0,5.0,68.0,39.0,29.0,12.0,11.0,3.0,6.0
1,100644,Alexander Zverev,Clay,2023-0316,2023-07-17,loss,1.0,1.0,3.0,39.0,23.0,18.0,5.0,8.0,2.0,5.0
2,100644,Alexander Zverev,Clay,2023-0316,2023-07-17,wins,2.0,8.0,6.0,131.0,99.0,76.0,12.0,22.0,5.0,8.0
3,100644,Alexander Zverev,Clay,2023-0322,2023-05-22,wins,2.0,7.0,2.0,54.0,42.0,36.0,6.0,10.0,1.0,1.0
4,100644,Alexander Zverev,Clay,2023-0322,2023-05-22,loss,1.0,6.0,5.0,65.0,49.0,37.0,6.0,10.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29440,212803,Yun Seong Chung,Hard,2024-1741,2024-04-08,wins,2.0,6.0,5.0,142.0,85.0,58.0,33.0,23.0,3.0,8.0
29441,212803,Yun Seong Chung,Hard,2024-1741,2024-04-08,loss,1.0,2.0,2.0,58.0,34.0,21.0,9.0,8.0,5.0,9.0
29442,212803,Yun Seong Chung,Hard,2024-2278,2024-05-13,loss,1.0,5.0,2.0,63.0,40.0,26.0,14.0,11.0,4.0,6.0
29443,212803,Yun Seong Chung,Hard,2024-7490,2024-04-15,wins,1.0,3.0,3.0,87.0,44.0,34.0,25.0,15.0,4.0,6.0


In [None]:
# Match-level base table: every row of all_stats plus the head-to-head record
# between the two players going INTO that match.
#
#   h2h_wins   -- prior h2h_wins rows where this match's winner beat this loser
#   h2h_losses -- prior h2h_wins rows where this match's loser beat this winner
#
# Only h2h rows with a strictly earlier tourney_date are counted, so a match
# never sees its own result or any later one (no look-ahead leakage). Pairs
# with no earlier meeting get 0 rather than NULL.
#
# Correlated subqueries are used instead of two joins so that rows of
# all_stats are never duplicated when a pair has several earlier meetings.
#
# NOTE(review): the earlier draft of this query was unfinished (dangling CTE,
# `hl` never joined); the semantics above are the reviewer's reading of the
# intent -- confirm before building features on it.
base = """
select
  a.*
  , coalesce((
      select sum(hw.h2h_wins)
      from h2h as hw
      where hw.winner_id = a.winner_id
        and hw.loser_id = a.loser_id
        and hw.tourney_date < a.tourney_date
    ), 0) as h2h_wins
  , coalesce((
      select sum(hl.h2h_wins)
      from h2h as hl
      where hl.winner_id = a.loser_id
        and hl.loser_id = a.winner_id
        and hl.tourney_date < a.tourney_date
    ), 0) as h2h_losses
from all_stats as a
"""

In [None]:

# Feature engineering: every feature is player1's value minus player2's value.
# NOTE(review): `data` is not created anywhere in the visible notebook; it is
# expected to be a match-level DataFrame with the player1_* / player2_* columns
# read below plus an 'outcome' column -- confirm where it is built.
data['rank_diff'] = data['player1_rank'] - data['player2_rank']
data['win_diff'] = data['player1_win'] - data['player2_win']
data['surface_win_diff'] = data['player1_surface_win'] - data['player2_surface_win']
# recent form = matches won on surface / total matches on surface over last 6 months
data['recent_form_diff'] = data['player1_recent_form'] - data['player2_recent_form']
data['head_to_head_diff'] = data['player1_head_to_head'] - data['player2_head_to_head']
data['break_points_saved_diff'] = data['player1_bp_saved'] - data['player2_bp_saved']
# The feature list below needs these two columns; without them `data[features]`
# raises KeyError. Source column names follow the player*_break_points_* naming
# used for the tournament bracket -- TODO confirm `data` carries the same names.
data['break_points_generated_diff'] = (
    data['player1_break_points_generated'] - data['player2_break_points_generated']
)
data['break_points_converted_diff'] = (
    data['player1_break_points_converted'] - data['player2_break_points_converted']
)

# Select features and target variable. The order of `features` fixes the column
# order the scaler and model are fitted on, so anything that builds a feature
# row by hand for prediction must use the same order.
# ('break_points_saved_diff' is computed above but intentionally not selected,
# keeping the original 7-feature layout.)
features = [
    'rank_diff', 'win_diff', 'surface_win_diff', 'recent_form_diff',
    'head_to_head_diff', 'break_points_generated_diff', 'break_points_converted_diff'
]
X = data[features]
y = data['outcome']  # Assuming 'outcome' is the target variable where 1 means player1 wins, 0 means player2 wins

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


In [None]:
# Load the tournament bracket dataset.
# The path is relative, so the file is looked up in the kernel's current
# working directory. simulate_tournament reads one opening-round match per row
# and expects these columns: 'player1', 'player2', and for each of player1 /
# player2 the suffixes _rank, _win, _surface_win, _recent_form, _head_to_head,
# _break_points_generated and _break_points_converted.
tournament = pd.read_csv('tournament_bracket.csv')

# Function to predict the winner of a match
def predict_winner(player1_stats, player2_stats):
    """Predict which of two players wins a match.

    Builds a single feature row of player1-minus-player2 differences, passes
    it through the module-level ``scaler`` and asks the module-level ``model``
    for class probabilities.

    Parameters
    ----------
    player1_stats, player2_stats : mapping
        Must provide the keys 'rank', 'win', 'surface_win', 'recent_form',
        'head_to_head', 'break_points_generated' and 'break_points_converted'.

    Returns
    -------
    int
        1 when the probability in column 1 of ``predict_proba`` is strictly
        greater than 0.5, otherwise 0.
    """
    # Order matters: it must match the column order the scaler/model were fitted on.
    stat_keys = (
        'rank',
        'win',
        'surface_win',
        'recent_form',
        'head_to_head',
        'break_points_generated',
        'break_points_converted',
    )
    difference_row = [player1_stats[key] - player2_stats[key] for key in stat_keys]

    scaled_row = scaler.transform([difference_row])
    player1_win_probability = model.predict_proba(scaled_row)[0][1]

    if player1_win_probability > 0.5:
        return 1
    return 0

# Function to simulate a round
def simulate_round(matches):
    """Play every match of one round and collect the players who advance.

    Parameters
    ----------
    matches : iterable of (player1, player2, player1_stats, player2_stats)

    Returns
    -------
    list of (winner, winner_stats)
        One entry per input match, in input order. The winner is decided by
        ``predict_winner`` (truthy result -> player1 advances).
    """
    advancing = []
    for first, second, first_stats, second_stats in matches:
        if predict_winner(first_stats, second_stats):
            victor = first
        else:
            victor = second
        # Stats are picked by comparing the winner's identifier with player1's.
        victor_stats = first_stats if victor == first else second_stats
        advancing.append((victor, victor_stats))
    return advancing

# Function to simulate the tournament
def simulate_tournament(tournament):
    """Simulate a single-elimination bracket through to the final.

    Each row of ``tournament`` is one opening-round match. Rounds are played
    with ``simulate_round``; winners are paired in order (1st with 2nd, 3rd
    with 4th, ...) to form the next round until the final has been played.
    The two semifinal losers then meet in a third-place match decided by
    ``predict_winner``.

    Parameters
    ----------
    tournament : pandas.DataFrame
        Needs the columns 'player1', 'player2' and, for each of player1 /
        player2, the suffixes _rank, _win, _surface_win, _recent_form,
        _head_to_head, _break_points_generated, _break_points_converted.

    Returns
    -------
    tuple
        ``(champion, runner_up, third_place, fourth_place)``. When the bracket
        consists of the final only (one row) there is no semifinal, and
        ``third_place`` / ``fourth_place`` are ``None``.

    Raises
    ------
    ValueError
        If the bracket is empty, or if some round produces an odd number of
        winners so they cannot be paired (row count not a power of two).

    Notes
    -----
    NOTE(review): each player's stats dict -- including 'head_to_head' -- is
    taken from the opening-round row and reused unchanged in later rounds
    against different opponents; confirm that is intended.
    """
    stat_names = (
        'rank', 'win', 'surface_win', 'recent_form', 'head_to_head',
        'break_points_generated', 'break_points_converted',
    )

    def side_stats(row, side):
        # Collect the per-player stats dict expected by predict_winner.
        return {name: row[f'{side}_{name}'] for name in stat_names}

    matches = [
        (row['player1'], row['player2'],
         side_stats(row, 'player1'), side_stats(row, 'player2'))
        for _, row in tournament.iterrows()
    ]
    if not matches:
        raise ValueError("tournament bracket has no matches")

    semifinal_losers = []  # [(player, stats), ...] filled in the 2-match round
    while True:
        winners = simulate_round(matches)

        if len(matches) == 1:
            # The final has to be played too; the champion is its winner,
            # the runner-up is the other finalist.
            first, second = matches[0][0], matches[0][1]
            champion = winners[0][0]
            runner_up = second if champion == first else first
            break

        if len(matches) == 2:
            # A round of exactly two matches is the semifinal round.
            for (first, second, first_stats, second_stats), (winner, _) in zip(matches, winners):
                if winner == first:
                    semifinal_losers.append((second, second_stats))
                else:
                    semifinal_losers.append((first, first_stats))

        if len(winners) % 2:
            raise ValueError(
                f"cannot pair {len(winners)} winners into the next round; "
                "the bracket must contain a power-of-two number of matches"
            )
        matches = [
            (winners[i][0], winners[i + 1][0], winners[i][1], winners[i + 1][1])
            for i in range(0, len(winners), 2)
        ]

    # Determine 3rd and 4th places from a match between the semifinal losers.
    if len(semifinal_losers) == 2:
        (loser_a, loser_a_stats), (loser_b, loser_b_stats) = semifinal_losers
        if predict_winner(loser_a_stats, loser_b_stats):
            third_place, fourth_place = loser_a, loser_b
        else:
            third_place, fourth_place = loser_b, loser_a
    else:
        third_place = fourth_place = None

    return champion, runner_up, third_place, fourth_place

# Run multiple simulations
def run_simulations(tournament, num_simulations):
    """Run the tournament simulation repeatedly and tabulate finishing places.

    Parameters
    ----------
    tournament : passed through unchanged to ``simulate_tournament``.
    num_simulations : int
        Number of times ``simulate_tournament`` is called.

    Returns
    -------
    dict
        ``{'1st': {...}, '2nd': {...}, '3rd': {...}, '4th': {...}}`` where each
        inner dict maps a player to the fraction of simulations in which that
        player finished in that place.

    Notes
    -----
    NOTE(review): predict_winner thresholds the model probability at 0.5 and
    uses no randomness, so every simulation of the same bracket produces the
    same placings; the fractions can only differ from 0/1 once match outcomes
    are sampled.
    """
    place_labels = ('1st', '2nd', '3rd', '4th')
    results = {label: {} for label in place_labels}

    for _ in range(num_simulations):
        champion, runner_up, third_place, fourth_place = simulate_tournament(tournament)
        finishers = (champion, runner_up, third_place, fourth_place)
        for label, player in zip(place_labels, finishers):
            tally = results[label]
            tally[player] = tally.get(player, 0) + 1

    # Turn raw counts into fractions of the total number of simulations.
    for tally in results.values():
        for player in tally:
            tally[player] /= num_simulations

    return results

# How many times the bracket is simulated
num_simulations = 1000

# Simulate the bracket and collect, per finishing place, each player's share of outcomes
probabilistic_outcomes = run_simulations(tournament, num_simulations)

# Report the shares, one section per finishing place
for rank, player_probabilities in probabilistic_outcomes.items():
    print(f"{rank} Place Probabilities:")
    for player in player_probabilities:
        probability = player_probabilities[player]
        print(f"Player: {player}, Probability: {probability:.2f}")
