In [4]:
# Import packages
import http.client
import json
import pandas as pd
from datetime import datetime
import numpy as np
import time
import pickle

In [5]:
# Api keys
eda_api_key = "thp5rpgrbegb8hkc7zrapy36"
mds_api_key = "fqe4f9fyp8m84mg679xu7dnj"

# Variable to count api calls
global CALLS
CALLS = 0

# Make an API call given get, return a json object
def send_req(get, func):
    global CALLS
    # Connect to SR API
    conn = http.client.HTTPSConnection("api.sportradar.com")
    conn.request("GET", get)
    with conn.getresponse() as res:
        # Check if the response status is OK (200)
        if res.status == 200:
            data = res.read().decode("utf-8")
            
            # Check if data is not empty before decoding
            if data:
                CALLS += 1
                return json.loads(data)
            else:
                print("Error: Empty response data: " + func)
                return None
        else:
            print(f"Error: Failed request with status code {res.status}: {res.read().decode()}:" + func)
            return None

# Display json object neatly
def pp(json_obj):
    print(json.dumps(json_obj, indent=4))

In [6]:
with open('data/gs_matches_men.pkl', 'rb') as f:
    gs_matches_m = pickle.load(f)

with open('data/gs_matches_women.pkl', 'rb') as f:
    gs_matches_w = pickle.load(f)

In [7]:
# Query sport event timeline
def rq_sport_event_timeline(event, api_key):
    pre = "/tennis/trial/v3/en/sport_events/"
    post = "/timeline.json"
    get = pre+event+post+"?api_key="+api_key
    return send_req(get,"rq_sport_event_timeline")

# sport event summary
# def rq_sport_event_summary(event, api_key):
#     pre = "/tennis/trial/v3/en/sport_events/"
#     post = "/summary.json"
#     get = pre+event+post+"?api_key="+api_key
#     return send_req(get,"rq_sport_event_summary")

#summary0 = rq_sport_event_summary("sr:sport_event:39826407", eda_api_key)
timeline_m = rq_sport_event_timeline(gs_matches_m[0], eda_api_key)
timeline_w = rq_sport_event_timeline(gs_matches_w[1], eda_api_key)
#pp(timeline0)


In [92]:
pp(timeline_w)

{
    "generated_at": "2024-01-29T01:59:53+00:00",
    "sport_event": {
        "id": "sr:sport_event:42023447",
        "start_time": "2023-07-03T10:00:00+00:00",
        "start_time_confirmed": true,
        "sport_event_context": {
            "sport": {
                "id": "sr:sport:5",
                "name": "Tennis"
            },
            "category": {
                "id": "sr:category:6",
                "name": "WTA"
            },
            "competition": {
                "id": "sr:competition:2559",
                "name": "Wimbledon Women Singles",
                "parent_id": "sr:competition:2553",
                "type": "singles",
                "gender": "women",
                "level": "grand_slam"
            },
            "season": {
                "id": "sr:season:95309",
                "name": "Wimbledon Women Singles 2023",
                "start_date": "2023-06-26",
                "end_date": "2023-07-16",
                "year": "2023",
         

In [8]:
def str_to_time(str):
    return datetime.fromisoformat(str)


def play_by_play(timeline):

    # avg point length??

    # for each event in the timeline
    game_time = None
    game_lengths = []
    points_per_game = []
    in_game = False
    suspended = False
    suspended_time = 0
    in_game = 0
    for time in timeline['timeline']:

        if time['type']!='point':
            curr_time = str_to_time(time['time'])

            if in_game!=0: # game ended, add points to list
                points_per_game.append(in_game+1) # +1 for the winning point
                in_game = 0

            if time['type']=='match_suspended':
                suspended=True
                suspended_time = curr_time

            elif time['type']=='match_resumed':
                suspended=False
                suspended_time = (curr_time-suspended_time).total_seconds()/60
            

            # Compute game length
            elif time['type'] in ['period_score','period_start'] and suspended ==False: # start of a set or game
                if game_time!=None:
                    point_length = (curr_time-game_time).total_seconds()/60
                    if point_length < 20:
                        game_lengths.append(point_length)
                    else:
                        point_length = (curr_time-game_time).total_seconds()/60-suspended_time
                        game_lengths.append(point_length)

                game_time = curr_time # start of next step time

            continue
        else:
            in_game+=1

    sum = 0
    set_lengths = []
    for point in game_lengths:
        if point==0:
            set_lengths.append(sum)
            sum=0
        else:
            sum+=point

    game_lengths = [x for x in game_lengths if x!=0]

    return [np.mean(set_lengths), np.mean(game_lengths), np.mean(points_per_game)]

In [13]:
def get_event_stats(timeline):

    event_id = timeline['sport_event']['id']
    stats = [pd.DataFrame(),pd.DataFrame()]
    context = timeline['sport_event']['sport_event_context']

    for i in range(len(stats)):
        comp_info = timeline['statistics']['totals']['competitors'][i]
        stats_i = pd.json_normalize(comp_info['statistics'])
        stats_i.insert(0, 'event_id', event_id)
        stats_i.insert(1, 'player_id', comp_info['id'])
        stats_i.insert(2, 'name', comp_info['name'])
        try:
            seed = timeline['sport_event']['competitors'][i]['seed']
        except:
            seed = None
        stats_i.insert(3, 'seed', seed)
        # MAKE SURE 1 AND 2 MATCH HOME AND AWAY
        
        stats[i] = stats_i

    merged = pd.merge(stats[0], stats[1], how='outer', left_on = 'event_id', right_on='event_id', suffixes=("_home", "_away"))

    # start date of competitions
    merged['date'] = context['season']['start_date']
    #timeline['sport_event']['start_time']

    # Name of competition
    merged['competition'] = context['competition']['name']

    # Men or womens match
    merged['men'] = (context['competition']['gender']=="men")

    # Which round?
    merged['round'] = context['round']['name']

    # How many sets
    merged['best_of'] = context['mode']['best_of']

    # Who won?
    merged['win_1'] = timeline['sport_event_status']['home_score']>timeline['sport_event_status']['away_score']

    # Get play by play
    [avg_set_length, 
     avg_game_length, 
     avg_points_per_game] = play_by_play(timeline)

    merged["avg_set_length"] = avg_set_length
    merged["avg_game_length"] = avg_game_length
    merged["avg_points_per_game"] = avg_points_per_game

    # Getting scores
    i = 1
    for set_ in timeline['sport_event_status']['period_scores']:
        merged['set'+str(i)+"_diff"] = set_['home_score']-set_['away_score']
        merged['set'+str(i)+'_games'] = set_['home_score']+set_['away_score']
        i+=1

    return merged

In [17]:
stats0 = get_event_stats(timeline_m)
stats1 = get_event_stats(timeline_w)
stats_df = pd.concat([stats0, stats1]).reset_index()
stats_df

Unnamed: 0,index,event_id,player_id_home,name_home,seed_home,aces_home,backhand_errors_home,backhand_unforced_errors_home,backhand_winners_home,breakpoints_won_home,...,set1_diff,set1_games,set2_diff,set2_games,set3_diff,set3_games,set4_diff,set4_games,set5_diff,set5_games
0,0,sr:sport_event:42023937,sr:competitor:87690,"Thompson, Jordan",,16,29,13,7,3,...,-4,8,-4,8,2.0,10.0,1.0,13.0,3.0,9.0
1,0,sr:sport_event:42023447,sr:competitor:41355,"Bogdan, Ana",,6,12,7,7,3,...,1,13,1,13,,,,,,


In [96]:
# list to one big dataframe of match statistics
stats = []
for match in gs_matches_m[0:5]:
    timeline = rq_sport_event_timeline(match, eda_api_key)
    stats.append(get_event_stats(timeline))

stats_df = pd.concat(stats).reset_index()
stats_df

Unnamed: 0,index,event_id,player_id_home,name_home,seed_home,aces_home,backhand_errors_home,backhand_unforced_errors_home,backhand_winners_home,breakpoints_won_home,...,set1_diff,set1_games,set2_diff,set2_games,set3_diff,set3_games,set4_diff,set4_games,set5_diff,set5_games
0,0,sr:sport_event:42023937,sr:competitor:87690,"Thompson, Jordan",,16,29,13,7,3,...,-4,8,-4,8,2,10,1.0,13.0,3.0,9.0
1,0,sr:sport_event:42023949,sr:competitor:359602,"Musetti, Lorenzo",14.0,5,13,11,10,5,...,3,9,5,7,2,12,,,,
2,0,sr:sport_event:42023961,sr:competitor:257721,"Baez, Sebastian",,3,18,24,13,2,...,-1,13,3,9,-3,9,-1.0,13.0,,
3,0,sr:sport_event:42023963,sr:competitor:658475,"van Assche, Luca",,6,21,15,4,2,...,1,13,-2,10,-4,8,-2.0,10.0,,
4,0,sr:sport_event:42023965,sr:competitor:106755,"Rublev, Andrey",7.0,6,13,11,4,5,...,3,9,2,12,2,10,,,,


In [87]:
stats_df.to_csv("data/stats_df.csv", index=False)

In [None]:
stats_df = pd.read_csv("data/stats_df.csv")

In [None]:
# get competitors' stats for an individual game
all_stats = pd.DataFrame()
for comp_stats in timeline['statistics']['totals']['competitors']:
    stats = pd.json_normalize(comp_stats['statistics'])
    stats.insert(0, 'player_id', comp_stats['id'])
    stats.insert(1, 'name', comp_stats['name'])

    print(stats)

Break points

In [None]:

# Break points
# Is it a break point?
# if is_break_point(int(time['home_score']), int(time['away_score']), time['server']):
#    if int(time['home_score'])>int(time['away_score']):
#       break_point_home+=1
#    else:
#       break_point_away+=1

# def is_break_point(home_score, away_score, server):

#     if home_score==away_score: # score is tied
#         return False
#     elif (home_score>away_score and server=='home') or (away_score>home_score and server=='away'): # server is winning
#         return False
#     elif not (home_score==40 or away_score==40): # not at the end of the game
#         return False
#     else:
#         return True

# # Compute set length
# if time['type'] in ['period_start','match_ended'] and suspended==False: # beginning/ends of sets
#     if "period_name" in time and time['period_name']=='suspended':
#         suspended = True
#         continue # Neglecting any part of the game that follows a suspension (I think this can be fixed!!!!! but keep for now)

#     curr_time = str_to_time(time['time'])
#     if set_time != None:
#         set_lengths.append( (curr_time-set_time).total_seconds() /60) # subtracting two set times to get set length in minutes
#     set_time = curr_time # start of next step time