In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Career Statistics

To prepare for match prediction, I will restrict the feature set to quantities that are either known before a match or can be calculated from data available before it. To increase the number of available features, I will calculate career statistics, statistics from the year prior to each match, and statistics from a recent window of matches.

In [120]:
# Load the match table that every later cell transforms.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by a trusted pipeline.
matches = pd.read_pickle(
    '../Resources/tennis_clean/atp_top_100_matches.pkl'
)

In [121]:
# Raw per-match stat columns. These are the columns that get aggregated per
# player further down (expanding sums and exponentially weighted means).
COLUMNS = [
    'aces',
    'double_faults',
    'points',
    'service_points',
    'return_points',
    'first_serves',
    'second_serves',
    'first_serves_won',
    'second_serves_won',
    'break_points_faced',
    'break_points_saved',
    'opponent_break_points_faced',
    'break_points_won',
    'first_serve_return_points_won',
    'second_serve_return_points_won',
    'opponent_service_points',
    'opponent_first_serves',
    'service_points_won',
    'return_points_won',
    'points_won',
]

In [122]:
# Keep the match context columns plus the raw stat columns, then turn the
# existing index levels into ordinary columns.
matches = matches.loc[
    :,
    [
        'tournament_date', 'opponent_id', 'surface', 'result_value',
        'player_rank', 'player_ranking_points', 'player_age',
        'opponent_rank', 'opponent_ranking_points', 'opponent_age',
    ] + COLUMNS,
].reset_index()

In [123]:
# Column names for the per-player aggregates of COLUMNS: career running totals
# and the two exponentially weighted means. Each list is COLUMNS with a prefix.
# They are generated instead of hand-typed because the aggregate frames are
# renamed positionally from these lists, so the order must match COLUMNS
# exactly.
CAREER_COLUMNS = ['career_' + column for column in COLUMNS]

EWMA_10_COLUMNS = ['ewma_10_' + column for column in COLUMNS]

EWMA_50_COLUMNS = ['ewma_50_' + column for column in COLUMNS]

In [124]:
# Remove duplicate rows, first by match key and then by
# (player, tournament, opponent), and index/sort by player, tournament, match.
matches = (
    matches
    .drop_duplicates(subset=['player_id', 'tournament_id', 'match_id'])
    .drop_duplicates(subset=['player_id', 'tournament_id', 'opponent_id'])
    .set_index(['player_id', 'tournament_id', 'match_id'])
    .sort_index()
)

In [125]:
# Running per-player totals of every raw stat, lagged by one match so that a
# row only carries information from that player's earlier matches.
#
# The lag must be applied within each player: a plain `.shift()` on the
# concatenated groupby result would give every player's first match the final
# totals of the previous player in the index. With the per-player shift the
# first match of each player is all-NaN (no history) and is removed by
# `dropna()`.
expanding_sum = (matches.reset_index('player_id')
                 .groupby('player_id')[COLUMNS]
                 .expanding().sum()
                 .groupby(level='player_id').shift()
                 .dropna())

In [126]:
# Exponentially weighted mean (halflife=10 rows, i.e. matches) of every raw
# stat per player, lagged by one match so a row only reflects earlier matches.
#
# The lag is applied within each player: a plain `.shift()` on the
# concatenated groupby result would hand each player's first match the last
# value of the previous player in the index. Each player's first match has no
# history and is removed by `dropna()`.
short_term_weighted_moving_average = (matches.reset_index('player_id')
                 .groupby('player_id')[COLUMNS]
                 .ewm(halflife=10).mean()
                 .groupby(level='player_id').shift()
                 .dropna())

In [127]:
# Exponentially weighted mean (halflife=50 rows, i.e. matches) of every raw
# stat per player, lagged by one match so a row only reflects earlier matches.
#
# The lag is applied within each player: a plain `.shift()` on the
# concatenated groupby result would hand each player's first match the last
# value of the previous player in the index. Each player's first match has no
# history and is removed by `dropna()`.
long_term_weighted_moving_average = (matches.reset_index('player_id')
                 .groupby('player_id')[COLUMNS]
                 .ewm(halflife=50).mean()
                 .groupby(level='player_id').shift()
                 .dropna())

In [128]:
# Give each aggregate frame its prefixed column names. The relabelling is
# positional, so each name list must be in the same order as the frame's
# current columns.
expanding_sum = expanding_sum.set_axis(CAREER_COLUMNS, axis='columns')
short_term_weighted_moving_average = short_term_weighted_moving_average.set_axis(
    EWMA_10_COLUMNS, axis='columns')
long_term_weighted_moving_average = long_term_weighted_moving_average.set_axis(
    EWMA_50_COLUMNS, axis='columns')

In [129]:
# Attach the three aggregate frames to the matches on the shared index.
# Rows left without a complete set of values after each join are dropped.
matches = (
    matches
    .join(expanding_sum, how='left').dropna()
    .join(short_term_weighted_moving_average, how='left').dropna()
    .join(long_term_weighted_moving_average, how='left').dropna()
)

In [130]:
# Rate features built from the career totals. Despite the "_percentage"
# suffix the values are plain ratios (not multiplied by 100). Columns are
# appended in the order listed here.
matches = matches.assign(
    career_first_serve_percentage=(
        matches['career_first_serves'].div(matches['career_service_points'])),
    career_first_serves_won_percentage=(
        matches['career_first_serves_won'].div(matches['career_first_serves'])),
    career_second_serves_won_percentage=(
        matches['career_second_serves_won'].div(matches['career_second_serves'])),
    career_service_points_won_percentage=(
        matches['career_service_points_won'].div(matches['career_service_points'])),
    career_break_points_saved_percentage=(
        matches['career_break_points_saved'].div(matches['career_break_points_faced'])),
    career_break_points_won_percentage=(
        matches['career_break_points_won'].div(matches['career_opponent_break_points_faced'])),
    career_first_serve_return_points_won_percentage=(
        matches['career_first_serve_return_points_won'].div(matches['career_opponent_first_serves'])),
    # Denominator: opponent service points that were not first serves.
    career_second_serve_return_points_won_percentage=(
        matches['career_second_serve_return_points_won'].div(
            matches['career_opponent_service_points'] - matches['career_opponent_first_serves'])),
    career_return_points_won_percentage=(
        matches['career_return_points_won'].div(matches['career_return_points'])),
    career_points_won_percentage=(
        matches['career_points_won'].div(matches['career_points'])),
    career_ace_rate=(
        matches['career_aces'].div(matches['career_service_points'])),
    career_double_fault_rate=(
        matches['career_double_faults'].div(matches['career_service_points'])),
)

In [131]:
# Rate features built from the halflife-10 weighted means. Despite the
# "_percentage" suffix the values are plain ratios (not multiplied by 100).
# Columns are appended in the order listed here.
matches = matches.assign(
    ewma_10_first_serve_percentage=(
        matches['ewma_10_first_serves'].div(matches['ewma_10_service_points'])),
    ewma_10_first_serves_won_percentage=(
        matches['ewma_10_first_serves_won'].div(matches['ewma_10_first_serves'])),
    ewma_10_second_serves_won_percentage=(
        matches['ewma_10_second_serves_won'].div(matches['ewma_10_second_serves'])),
    ewma_10_service_points_won_percentage=(
        matches['ewma_10_service_points_won'].div(matches['ewma_10_service_points'])),
    ewma_10_break_points_saved_percentage=(
        matches['ewma_10_break_points_saved'].div(matches['ewma_10_break_points_faced'])),
    ewma_10_break_points_won_percentage=(
        matches['ewma_10_break_points_won'].div(matches['ewma_10_opponent_break_points_faced'])),
    ewma_10_first_serve_return_points_won_percentage=(
        matches['ewma_10_first_serve_return_points_won'].div(matches['ewma_10_opponent_first_serves'])),
    # Denominator: opponent service points that were not first serves.
    ewma_10_second_serve_return_points_won_percentage=(
        matches['ewma_10_second_serve_return_points_won'].div(
            matches['ewma_10_opponent_service_points'] - matches['ewma_10_opponent_first_serves'])),
    ewma_10_return_points_won_percentage=(
        matches['ewma_10_return_points_won'].div(matches['ewma_10_return_points'])),
    ewma_10_points_won_percentage=(
        matches['ewma_10_points_won'].div(matches['ewma_10_points'])),
    ewma_10_ace_rate=(
        matches['ewma_10_aces'].div(matches['ewma_10_service_points'])),
    ewma_10_double_fault_rate=(
        matches['ewma_10_double_faults'].div(matches['ewma_10_service_points'])),
)

In [132]:
# Rate features built from the halflife-50 weighted means. Despite the
# "_percentage" suffix the values are plain ratios (not multiplied by 100).
# Columns are appended in the order listed here.
matches = matches.assign(
    ewma_50_first_serve_percentage=(
        matches['ewma_50_first_serves'].div(matches['ewma_50_service_points'])),
    ewma_50_first_serves_won_percentage=(
        matches['ewma_50_first_serves_won'].div(matches['ewma_50_first_serves'])),
    ewma_50_second_serves_won_percentage=(
        matches['ewma_50_second_serves_won'].div(matches['ewma_50_second_serves'])),
    ewma_50_service_points_won_percentage=(
        matches['ewma_50_service_points_won'].div(matches['ewma_50_service_points'])),
    ewma_50_break_points_saved_percentage=(
        matches['ewma_50_break_points_saved'].div(matches['ewma_50_break_points_faced'])),
    ewma_50_break_points_won_percentage=(
        matches['ewma_50_break_points_won'].div(matches['ewma_50_opponent_break_points_faced'])),
    ewma_50_first_serve_return_points_won_percentage=(
        matches['ewma_50_first_serve_return_points_won'].div(matches['ewma_50_opponent_first_serves'])),
    # Denominator: opponent service points that were not first serves.
    ewma_50_second_serve_return_points_won_percentage=(
        matches['ewma_50_second_serve_return_points_won'].div(
            matches['ewma_50_opponent_service_points'] - matches['ewma_50_opponent_first_serves'])),
    ewma_50_return_points_won_percentage=(
        matches['ewma_50_return_points_won'].div(matches['ewma_50_return_points'])),
    ewma_50_points_won_percentage=(
        matches['ewma_50_points_won'].div(matches['ewma_50_points'])),
    ewma_50_ace_rate=(
        matches['ewma_50_aces'].div(matches['ewma_50_service_points'])),
    ewma_50_double_fault_rate=(
        matches['ewma_50_double_faults'].div(matches['ewma_50_service_points'])),
)

In [133]:
# From here on the three name lists refer to the derived rate columns rather
# than the raw aggregates they named earlier in the notebook.
# NOTE(review): because the names are rebound, re-running the earlier
# positional-rename cell after this one would use the wrong lists.
_RATE_FEATURES = (
    'first_serve_percentage',
    'first_serves_won_percentage',
    'second_serves_won_percentage',
    'service_points_won_percentage',
    'break_points_saved_percentage',
    'break_points_won_percentage',
    'first_serve_return_points_won_percentage',
    'second_serve_return_points_won_percentage',
    'return_points_won_percentage',
    'points_won_percentage',
    'ace_rate',
    'double_fault_rate',
)

CAREER_COLUMNS = ['career_' + feature for feature in _RATE_FEATURES]

EWMA_10_COLUMNS = ['ewma_10_' + feature for feature in _RATE_FEATURES]

EWMA_50_COLUMNS = ['ewma_50_' + feature for feature in _RATE_FEATURES]

In [134]:
# Re-key the rate features by (opponent_id, tournament_date, match_id) once,
# then take one frame per feature family from that view.
_keyed_by_opponent = matches.reset_index().set_index(['opponent_id', 'tournament_date', 'match_id'])
opponent_expanding_sum = _keyed_by_opponent[CAREER_COLUMNS]
opponent_short_term_weighted_moving_average = _keyed_by_opponent[EWMA_10_COLUMNS]
opponent_long_term_weighted_moving_average = _keyed_by_opponent[EWMA_50_COLUMNS]
del _keyed_by_opponent  # only needed to build the three frames above

In [135]:
# Order each opponent feature frame by its index.
opponent_expanding_sum = opponent_expanding_sum.sort_index()
opponent_short_term_weighted_moving_average = opponent_short_term_weighted_moving_average.sort_index()
opponent_long_term_weighted_moving_average = opponent_long_term_weighted_moving_average.sort_index()

In [136]:
# Relabel the index levels so the first level is called 'player_id'; the
# frames can then be matched against an index of
# (player_id, tournament_date, match_id).
opponent_expanding_sum = opponent_expanding_sum.rename_axis(
    ['player_id', 'tournament_date', 'match_id'])
opponent_short_term_weighted_moving_average = opponent_short_term_weighted_moving_average.rename_axis(
    ['player_id', 'tournament_date', 'match_id'])
opponent_long_term_weighted_moving_average = opponent_long_term_weighted_moving_average.rename_axis(
    ['player_id', 'tournament_date', 'match_id'])

In [137]:
# Switch the index of `matches` to (player_id, tournament_date, match_id),
# dropping rows that repeat that key.
matches = (
    matches
    .reset_index()
    .drop_duplicates(subset=['player_id', 'tournament_date', 'match_id'])
    .set_index(['player_id', 'tournament_date', 'match_id'])
)

In [138]:
# Attach the opponent-side feature frames on the shared index. Column names
# that clash keep the original name on the left and get an '_opponent' suffix
# on the right. Incomplete rows are dropped after every merge.
matches = (
    matches
    .merge(opponent_expanding_sum, how='left',
           left_index=True, right_index=True, suffixes=(None, '_opponent'))
    .dropna()
    .merge(opponent_short_term_weighted_moving_average, how='left',
           left_index=True, right_index=True, suffixes=(None, '_opponent'))
    .dropna()
    .merge(opponent_long_term_weighted_moving_average, how='left',
           left_index=True, right_index=True, suffixes=(None, '_opponent'))
    .dropna()
)

In [139]:
# Ranking gaps between the two sides. Note the opposite operand order:
# points are player - opponent, rank is opponent - player
# (presumably so that a positive value favours the player in both; confirm).
matches['ranking_points_diff'] = matches['player_ranking_points'].sub(matches['opponent_ranking_points'])
matches['rank_diff'] = matches['opponent_rank'].sub(matches['player_rank'])

In [140]:
# Persist the full feature table for later use.
matches.to_pickle(
    '../Resources/tennis_clean/atp_matches_full_feature.pkl'
)

In [111]:
# Copy of the matches keyed and ordered by (player, opponent, date), used to
# build the head-to-head record.
head_to_head = (
    matches
    .reset_index()
    .set_index(['player_id', 'opponent_id', 'tournament_date'])
    .sort_index()
)

In [112]:
# Recode result_value 0 as -1 and leave every other value unchanged, so a
# running sum of the column moves down as well as up.
head_to_head['result_value'] = head_to_head['result_value'].mask(
    head_to_head['result_value'] == 0, -1)

In [113]:
# Running sum of result_value per (player, opponent) pair, lagged by one
# meeting within the pair so each row shows the balance before that match.
# A pair's first meeting has no history and is filled with 0. The date is
# moved out of the index into a column.
head_to_head = (
    head_to_head
    .groupby(['player_id', 'opponent_id'])['result_value'].cumsum()
    .groupby(['player_id', 'opponent_id']).shift()
    .reset_index('tournament_date')
    .fillna(0)
)

In [116]:
# Number of distinct entries in `top_100`.
# NOTE(review): `top_100` is not defined anywhere in the visible notebook and
# this cell's recorded output is a NameError. It must be defined before this
# cell (presumably a collection of top-100 player ids; confirm) for the
# notebook to survive Restart & Run All.
len(set(top_100))

NameError: name 'top_100' is not defined

In [None]:
# Build `top_100_matches`: the rows of `matches` whose player_id is in
# `top_100`, indexed and sorted by (player_id, tournament_date, match_id).
# NOTE(review): `top_100` is not defined in the visible notebook.
matches.reset_index(inplace=True)
# Take an explicit copy: the filtered frame is modified and assigned into by
# later cells, which is ambiguous (SettingWithCopyWarning) on a slice.
top_100_matches = matches[matches.player_id.isin(top_100)].copy()
# `matches` itself is left indexed by player_id only.
matches.set_index('player_id', inplace=True)
# NOTE(review): this reset_index keeps the old positional index as an 'index'
# column, which therefore ends up in the frame; confirm whether that is
# wanted before switching to drop=True.
top_100_matches = (
    top_100_matches
    .reset_index()
    .set_index(['player_id', 'tournament_date', 'match_id'])
    .sort_index()
)

In [None]:
# Put both frames on the same (player_id, opponent_id, tournament_date) index
# so the head-to-head values can be aligned with the matches.
top_100_matches = (
    top_100_matches
    .reset_index()
    .set_index(['player_id', 'opponent_id', 'tournament_date'])
)
head_to_head = (
    head_to_head
    .reset_index()
    .set_index(['player_id', 'opponent_id', 'tournament_date'])
)

In [None]:
# Add the head-to-head balance as column 'h2h', aligned on the shared index.
# At this point `head_to_head` holds a single column, 'result_value'.
top_100_matches['h2h'] = head_to_head['result_value']

In [None]:
# Go back to the (player_id, tournament_date, match_id) index.
top_100_matches = (
    top_100_matches
    .reset_index()
    .set_index(['player_id', 'tournament_date', 'match_id'])
)

In [None]:
# Keep only the rows that have an opponent_id.
top_100_matches = top_100_matches[top_100_matches['opponent_id'].notna()]

In [None]:
# Birthdate of the player (first index level) minus birthdate of the
# opponent. `.values` strips the labels, so the subtraction is positional,
# row by row.
# NOTE(review): `players` is not loaded anywhere in the visible notebook.
top_100_matches['age_diff'] = (
    players.loc[top_100_matches.index.get_level_values(0), 'birthdate'].values
    - players.loc[top_100_matches['opponent_id'], 'birthdate'].values
)

In [None]:
# Express the birthdate difference as a whole number of days.
top_100_matches['age_diff'] = top_100_matches['age_diff'].dt.days

In [None]:
# Write the top-100 feature table (index included) to CSV.
top_100_matches.to_csv(
    '../Resources/tennis_clean/atp_top_100_matches.csv'
)

In [None]:
# Ad-hoc lookup of a single player by id.
# NOTE(review): `players` is never loaded in the visible notebook (a later
# cell's recorded output is a NameError for it); load it before this cell.
players.query("player_id == 105379")

In [None]:
# Quick look at the distribution of the career first-serve ratio.
top_100_matches['career_first_serve_percentage'].hist()

In [115]:
# Ad-hoc lookup of players by last name.
# NOTE(review): `players` is not defined in the visible notebook; this cell's
# recorded output is a NameError.
players.query('name_last == "Federer"')

NameError: name 'players' is not defined