# Data Analysis Practice on 2021 data

## Brainstorming of Questions
1. Are there more break points in clay court matches?
2. 


In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from typing import Union

# Show every column when a frame is displayed (the match table is wide).
pd.set_option('display.max_columns', None)

# Load the 2021 match file, parse the tournament date, and order the matches
# chronologically. reset_index() (without drop) keeps the pre-sort row
# position as an 'index' column.
df = (
    pd.read_csv('data/atp_matches_2021.csv')
    .astype({'tourney_date': 'string'})
    .assign(tourney_date=lambda frame: pd.to_datetime(frame['tourney_date']))
    .sort_values(by=['tourney_date', 'match_num'])
    .reset_index()
)

## Dealing with the Score

The score for each set can be in one of 6 formats:
1. 6-x
2. x-6
3. 7-5
4. 5-7
5. 7-6(x)
6. (x)6-7

- From these formats, `winner_games_won` and `loser_games_won` can be calculated. Conveniently, the match winner's games are always quoted first in every set (even when the match goes to three sets), so once each set is split on `-`, the first number is the winner's games and the second is the loser's.

- [Edit]: Inevitably, I encountered some alternative formats such as `4-6 6-3 [7-10]`, which occur when a first-to-10 tie-break decides the match instead of a third set (a format usually seen in doubles).

- Now that I have the number of games won by each player, I can combine it with the number of break points faced, the number of break points saved, and the total number of service games to find how many service games each player won and lost.

In [69]:
def _parse_set_games(set_score: str) -> tuple:
    """Parse one whitespace-separated token of a match score.

    Args:
        set_score (str): A single token such as '6-3', '7-6(5)', '6-7(5)',
            '(5)6-7', '[10-7]' or 'RET'.

    Returns:
        tuple: (winner_games, loser_games) for the token, or (0, 0) when the
            token is not a regular set.

    Raises:
        ValueError: If a token that contains digits cannot be read as 'a-b'.
    """
    # Retirement / walkover / default markers and '[..]' match tie-breaks are
    # not sets, so they contribute no games.
    if 'R' in set_score or 'W' in set_score or 'Def.' in set_score or '[' in set_score:
        return 0, 0
    # Any other purely textual marker carries no games either.
    if not any(ch.isdigit() for ch in set_score):
        return 0, 0

    # Drop the parenthesised tie-break points, wherever they sit, so that only
    # the 'a-b' games part is left. Reading the games from the text (instead of
    # assuming 7-6 whenever the token ends in ')') keeps a tie-break set that
    # the match winner lost, e.g. '6-7(5)', as 6 games to the winner and 7 to
    # the loser.
    open_idx = set_score.find('(')
    close_idx = set_score.find(')', open_idx + 1)
    games_only = set_score
    if open_idx != -1 and close_idx != -1:
        games_only = set_score[:open_idx] + set_score[close_idx + 1:]

    # The match winner's games are quoted first in every set.
    winner_games, _, loser_games = games_only.partition('-')
    return int(winner_games), int(loser_games)


def get_winner_games_won(score: str) -> int:
    """Takes in the score for the match and returns the number of games won by the winner

    Args:
        score (str): The score as a string, e.g. '6-4 7-6(5)'. A non-string
            value (such as a missing score) is treated as no games played.

    Returns:
        int: The winner's total games won across all regular sets

    Raises:
        ValueError: If a set token cannot be parsed as 'a-b'.
    """
    if not isinstance(score, str):
        return 0
    # split() without an argument ignores repeated / leading / trailing spaces.
    return sum(_parse_set_games(token)[0] for token in score.split())


def get_loser_games_won(score: str) -> int:
    """Takes in the score for the match and returns the number of games won by the loser

    Args:
        score (str): The score as a string, e.g. '6-4 7-6(5)'. A non-string
            value (such as a missing score) is treated as no games played.

    Returns:
        int: The loser's total games won across all regular sets

    Raises:
        ValueError: If a set token cannot be parsed as 'a-b'.
    """
    if not isinstance(score, str):
        return 0
    # split() without an argument ignores repeated / leading / trailing spaces.
    return sum(_parse_set_games(token)[1] for token in score.split())

In [70]:
# Total games won by each player, derived element-wise from the score string.
scores = df['score']
df['w_games'] = scores.map(get_winner_games_won)
df['l_games'] = scores.map(get_loser_games_won)

In [71]:
# Display the frame to spot-check the w_games / l_games columns added above
# (they appear as the last two columns).
df

Unnamed: 0,index,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,w_games,l_games
0,1158,2021-0499,Delray Beach,Hard,32,A,2021-01-04,271,106329,,,Thiago Monteiro,L,183.0,BRA,26.5,105064,,,Thomaz Bellucci,L,188.0,BRA,33.0,6-3 7-5,3,R32,100.0,2.0,1.0,67.0,38.0,28.0,16.0,10.0,8.0,9.0,3.0,3.0,69.0,42.0,25.0,12.0,11.0,2.0,6.0,84.0,820.0,281.0,185.0,13,8
1,1159,2021-0499,Delray Beach,Hard,32,A,2021-01-04,272,200624,,,Sebastian Korda,R,196.0,USA,20.5,126952,,,Soon Woo Kwon,R,180.0,KOR,23.0,6-4 6-4,3,R32,93.0,8.0,1.0,88.0,53.0,36.0,18.0,10.0,5.0,6.0,4.0,4.0,50.0,29.0,22.0,11.0,10.0,1.0,4.0,119.0,566.0,95.0,768.0,12,8
2,1160,2021-0499,Delray Beach,Hard,32,A,2021-01-04,273,126205,5.0,,Tommy Paul,R,185.0,USA,23.6,106227,,,Ji Sung Nam,R,183.0,KOR,27.3,6-1 6-4,3,R32,67.0,5.0,0.0,59.0,38.0,26.0,13.0,9.0,2.0,3.0,2.0,4.0,51.0,27.0,16.0,10.0,8.0,2.0,6.0,52.0,1080.0,268.0,205.0,12,5
3,1161,2021-0499,Delray Beach,Hard,32,A,2021-01-04,275,111815,,,Cameron Norrie,L,188.0,GBR,25.3,111574,,WC,Jc Aragone,R,178.0,USA,25.5,6-2 6-1,3,R32,67.0,3.0,2.0,48.0,31.0,26.0,10.0,8.0,3.0,3.0,2.0,2.0,47.0,28.0,13.0,7.0,7.0,4.0,8.0,74.0,877.0,299.0,165.0,12,3
4,1162,2021-0499,Delray Beach,Hard,32,A,2021-01-04,276,106216,,,Bjorn Fratangelo,R,183.0,USA,27.4,110536,,Q,Kevin King,U,190.0,USA,29.8,6-2 6-2,3,R32,69.0,0.0,0.0,51.0,25.0,22.0,15.0,8.0,2.0,2.0,2.0,2.0,49.0,32.0,17.0,7.0,8.0,2.0,6.0,274.0,195.0,304.0,154.0,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2728,2635,2021-M-DC-2021-FLS-M-CRO-SRB-01,Davis Cup Finals SF: CRO vs SRB,Hard,2,D,2021-12-03,2,104925,,,Novak Djokovic,R,188.0,SRB,34.5,105227,,,Marin Cilic,R,198.0,CRO,33.1,6-4 6-2,3,RR,99.0,4.0,1.0,60.0,40.0,30.0,12.0,9.0,8.0,8.0,6.0,3.0,66.0,37.0,23.0,13.0,9.0,5.0,8.0,1.0,11540.0,30.0,1710.0,12,6
2729,2642,2021-M-DC-2021-FLS-M-RTF-GER-01,Davis Cup Finals SF: RTF vs GER,Hard,2,D,2021-12-04,1,126094,,,Andrey Rublev,R,188.0,RUS,24.1,136440,,,Dominik Koepfer,L,180.0,GER,27.5,6-4 6-0,3,RR,48.0,9.0,2.0,43.0,32.0,27.0,6.0,8.0,0.0,0.0,2.0,2.0,43.0,21.0,12.0,10.0,8.0,0.0,4.0,5.0,5150.0,54.0,1101.0,12,4
2730,2643,2021-M-DC-2021-FLS-M-RTF-GER-01,Davis Cup Finals SF: RTF vs GER,Hard,2,D,2021-12-04,2,106421,,,Daniil Medvedev,R,198.0,RUS,25.7,105526,,,Jan Lennard Struff,R,193.0,GER,31.5,6-4 6-4,3,RR,66.0,7.0,0.0,50.0,35.0,29.0,12.0,10.0,1.0,1.0,5.0,4.0,54.0,34.0,29.0,7.0,10.0,3.0,5.0,2.0,8640.0,51.0,1134.0,12,8
2731,2640,2021-M-DC-2021-FLS-M-RTF-CRO-01,Davis Cup Finals F: RTF vs CRO,Hard,2,D,2021-12-05,1,126094,,,Andrey Rublev,R,188.0,RUS,24.1,127339,,,Borna Gojo,R,196.0,CRO,23.7,6-4 7-6(5),3,RR,92.0,8.0,2.0,57.0,39.0,36.0,12.0,11.0,0.0,0.0,15.0,2.0,83.0,52.0,33.0,20.0,11.0,7.0,8.0,5.0,5150.0,279.0,196.0,13,10
