In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import calendar
from IPython.display import display, Markdown

In [2]:
# show up to 100 rows when a DataFrame is displayed
pd.options.display.max_rows = 100

In [3]:
# load the match data: 'Date' is parsed to datetime and every text column
# is read with the pandas 'string' dtype
df_raw = pd.read_csv(
    "../data/atp_tennis.csv",
    parse_dates=['Date'],
    dtype={
        column: 'string'
        for column in ('Tournament', 'Series', 'Court', 'Surface', 'Round',
                       'Player_1', 'Player_2', 'Winner', 'Score')
    },
)

In [4]:
# summarize column names, non-null counts and dtypes of the loaded frame
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60729 entries, 0 to 60728
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Tournament  60729 non-null  string        
 1   Date        60729 non-null  datetime64[ns]
 2   Series      60729 non-null  string        
 3   Court       60729 non-null  string        
 4   Surface     60729 non-null  string        
 5   Round       60729 non-null  string        
 6   Best of     60729 non-null  int64         
 7   Player_1    60729 non-null  string        
 8   Player_2    60729 non-null  string        
 9   Winner      60729 non-null  string        
 10  Rank_1      60729 non-null  int64         
 11  Rank_2      60729 non-null  int64         
 12  Pts_1       60729 non-null  int64         
 13  Pts_2       60729 non-null  int64         
 14  Odd_1       60729 non-null  float64       
 15  Odd_2       60729 non-null  float64       
 16  Score       60729 non-

### Data Preparation

In [5]:
# break the match date into separate year / month-number / day columns
df_raw['Year'], df_raw['Month_Number'], df_raw['Day'] = (
    df_raw['Date'].dt.year,
    df_raw['Date'].dt.month,
    df_raw['Date'].dt.day,
)

In [6]:
# look up the abbreviated month name for each month number
df_raw['Month'] = df_raw['Month_Number'].map(
    lambda month_number: calendar.month_abbr[month_number]
)

In [7]:
# Region lookup for every tournament we know about.
# The target column is named 'Country', but several of the labels below are
# broader regions (e.g. 'Europe', 'MENA', 'South America').
REGION_TOURNAMENTS = {
    'Australia': [
        'Adelaide International 1',
        'Adelaide International 2',
        'Melbourne Summer Set',
        'Sydney Tennis Classic',
        'Australian Open',
        # NOTE(review): ASB Classic is held in Auckland, New Zealand; it is
        # grouped with Australia here - confirm that this is intended
        'ASB Classic',
        'Murray River Open',
    ],
    'South America': [
        'Argentina Open',
        'Cordoba Open',
        'Chile Open',
        'Rio Open',
    ],
    'US': [
        'Atlanta Open',
        'Dallas Open',
        'Delray Beach Open',
        'Hall of Fame Championships',
        'San Diego Open',
        "U.S. Men's Clay Court Championships",
        'Winston-Salem Open at Wake Forest University',
        'Citi Open',
        'US Open',
        'BNP Paribas Open',
        'Miami Open',
        'Western & Southern Financial Group Masters',
        'BB&T Atlanta Open',
    ],
    'Europe': [
        'BMW Open',
        'Croatia Open',
        'Eastbourne International',
        'European Open',
        'Firenze Open',
        'Generali Open',
        'Geneva Open',
        'Gijon Open',
        'Lyon Open',
        'Mallorca Championships',
        'Mercedes Cup',
        'Millennium Estoril Open',
        'Napoli Cup',
        'Nordea Open',
        'Open 13',
        'Open Sud de France',
        'Open de Moselle',
        'Rosmalen Grass Court Championships',
        'Serbia Open',
        'Sofia Open',
        'Stockholm Open',
        'Suisse Open Gstaad',
        'ABN AMRO World Tennis Tournament',
        'Barcelona Open',
        'Halle Open',
        "Queen's Club Championships",
        'Swiss Indoors',
        'Vienna Open',
        'French Open',
        'Wimbledon',
        'BNP Paribas Masters',
        "Internazionali BNL d'Italia",
        'Monte Carlo Masters',
        'Mutua Madrid Open',
        'Masters Cup',
        'Srpska Open',
        'bett1HULKS Championship',
        'Viking International',
        'AnyTech365 Andalucia Open',
        'Belgrade Open',
        'Kremlin Cup',
        'St. Petersburg Open',
        'Emilia-Romagna Open',
        'Sardegna Open',
        'Stuttgart Open',
    ],
    'MENA': [
        'Grand Prix Hassan II',
        'Qatar Exxon Mobil Open',
        'Tel Aviv Open',
        'Dubai Tennis Championships',
        'Antalya Open',
    ],
    'Asia': [
        'Korea Open',
        'Maharashtra Open',
        'Rakuten Japan Open Tennis Championships',
        'Singapore Open',
    ],
    'Mexico': [
        'Los Cabos Open',
        'Abierto Mexicano',
    ],
    'Kazakhstan': [
        'Astana Open',
    ],
    'Canada': [
        'Canadian Open',
    ],
}

# Invert to tournament -> region. Regions are visited in declaration order, so
# a tournament listed under two regions would end up with the later one.
TOURNAMENT_REGION = {
    tournament: region
    for region, tournaments in REGION_TOURNAMENTS.items()
    for tournament in tournaments
}

# Build the column in a single step instead of creating a float NaN column and
# then writing strings into it, which recent pandas versions reject as an
# incompatible-dtype assignment. Tournaments missing from the lookup stay NaN.
df_raw['Country'] = df_raw['Tournament'].map(TOURNAMENT_REGION)

In [8]:
# function to assign colors to regions/countries
def set_color(x):
    """Return the color name assigned to a region/country label.

    Parameters
    ----------
    x : str
        Region label as stored in the 'Country' column (e.g. 'Europe', 'US').

    Returns
    -------
    str or None
        A CSS color name, or None when the label has no assigned color.
    """
    region_colors = {
        'Australia': "white",
        'South America': "blue",
        # 'Mexico' used to appear twice in the if/elif chain; only the first
        # branch ("brown") was ever reachable, so that is the value kept here
        'Mexico': "brown",
        'Kazakhstan': "purple",
        'MENA': "yellow",
        'US': "red",
        'Europe': "orange",
        'Asia': "green",
        # "light blue" (with a space) is not a valid CSS color name
        'Canada': "lightblue",
    }
    try:
        return region_colors.get(x)
    except TypeError:
        # unhashable input (e.g. a list) cannot be a known label
        return None

### Data Visualization

In [9]:
# keep the matches in which 'Rune H.' is listed as either player,
# restricted to the calendar / tournament columns
df_player = df_raw.loc[
    df_raw['Player_1'].eq('Rune H.') | df_raw['Player_2'].eq('Rune H.'),
    ['Year', 'Month', 'Day', 'Series', 'Tournament', 'Country'],
]

In [10]:
# preview the player's 2022 rows
df_player.loc[df_player['Year'].eq(2022)]

Unnamed: 0,Year,Month,Day,Series,Tournament,Country
56414,2022,Jan,4,ATP250,Adelaide International 1,Australia
56514,2022,Jan,17,Grand Slam,Australian Open,Australia
56642,2022,Feb,1,ATP250,Cordoba Open,South America
56725,2022,Feb,8,ATP250,Argentina Open,South America
56859,2022,Feb,15,ATP250,Open 13,Europe
56867,2022,Feb,16,ATP250,Open 13,Europe
56978,2022,Feb,22,ATP250,Chile Open,South America
57019,2022,Mar,12,Masters 1000,BNP Paribas Open,US
57052,2022,Mar,13,Masters 1000,BNP Paribas Open,US
57255,2022,Apr,12,Masters 1000,Monte Carlo Masters,Europe


In [11]:
# sanity check: list 2021-2023 rows that still have no country assigned
# (an empty result means every tournament was mapped)
df_player.loc[
    df_player['Country'].isna() & df_player['Year'].isin([2021, 2022, 2023])
]

Unnamed: 0,Year,Month,Day,Series,Tournament,Country


In [33]:
def player_stats(player1, years=(2022,), subtitle=None):
    """Chart, month by month, which tournament tiers a player entered.

    For each requested year the player's rows in ``df_raw`` are reduced to one
    row per tournament (the earliest match date) and drawn as a line chart with
    the month on the X axis and the tournament series on the Y axis.

    Parameters
    ----------
    player1 : str
        Player name as stored in the ``Player_1`` / ``Player_2`` columns,
        e.g. ``'Rune H.'``.
    years : iterable of int, optional
        Years to chart; one figure is shown per year. Defaults to ``(2022,)``.
    subtitle : str, optional
        Text of the second annotation line. When omitted, the hand-written
        finding for the (player, year) pair is used if one exists, otherwise a
        neutral description is generated. Previously the Holger Rune 2022
        finding was shown for every player.

    Returns
    -------
    None
        Each figure is rendered with ``fig.show()``.
    """
    # hand-written findings; each is only true for the exact player and year
    # it was derived from
    curated_subtitles = {
        ('Rune H.', 2022): ('In 2022 Holger Rune switched 6 times between the most important '
                            'Grand Slam tournaments and the lowest tier ATP 250'),
    }

    # rows where the player appears on either side of the match
    is_player_match = (df_raw['Player_1'] == player1) | (df_raw['Player_2'] == player1)
    df_player_1 = df_raw.loc[
        is_player_match,
        ['Year', 'Month', 'Day', 'Series', 'Tournament', 'Country', 'Month_Number'],
    ].drop_duplicates()

    for year in years:
        # one row per tournament: sort chronologically, then keep the first
        # (earliest) row of every tournament
        df_player1_year = (
            df_player_1[df_player_1['Year'] == year]
            .drop(columns=['Year'])
            .sort_values(by=['Month_Number', 'Day', 'Series'])
            .drop_duplicates(subset=['Tournament'], keep='first')
        )

        # nothing to draw: skip rather than show an empty chart under a headline
        if df_player1_year.empty:
            print('No matches found for {} in {}'.format(player1, year))
            continue

        if subtitle is not None:
            subtitle_text = subtitle
        else:
            subtitle_text = curated_subtitles.get(
                (player1, year),
                'Tournaments entered by {} in {}, by series tier'.format(player1, year),
            )

        # create a line graph
        fig_atp_attendance = px.line(
            df_player1_year,
            x='Month',
            y='Series',
            markers=True,
            # fix the axis order: tiers from highest to lowest, months in calendar order
            category_orders={
                'Series': ['Grand Slam', 'Masters 1000', 'ATP500', 'ATP250'],
                'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
            },
        )

        fig_atp_attendance.update_layout(
            font_family="Outfit",
            font_size=13,
            plot_bgcolor='#f8f9fa',
            xaxis_title=None,                 # remove X title
            yaxis_title=None,                 # remove Y title
            title_font_color="#2f3061",
            margin=dict(t=100),               # top margin leaves room for the annotations
        )

        # style the player's line
        fig_atp_attendance.update_traces(
            patch={"line": {"width": 3, "color": "#ea3788"}}
        )

        fig_atp_attendance.update_xaxes(
            ticks="outside",                  # place ticks outside X-axis
            ticklen=5,                        # ticks length
            tickcolor='#adb5bd',              # ticks color
            linecolor='#adb5bd',              # axis line color
            color="#2f3061",                  # axis label color
            showgrid=False,                   # hide vertical background grid
        )

        fig_atp_attendance.update_yaxes(
            shift=-10,                        # shift ticks left by 10 pixels
            anchor="free",                    # allows axis repositioning
            gridcolor='#e9ecef',              # horizontal background grid
            color="#2f3061",                  # axis label color
        )

        # headline, placed above the plot area in paper coordinates
        fig_atp_attendance.add_annotation(
            text='Professional tennis players participate in tournaments following a zigzag pattern',
            align='left',
            showarrow=False,
            xref='paper',
            yref='paper',
            font=dict(size=18, color="#2f3061"),
            x=0,
            y=1.2,
        )

        # subtitle, specific to the charted player and year
        fig_atp_attendance.add_annotation(
            text=subtitle_text,
            align='left',
            showarrow=False,
            xref='paper',
            yref='paper',
            font=dict(color="#2f3061"),
            x=0,
            y=1.1,
        )

        # output the graph
        fig_atp_attendance.show()

In [34]:
# draw the tournament-attendance chart for Holger Rune ('Rune H.' in the data)
player_stats('Rune H.')