In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import calendar
from IPython.display import display, Markdown

In [2]:
# show up to 100 rows whenever a DataFrame is rendered
pd.set_option("display.max_rows", 100)

In [3]:
# load the match data: Date is parsed to datetime and every text column
# is read with pandas' dedicated 'string' dtype
text_columns = ['Tournament', 'Series', 'Court', 'Surface', 'Round',
                'Player_1', 'Player_2', 'Winner', 'Score']
df_raw = pd.read_csv(
    "../data/atp_tennis.csv",
    parse_dates=['Date'],
    dtype=dict.fromkeys(text_columns, 'string'),
)

In [4]:
# print the column names, non-null counts and dtypes of the loaded frame
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60729 entries, 0 to 60728
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Tournament  60729 non-null  string        
 1   Date        60729 non-null  datetime64[ns]
 2   Series      60729 non-null  string        
 3   Court       60729 non-null  string        
 4   Surface     60729 non-null  string        
 5   Round       60729 non-null  string        
 6   Best of     60729 non-null  int64         
 7   Player_1    60729 non-null  string        
 8   Player_2    60729 non-null  string        
 9   Winner      60729 non-null  string        
 10  Rank_1      60729 non-null  int64         
 11  Rank_2      60729 non-null  int64         
 12  Pts_1       60729 non-null  int64         
 13  Pts_2       60729 non-null  int64         
 14  Odd_1       60729 non-null  float64       
 15  Odd_2       60729 non-null  float64       
 16  Score       60729 non-

### Data Preparation

In [5]:
# split the match date into year, month number and day-of-month columns
match_date = df_raw['Date'].dt
df_raw['Year'] = match_date.year
df_raw['Month_Number'] = match_date.month
df_raw['Day'] = match_date.day

In [6]:
# look up the abbreviated month name (calendar.month_abbr) for each month number
df_raw['Month'] = [calendar.month_abbr[month_no] for month_no in df_raw['Month_Number']]

In [7]:
# Assign every match a geographic Zone based on its tournament.
#
# The zones are kept in a single mapping (zone -> tournaments) instead of one
# .loc[...isin(...)] statement per region. This avoids first creating a float
# NaN column and then writing strings into it, which newer pandas versions
# warn about as an incompatible-dtype assignment.
ZONE_TOURNAMENTS = {
    'Australia': [
        'Adelaide International 1',
        'Adelaide International 2',
        'Melbourne Summer Set',
        'Sydney Tennis Classic',
        'Australian Open',
        'ASB Classic',
        'Murray River Open',
    ],
    'South America': [
        'Argentina Open',
        'Cordoba Open',
        'Chile Open',
        'Rio Open',
    ],
    'North America': [
        # US
        'Atlanta Open',
        'Dallas Open',
        'Delray Beach Open',
        'Hall of Fame Championships',
        'San Diego Open',
        'U.S. Men\'s Clay Court Championships',
        'Winston-Salem Open at Wake Forest University',
        'Citi Open',
        'US Open',
        'BNP Paribas Open',
        'Miami Open',
        'Western & Southern Financial Group Masters',
        'BB&T Atlanta Open',
        # Mexico
        'Los Cabos Open',
        'Abierto Mexicano',
        # Canada
        'Canadian Open',
    ],
    'Europe': [
        'BMW Open',
        'Croatia Open',
        'Eastbourne International',
        'European Open',
        'Firenze Open',
        'Generali Open',
        'Geneva Open',
        'Gijon Open',
        'Lyon Open',
        'Mallorca Championships',
        'Mercedes Cup',
        'Millennium Estoril Open',
        'Napoli Cup',
        'Nordea Open',
        'Open 13',
        'Open Sud de France',
        'Open de Moselle',
        'Rosmalen Grass Court Championships',
        'Serbia Open',
        'Sofia Open',
        'Stockholm Open',
        'Suisse Open Gstaad',
        'ABN AMRO World Tennis Tournament',
        'Barcelona Open',
        'Halle Open',
        'Queen\'s Club Championships',
        'Swiss Indoors',
        'Vienna Open',
        'French Open',
        'Wimbledon',
        'BNP Paribas Masters',
        'Internazionali BNL d\'Italia',
        'Monte Carlo Masters',
        'Mutua Madrid Open',
        'Masters Cup',
        'Srpska Open',
        'bett1HULKS Championship',
        'Viking International',
        'AnyTech365 Andalucia Open',
        'Belgrade Open',
        'Kremlin Cup',
        'St. Petersburg Open',
        'Emilia-Romagna Open',
        'Sardegna Open',
        'Stuttgart Open',
    ],
    # MENA
    'Middle East': [
        'Grand Prix Hassan II',
        'Qatar Exxon Mobil Open',
        'Tel Aviv Open',
        'Dubai Tennis Championships',
        'Antalya Open',
    ],
    'Asia': [
        'Korea Open',
        'Maharashtra Open',
        'Rakuten Japan Open Tennis Championships',
        'Singapore Open',
        # Kazakhstan
        'Astana Open',
    ],
}

# invert to tournament -> zone for a single vectorized lookup
TOURNAMENT_ZONE = {
    tournament: zone
    for zone, tournaments in ZONE_TOURNAMENTS.items()
    for tournament in tournaments
}

# tournaments missing from the mapping get a missing value in Zone,
# which the sanity check further down looks for with .isna()
df_raw['Zone'] = df_raw['Tournament'].map(TOURNAMENT_ZONE)

In [8]:
# keep only the matches whose Year is 2022
played_in_2022 = df_raw['Year'] == 2022
df_raw_2022 = df_raw[played_in_2022]

### Data Visualization

In [9]:
# extract the distinct (date, zone, tournament) rows for a given player,
# whichever side of the match he was listed on
is_player = (df_raw_2022['Player_1'] == 'Rune H.') | (df_raw_2022['Player_2'] == 'Rune H.')
df_player_2022 = (
    df_raw_2022.loc[is_player, ['Year', 'Month_Number', 'Month', 'Day', 'Zone', 'Tournament']]
    .drop_duplicates()
    # explicit copy: later cells add a column to and drop rows from this frame,
    # which would otherwise risk a SettingWithCopyWarning on a derived slice
    .copy()
)

In [10]:
# sanity check: show any rows whose tournament was not assigned a Zone
# (an empty result means every tournament is mapped)
df_player_2022[df_player_2022['Zone'].isna()]

Unnamed: 0,Year,Month_Number,Month,Day,Zone,Tournament


In [11]:
# display the player's rows (one per distinct match date and tournament)
df_player_2022

Unnamed: 0,Year,Month_Number,Month,Day,Zone,Tournament
56414,2022,1,Jan,4,Australia,Adelaide International 1
56514,2022,1,Jan,17,Australia,Australian Open
56642,2022,2,Feb,1,South America,Cordoba Open
56725,2022,2,Feb,8,South America,Argentina Open
56859,2022,2,Feb,15,Europe,Open 13
56867,2022,2,Feb,16,Europe,Open 13
56978,2022,2,Feb,22,South America,Chile Open
57019,2022,3,Mar,12,North America,BNP Paribas Open
57052,2022,3,Mar,13,North America,BNP Paribas Open
57255,2022,4,Apr,12,Europe,Monte Carlo Masters


In [50]:
# number the rows of each tournament in date order, starting at 1
# (the pandas counterpart of SQL's ROW_NUMBER() OVER (PARTITION BY Tournament ORDER BY ...))
chronological = df_player_2022.sort_values(['Month_Number', 'Day'])
df_player_2022['RowNum'] = chronological.groupby('Tournament').cumcount().add(1)

In [52]:
# keep only the first row of every tournament: remove all rows numbered 2 or higher
repeat_rows = df_player_2022.index[df_player_2022['RowNum'] > 1]
df_player_2022.drop(index=repeat_rows, inplace=True)

In [54]:
# count the rows per (month, zone) combination.
# groupby(...).size() is the direct way to count rows per group; the result
# column keeps the name 'Matches' because later cells read it by that name.
# NOTE(review): df_player_2022 was reduced to one row per tournament before
# this point, so the count appears to be tournaments rather than matches.
df_player_agg = (
    df_player_2022
    .groupby(['Month_Number', 'Month', 'Zone'])
    .size()
    .reset_index(name='Matches')
)

In [55]:
# display the per-month, per-zone counts
df_player_agg

Unnamed: 0,Month_Number,Month,Zone,Matches
0,1,Jan,Australia,2
1,2,Feb,Europe,1
2,2,Feb,South America,3
3,3,Mar,North America,1
4,4,Apr,Europe,3
5,5,May,Europe,2
6,6,Jun,Europe,3
7,7,Jul,Europe,3
8,8,Aug,North America,4
9,9,Sep,Europe,2


In [64]:
# marker sizes for the bubble chart: each counted row adds 8 units
size = df_player_agg['Matches'].mul(8).to_numpy()
size

array([16,  8, 24,  8, 24, 16, 24, 24, 32, 16, 16,  8])

In [102]:
# Bubble chart: one marker per (month, zone), sized by the precomputed `size` array.
figure = go.Figure()
figure.add_scatter(
    x=df_player_agg.Month,
    y=df_player_agg.Zone,
    mode='markers',
    marker={'size': size,
            'color': "#067bc2",
            'opacity': 1},
)

TEXT_COLOR = "#2f3061"

# All annotations share the same alignment and paper-relative positioning,
# so they are described once as (text, x, y, font size or None for the default).
annotations = [
    # headline and sub-headline placed above the plot area (y > 1)
    ('Tennis players try to attend tournaments in adjacent time zones', 0, 1.2, 18),
    ('In 2022 Holger Rune spent most of the time in Europe and a few months in Americas', 0, 1.1, None),
    # labels placed next to the marker clusters
    ('11 tournaments in 4 months', 0.45, 0.45, None),
    ('5 tournaments', 0.91, 0.45, None),
    ('4 tournaments', 0.73, 0.85, None),
    ('4 tournaments', 0.13, 0.8, None),
]

for text, x_pos, y_pos, font_size in annotations:
    font = {'color': TEXT_COLOR}
    if font_size is not None:
        font['size'] = font_size
    figure.add_annotation(
        text=text,
        align='left',
        showarrow=False,
        xref='paper',                        # x/y are fractions of the plot area,
        yref='paper',                        # not data coordinates
        font=font,
        x=x_pos,
        y=y_pos,
    )

figure.update_layout(
    font_family="Outfit",
    font_size=13,
    plot_bgcolor='#f8f9fa',
    xaxis_title=None,                        # remove X title
    yaxis_title=None,                        # remove Y title
    title_font_color=TEXT_COLOR,
    margin=dict(t=100),                      # top margin leaves room for the headline annotations
)

figure.update_xaxes(
    ticks="outside",                         # place ticks outside X-axis
    ticklen=5,                               # ticks length
    tickcolor='#adb5bd',                     # ticks color
    linecolor='#adb5bd',                     # axis line color
    color=TEXT_COLOR,                        # axis label color
    showgrid=False,                          # hide vertical background grid
)

figure.update_yaxes(
    shift=-10,                               # shift ticks left by 10 pixels
    anchor="free",                           # allows axis repositioning
    gridcolor='#e9ecef',                     # horizontal background grid
    color=TEXT_COLOR,                        # axis label color
)

figure.show()