# Time Series Analysis of ATP vs. WTA Upsets

This notebook looks at ATP and WTA Tour match data dating back to 1968. In particular, it analyzes the fraction of matches per year that resulted in an upset (a win by the lower-ranked player). The final cell contains an interactive plot for which the user can specify several settings:
- How an upset is defined.
- How many years of data to use for a moving average (to smooth the results).
- What time frame (in years) to visualize.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # For plotting
import matplotlib.ticker as ticker # For formatting y-axis labels as percents
from ipywidgets import interact, fixed # For generating an interactive plot
import ipywidgets as widgets # For widgets in interactive plot

# For showing plots in the notebook
% matplotlib inline 

## Download Data into Pandas

In [None]:
# Build the per-season CSV URLs for both tours, then download and stack them.
# Data source:
# Jeff Sackmann
# https://github.com/JeffSackmann (`tennis_atp` and `tennis_wta` repositories)

# Seasons 1968 through 2018, as strings so they can be spliced into the URLs
years = np.arange(1968, 2019).astype(str)

atpfiles, wtafiles = [], []
for season in years:
    atpfiles.append('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_' + season + '.csv')
    wtafiles.append('https://raw.githubusercontent.com/JeffSackmann/tennis_wta/master/wta_matches_' + season + '.csv')
del season  # do not leave the loop variable behind in the notebook namespace

# Downloading every season takes a little while.
# One CSV is read per season and all seasons are stacked into a single frame
# with a fresh 0..n-1 index.
atp = pd.concat(map(pd.read_csv, atpfiles), ignore_index=True)

# The WTA files are read as latin-1 to get around a UnicodeDecodeError.
wta = pd.concat(
    (pd.read_csv(url, encoding='latin-1') for url in wtafiles),
    ignore_index=True)

## Format and Filter Data for Plotting

In [None]:
# Convert `tourney_date` column to datetime (source values look like YYYYMMDD)
atp['tourney_date'] = pd.to_datetime(atp['tourney_date'], format='%Y%m%d')
wta['tourney_date'] = pd.to_datetime(wta['tourney_date'], format='%Y%m%d')


def _filter_ranked_big_events(matches):
    '''Return the subset of `matches` used for the upset analysis.

    Rows are kept when:
    - both `winner_rank` and `loser_rank` are >= 1 (missing ranks compare
      False and are therefore dropped),
    - `tourney_level` is 'M' or 'G' (Masters Series / Grand Slam),
    - `surface` is not "Carpet".

    Only the columns needed downstream are kept, and a `year` column derived
    from `tourney_date` is added.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match-level data with `surface`, `tourney_level`, `tourney_date`
        (datetime), `winner_rank` and `loser_rank` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy; `matches` itself is not modified.
    '''
    # NOTE(review): the same level codes are applied to both tours; confirm
    # that the WTA files use 'M' for their top non-Slam events.
    keep_rows = (
        (matches.winner_rank >= 1) &
        (matches.loser_rank >= 1) &
        matches.tourney_level.isin(['M', 'G']) &
        (matches.surface != "Carpet")
    )
    keep_cols = ['surface', 'tourney_level', 'tourney_date',
                 'winner_rank', 'loser_rank']

    # `.copy()` makes the result an independent frame, so adding `year` below
    # does not write into a slice of `matches` (avoids SettingWithCopyWarning).
    filtered = matches.loc[keep_rows, keep_cols].copy()
    filtered['year'] = filtered.tourney_date.dt.year
    return filtered


# Filter data:
# - Only consider Masters Series and Grand Slam tournaments
# - Exclude carpet (the analysis targets clay, grass, and hard surfaces)
# - Only keep matches for which the winner and loser ranks are present
atp_filtered = _filter_ranked_big_events(atp)
wta_filtered = _filter_ranked_big_events(wta)

# View data sample
atp_filtered.head()

## Create Interactive Plots

In [None]:
def _rolling_upset_pct(matches, Threshold, Rolling, Years):
    '''Return the smoothed yearly upset percentage per surface.

    The result has one row per year within `Years` (inclusive) and one column
    per surface; values are percentages (0-100) averaged over a trailing
    window of `Rolling` rows (years present in the data).
    '''
    # `assign` returns a new frame, so the caller's DataFrame never gains an
    # `upset` column as a side effect of moving a slider.
    flagged = matches.assign(
        upset=(matches.winner_rank - matches.loser_rank) >= Threshold)

    # Mean of a boolean column == upsets / total matches for each group.
    yearly_pct = (
        flagged.groupby(['year', 'surface'])['upset'].mean()
        .mul(100)
        .unstack('surface')
        .rename_axis('')  # blank index name -> no x-axis label on the plot
    )

    # Smooth BEFORE restricting to the requested years, so the first years of
    # the visible range can draw on the seasons that precede them instead of
    # being NaN for the first `Rolling - 1` points.
    smoothed = yearly_pct.rolling(Rolling).mean()
    in_range = (smoothed.index >= Years[0]) & (smoothed.index <= Years[1])
    return smoothed[in_range]


def _draw_upset_panel(ax, smoothed, title, Years):
    '''Style one subplot and draw the per-surface lines on it.'''
    ax.set_ylabel("Percent of Matches Resulting in Upset", fontsize=16)
    ax.set_title(title, fontsize=18)
    ax.tick_params(labelsize=12)

    if smoothed.empty:
        # Nothing to draw for this selection: keep the axes consistent with
        # the other panel and say so rather than raising inside the widget.
        ax.set_xlim(Years[0], Years[1])
        ax.set_ylim(0, 50)
        ax.text(0.5, 0.5, "No ranked matches in the selected years",
                ha='center', va='center', transform=ax.transAxes)
        return

    # Look colors up by surface name (case-insensitive) rather than by column
    # position, so a surface missing from the selection cannot shift the
    # palette. Red = clay, green = grass, blue = hard; anything else is black.
    surface_colors = {'clay': 'r', 'grass': 'g', 'hard': 'b'}
    line_colors = [surface_colors.get(str(surface).lower(), 'k')
                   for surface in smoothed.columns]

    smoothed.plot(
        kind='line',
        xlim=[Years[0], Years[1]],
        ylim=[0, 50],
        color=line_colors,
        ax=ax)


def upset_plot(atp, wta, Threshold, Rolling, Years):
    '''Draw two side-by-side plots (ATP and WTA, respectively) of the
    percentage of matches per year that resulted in an upset, by surface.

    Parameters
    ----------
    atp, wta : pandas.DataFrame
        Match data with `winner_rank`, `loser_rank`, `year` and `surface`
        columns. Neither frame is modified.
    Threshold : int
        A match counts as an upset when
        `winner_rank - loser_rank >= Threshold`.
    Rolling : int
        Window size (in years) of the moving average.
    Years : sequence of two ints
        First and last year (both inclusive) to display.

    Returns
    -------
    None
        The figure is rendered as a side effect.
    '''
    # Create figure and axes (2) for subplots; the y-axis is shared
    fig, (ax, ax2) = plt.subplots(ncols=2, sharey=True)
    fig.patch.set_facecolor('white')
    fig.set_size_inches(16, 8)
    fig.suptitle(str(Rolling)+"-Year Moving Average of Upsets by Surface Type\n"+
                 "(Upset: Winner Rank - Loser Rank ≥ " + str(Threshold)+")", fontsize=20)

    # Same pipeline for both tours: compute the smoothed percentages, then draw
    for axis, matches, tour in ((ax, atp, "ATP"), (ax2, wta, "WTA")):
        smoothed = _rolling_upset_pct(matches, Threshold, Rolling, Years)
        _draw_upset_panel(axis, smoothed, tour, Years)

    # Show y-axis labels as percents
    ax.yaxis.set_major_formatter(ticker.PercentFormatter())

In [None]:
# Generate interactive plots
# `description_width: 'initial'` sizes each label to its text; at the default
# fixed width the long slider descriptions below are cut off.
_wide_label = {'description_width': 'initial'}

# The trailing `;` keeps the cell from echoing the function object that
# `interact` returns underneath the widget.
interact(upset_plot,
         atp=fixed(atp_filtered),
         wta=fixed(wta_filtered),
         Threshold=widgets.IntSlider(min=1, max=50, step=1, value=5,
                                     description='Ranking Difference Threshold:',
                                     style=_wide_label),
         Rolling=widgets.IntSlider(min=1, max=10, step=1, value=3,
                                   description='Moving Average (years):',
                                   style=_wide_label),
         Years=widgets.IntRangeSlider(min=1968, max=2018, step=1, value=[1990, 2018]));