# WTA stats: Serena Williams

#### Import Python tools

In [1]:
# Load the lab_black IPython extension for this kernel session
# (presumably it auto-formats code cells with Black — confirm).
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np



In [3]:
# Register the altair_stiles theme under the name "stiles", then enable the
# theme called "grid".
# NOTE(review): the enabled name ("grid") is not the name registered on the
# line above ("stiles"). Presumably "grid" becomes available when
# altair_stiles is imported — confirm this is the intended theme.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Notebook display limits: show up to 1000 columns / 10000 rows of a frame,
# and switch off Altair's row limit for chart data.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 10000)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [5]:
# Current local date as an ISO-style "YYYY-MM-DD" string.
today = pd.Timestamp.today().strftime("%Y-%m-%d")

---

In [6]:
# Download one stats table per season (2010 through 2022) from the WTA stats
# API. This stays best-effort: a season that cannot be fetched or parsed is
# skipped. Unlike a bare `except:`, catching `Exception` lets Ctrl-C
# (KeyboardInterrupt) and SystemExit propagate, and every skipped season is
# recorded in `failed_years` and reported instead of vanishing silently.
data_list = []
failed_years = {}
for year in range(2010, 2023):
    url = f"https://api.wtatennis.com/tennis/stats/{year}/Current_Rank?page=0&pageSize=500"
    try:
        data_list.append(pd.read_json(url))
    except Exception as err:
        failed_years[year] = err

if failed_years:
    print(f"Skipped seasons (request or parse failed): {sorted(failed_years)}")

In [7]:
# Stack the per-season tables into a single frame. ignore_index is not set,
# so each table keeps its own row labels and labels can repeat across seasons.
df = pd.concat(data_list)

In [8]:
# Display every row of the combined table whose First_Name is "SERENA".
df[df["First_Name"] == "SERENA"]

Unnamed: 0,tourn_year,PlayerNbr,Last_Name,First_Name,Nationality,Current_Rank,Aces,Double_Faults,First_Serves_Won,First_Serves_Played,Second_Serves_Won,Second_Serves_Played,Break_Points_Faced,Break_Points_Lost,Service_Games_Played,Return_Games_Played,Break_Point_Chances,Break_Points_Converted,First_Serve_Return_Chances,First_Return_Won,Second_Return_Chances,Second_Return_Won,first_serve_won_percent,second_serve_won_percent,first_return_percent,second_return_percent,breakpoint_converted_percent,first_serve_percent,return_games_won_percent,breakpoint_saved_percent,service_games_won_percent,service_points_won_percent,return_points_won_percent,total_points_won_percent,MatchCount
91,2010,230234,WILLIAMS,SERENA,USA,9999,248,83,931,1236,346,757,167,57,306,303,263,119,1335,512,803,468,75.3,45.7,38.4,58.3,45.2,62.0,39.3,65.9,81.4,64.1,45.8,54.6,29
77,2011,230234,WILLIAMS,SERENA,USA,9999,200,62,716,933,343,649,114,37,254,244,208,98,966,355,643,376,76.7,52.9,36.7,58.5,47.1,59.0,40.2,67.5,85.4,66.9,45.4,56.1,25
53,2012,230234,WILLIAMS,SERENA,USA,9999,484,127,1447,1859,681,1262,205,66,527,504,472,236,2165,917,1288,770,77.8,54.0,42.4,59.8,50.0,59.6,46.8,67.8,87.5,68.2,48.9,58.0,58
47,2013,230234,WILLIAMS,SERENA,USA,9999,480,178,2153,2882,895,1748,332,117,738,716,714,386,2824,1261,1931,1190,74.7,51.2,44.7,61.6,54.1,62.2,53.9,64.8,84.1,65.8,51.5,58.6,80
29,2014,230234,WILLIAMS,SERENA,USA,9999,452,208,1549,2073,725,1442,284,109,571,563,510,253,2313,961,1468,847,74.7,50.3,41.5,57.7,49.6,59.0,44.9,61.6,80.9,64.7,47.8,55.9,60
30,2015,230234,WILLIAMS,SERENA,USA,9999,498,199,1533,2019,722,1480,287,105,557,552,493,242,2293,925,1403,829,75.9,48.8,40.3,59.1,49.1,57.7,43.8,63.4,81.1,64.4,47.5,55.7,53
18,2016,230234,WILLIAMS,SERENA,USA,9999,324,116,1146,1513,483,962,179,65,401,393,377,168,1631,653,1030,608,75.7,50.2,40.0,59.0,44.6,61.1,42.7,63.7,83.8,65.8,47.4,56.3,41
12,2017,230234,WILLIAMS,SERENA,USA,9999,71,35,246,311,114,241,42,16,91,88,77,36,357,133,225,132,79.1,47.3,37.3,58.7,46.8,56.3,40.9,61.9,82.4,65.2,45.5,55.1,9
8,2018,230234,WILLIAMS,SERENA,USA,9999,170,81,657,883,291,603,114,50,239,230,160,78,924,343,586,314,74.4,48.3,37.1,53.6,48.8,59.4,33.9,56.1,79.1,63.8,43.5,53.6,24
4,2019,230234,WILLIAMS,SERENA,USA,9999,206,88,801,1066,362,734,137,60,301,290,270,119,1217,468,720,427,75.1,49.3,38.5,59.3,44.1,59.2,41.0,56.2,80.1,64.6,46.2,55.1,31


In [9]:
# Full player name: first name, one space, last name.
first_names, last_names = df["First_Name"], df["Last_Name"]
df["name"] = first_names + " " + last_names

In [10]:
# Columns carried into the slim table: two identifying columns followed by
# the serve / return metrics that get ranked further down.
id_cols = ["tourn_year", "name"]
metric_cols = [
    "Aces",
    "first_serve_won_percent",
    "service_games_won_percent",
    "breakpoint_saved_percent",
    "second_return_percent",
    "breakpoint_converted_percent",
]
cols = id_cols + metric_cols

In [11]:
# Independent copy of just the selected columns, so the rank columns added
# later do not touch `df`.
df_slim = df.loc[:, cols].copy()

In [12]:
# Display the list of selected column names.
cols

['tourn_year',
 'name',
 'Aces',
 'first_serve_won_percent',
 'service_games_won_percent',
 'breakpoint_saved_percent',
 'second_return_percent',
 'breakpoint_converted_percent']

In [13]:
# Add one rank column per metric, highest value ranked first. Tied values all
# receive the largest rank of their group (method="max"). Each rank is taken
# over every row of df_slim at once, not separately per season.
rank_targets = {
    "aces_rank": "Aces",
    "first_serve_won_percent_rank": "first_serve_won_percent",
    "service_games_won_percent_rank": "service_games_won_percent",
    "breakpoint_saved_percent_rank": "breakpoint_saved_percent",
    "second_return_percent_rank": "second_return_percent",
    "breakpoint_converted_percent_rank": "breakpoint_converted_percent",
}
for rank_col, metric_col in rank_targets.items():
    df_slim[rank_col] = df_slim[metric_col].rank(method="max", ascending=False)

In [14]:
# Display the slim rows (metrics plus their ranks) for "SERENA WILLIAMS".
df_slim[df_slim["name"] == "SERENA WILLIAMS"]

Unnamed: 0,tourn_year,name,Aces,first_serve_won_percent,service_games_won_percent,breakpoint_saved_percent,second_return_percent,breakpoint_converted_percent,aces_rank,first_serve_won_percent_rank,service_games_won_percent_rank,breakpoint_saved_percent_rank,second_return_percent_rank,breakpoint_converted_percent_rank
91,2010,SERENA WILLIAMS,248,75.3,81.4,65.9,58.3,45.2,23.0,17.0,17.0,49.0,197.0,581.0
77,2011,SERENA WILLIAMS,200,76.7,85.4,67.5,58.5,47.1,43.0,11.0,5.0,33.0,183.0,450.0
53,2012,SERENA WILLIAMS,484,77.8,87.5,67.8,59.8,50.0,3.0,8.0,3.0,32.0,129.0,313.0
47,2013,SERENA WILLIAMS,480,74.7,84.1,64.8,61.6,54.1,4.0,25.0,8.0,64.0,77.0,150.0
29,2014,SERENA WILLIAMS,452,74.7,80.9,61.6,57.7,49.6,6.0,25.0,21.0,118.0,237.0,326.0
30,2015,SERENA WILLIAMS,498,75.9,81.1,63.4,59.1,49.1,1.0,13.0,19.0,91.0,153.0,341.0
18,2016,SERENA WILLIAMS,324,75.7,83.8,63.7,59.0,44.6,13.0,14.0,10.0,80.0,155.0,617.0
12,2017,SERENA WILLIAMS,71,79.1,82.4,61.9,58.7,46.8,229.0,6.0,13.0,116.0,169.0,467.0
8,2018,SERENA WILLIAMS,170,74.4,79.1,56.1,53.6,48.8,66.0,27.0,33.0,360.0,654.0,355.0
4,2019,SERENA WILLIAMS,206,75.1,80.1,56.2,59.3,44.1,40.0,20.0,24.0,350.0,143.0,658.0


In [15]:
# Parse the HTML tables on Serena Williams's WTA profile page and keep the
# second one (position 1) as the year-by-year ranking table.
profile_tables_for_year = pd.read_html(
    "https://www.wtatennis.com/players/230234/serena-williams#rankingshistory"
)
serena_rank_year = profile_tables_for_year[1]

In [16]:
# Discard the two unnamed columns from the yearly table. Reassigning gives
# the same result as inplace=True here because nothing else references it.
serena_rank_year = serena_rank_year.drop(columns=["Unnamed: 1", "Unnamed: 3"])

In [17]:
# Lower-case every header of the yearly table.
year_headers = serena_rank_year.columns
serena_rank_year.columns = year_headers.str.lower()

In [18]:
# Display the yearly ranking table.
serena_rank_year

Unnamed: 0,date,top rank by year,year-end ranking
0,2022,41.0,
1,2021,7.0,41.0
2,2020,8.0,11.0
3,2019,8.0,10.0
4,2018,15.0,16.0
5,2017,1.0,22.0
6,2016,1.0,2.0
7,2015,1.0,1.0
8,2014,1.0,1.0
9,2013,1.0,1.0


In [19]:
# Parse the same profile page again, keep the fourth table (position 3) as
# the week-by-week ranking table, and discard its "Unnamed: 1" column.
profile_tables_for_week = pd.read_html(
    "https://www.wtatennis.com/players/230234/serena-williams#rankingshistory"
)
serena_rank_week = profile_tables_for_week[3].drop(columns=["Unnamed: 1"])

In [20]:
# Disabled, kept for reference only: the "Unnamed: 1" column is already
# dropped in the read_html statement of the previous cell.
# serena_rank_week.drop(['Unnamed: 1'], axis=1, inplace=True)

In [21]:
# Normalise the weekly headers: lower-case them, then replace each literal
# space with an underscore.
lowercased_headers = serena_rank_week.columns.str.lower()
serena_rank_week.columns = lowercased_headers.str.replace(" ", "_", regex=False)

In [22]:
# Parse the "week" column once, then derive the calendar year (stored as a
# string) and the ISO week number from the parsed dates.
week_dates = pd.to_datetime(serena_rank_week["week"])
serena_rank_week["date"] = week_dates
serena_rank_week["year"] = week_dates.dt.year.astype(str)
serena_rank_week["week_number"] = week_dates.dt.isocalendar().week

In [23]:
def getSuffix(n):
    """Return the English ordinal suffix for a non-negative number.

    Args:
        n: The number to find a suffix for (for example a ranking position).

    Returns:
        "st", "nd" or "rd" when ``n`` ends in 1, 2 or 3 respectively, and
        "th" otherwise. Numbers whose last two digits are 11, 12 or 13
        always get "th" (11th, 112th, ...).

    Raises:
        ValueError: If ``n`` is negative. ``ValueError`` is a subclass of
            ``Exception``, so handlers written for the previous generic
            ``Exception`` keep working.
    """
    if n < 0:
        raise ValueError("Ordinal negative numbers are not allowed")
    # The "teens" are irregular and must be checked before the last digit.
    if n % 100 in (11, 12, 13):
        return "th"
    return {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")

In [24]:
# Ordinal label for each weekly ranking: the ranking rendered as text,
# followed by the suffix getSuffix returns for it.
weekly_rank = serena_rank_week["week_by_week_ranking"]
rank_text = weekly_rank.astype(str)
rank_suffix = weekly_rank.apply(getSuffix)
serena_rank_week["ordinal"] = rank_text + rank_suffix

In [25]:
# Decade label: the first three characters of the year string plus "0s".
year_prefix = serena_rank_week["year"].str.slice(0, 3)
serena_rank_week["decade"] = year_prefix + "0s"

In [26]:
# Keep only rows whose year string sorts after "1999"; "year" holds strings,
# so this is a text comparison.
after_1999 = serena_rank_week["year"] > "1999"
serena_rank_week = serena_rank_week.loc[after_1999].copy()

In [27]:
# Total number of rows (ranking weeks) left after the year filter.
weeks = serena_rank_week.shape[0]
weeks

1002

In [28]:
# Number of weeks with a ranking of 10 or better, as a plain Python int.
is_top_10 = serena_rank_week["week_by_week_ranking"] <= 10
weeks_top_10 = int(is_top_10.sum())
weeks_top_10

735

In [29]:
# Share of all counted weeks spent in the top 10, as a percentage.
weeks_top_10 / weeks * 100

73.35329341317365

In [30]:
# Number of weeks with a ranking of 1, as a plain Python int.
is_number_1 = serena_rank_week["week_by_week_ranking"] <= 1
weeks_1 = int(is_number_1.sum())

In [31]:
# Share of all counted weeks spent at No. 1, as a percentage.
weeks_1 / weeks * 100

27.844311377245507

In [32]:
# Per-year count of weeks ranked 10 or better. Years with no such week do not
# appear, because the rows are filtered before grouping.
top_10_rows = serena_rank_week.loc[serena_rank_week["week_by_week_ranking"] <= 10]
weeks_in_top10 = (
    top_10_rows.groupby("year")[["week_by_week_ranking"]].count().reset_index()
)

In [33]:
# Per-year count of non-null weekly rankings, with "year" as a regular column.
rankings_by_year = serena_rank_week.groupby("year")[["week_by_week_ranking"]]
weeks_per_year = rankings_by_year.count().reset_index()

In [34]:
# Bar chart: one bar per year, bar height = number of top-10 weeks that year.
top10_bars = alt.Chart(weeks_in_top10).mark_bar()
top10_bars.encode(
    x=alt.X("year:O"),
    y=alt.Y("week_by_week_ranking", title=" "),
).properties(title="Serena Williams: No. of weeks in world top 10, by year")

In [47]:
# Weekly-ranking scatter: one small dot per week with a ranking below 50.
# Weeks at No. 1 are drawn in "#00d4d8", all other weeks in "#aaa".
below_50 = serena_rank_week[serena_rank_week["week_by_week_ranking"] < 50]

date_encoding = alt.X("date:T", axis=alt.Axis(tickCount=5), title="")
rank_encoding = alt.Y(
    "week_by_week_ranking:Q",
    title=" ",
    # Reversed domain (50 down to 0) so better rankings sit higher on the chart.
    scale=alt.Scale(domain=[50, 0]),
    axis=alt.Axis(tickCount=5),
)
highlight_number_one = alt.condition(
    "datum.week_by_week_ranking == 1",
    alt.value("#00d4d8"),
    alt.value("#aaa"),
)

# Faceting by decade is left disabled, as in the original cell:
# facet=alt.Facet("decade", columns=2)
line = (
    alt.Chart(
        below_50,
        padding={"left": -30, "top": 0, "right": 0, "bottom": 0},
    )
    .mark_circle(size=5)
    .encode(x=date_encoding, y=rank_encoding, color=highlight_number_one)
)

sized = line.properties(title="", width=360, height=300)
chart = sized.configure_legend(symbolType="stroke", orient="top")
chart

In [36]:
# Export the weeks with a ranking below 100 to CSV, without the row index.
below_100 = serena_rank_week["week_by_week_ranking"] < 100
serena_rank_week.loc[below_100].to_csv(
    "data/processed/serena-by-week.csv", index=False
)

In [37]:
# Export the per-year top-10 week counts to CSV, without the row index.
weeks_in_top10.to_csv("data/processed/serena-weeks-top-10.csv", index=False)