Scrape 2023-2025 Data

In [28]:
import pandas as pd
import csv
import os
from pathlib import Path

# The relative data paths below are anchored at the project root, so step up
# one level when the kernel was started from inside the notebooks/ folder.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

# Folder layout used by the pipeline. These directories are expected to be
# present already; nothing is created here.
metrics_data_dir = Path("data/Metrics")
current_season_dir = metrics_data_dir.joinpath("CurrentSeasonTeams")
cleaned_data_dir = Path("data/Cleaned")
raw_data_dir = Path("data/Raw")


# Text columns that are left out of the numeric conversion.
cols_to_exclude = ['Player', 'Team', 'Pos']


def clean_season_totals(raw_totals: pd.DataFrame) -> pd.DataFrame:
    """Tidy one season's player-totals table as parsed by ``pd.read_html``.

    Steps, in order:
      1. Drop the header rows repeated inside the table body and any exact
         duplicate rows.
      2. Keep one row per player: the 'TOT' row for players who have one,
         otherwise the player's single team row.
      3. Drop the 'G.1' and 'MP.1' columns when they are present.
      4. Convert every column except Player/Team/Pos to numeric; cells that
         cannot be parsed become NaN.

    Args:
        raw_totals: The table exactly as returned by ``pd.read_html``.

    Returns:
        A new, re-indexed DataFrame. ``raw_totals`` itself is not modified.

    Raises:
        KeyError: If the 'Player', 'Team' or 'MP' column is missing.
        ValueError: If 'MP' holds a value that cannot be parsed as a number.
    """
    totals = raw_totals[raw_totals['Player'] != 'Player'].drop_duplicates()

    # Players with a 'TOT' row also have one row per team they played for;
    # only the 'TOT' row is kept for them. Matching is by player name.
    is_tot_row = totals['Team'] == 'TOT'
    players_with_tot = totals.loc[is_tot_row, 'Player']
    is_single_team_row = ~totals['Player'].isin(players_with_tot) & ~is_tot_row
    totals = pd.concat(
        [totals[is_tot_row], totals[is_single_team_row]],
        ignore_index=True,
    )

    # Strict conversion: unlike the coercing pass below, this raises on a
    # malformed minutes value instead of silently producing NaN.
    totals['MP'] = pd.to_numeric(totals['MP'])

    # Presumably the second copies of G and MP, suffixed by pandas when the
    # page repeats a column name (TODO confirm). errors='ignore' keeps the
    # cell running if the table does not contain them.
    totals = totals.drop(columns=['G.1', 'MP.1'], errors='ignore')

    numeric_cols = totals.columns.difference(cols_to_exclude)
    totals[numeric_cols] = totals[numeric_cols].apply(pd.to_numeric, errors='coerce')
    return totals


def scrape_season_totals(year: int, raw_dir: Path) -> pd.DataFrame:
    """Download one WNBA season's player totals, archive them, and clean them.

    The untouched table is written to ``raw_dir / '<year>Data.csv'`` before
    any cleaning, so the raw download is preserved on disk.

    Args:
        year: Season year used in both the page URL and the CSV file name.
        raw_dir: Existing directory that receives the raw CSV.

    Returns:
        The cleaned totals table (see ``clean_season_totals``).
    """
    url = f'https://www.basketball-reference.com/wnba/years/{year}_totals.html'
    raw_totals = pd.read_html(url)[0]  # First table is the player totals
    raw_totals.to_csv(raw_dir / f'{year}Data.csv', index=False, sep=',', encoding='utf-8', lineterminator='\n')
    return clean_season_totals(raw_totals)


df2023 = scrape_season_totals(2023, raw_data_dir)
df2024 = scrape_season_totals(2024, raw_data_dir)

# Last expression of the cell: shows the cleaned 2024 table as the output.
df2024

Unnamed: 0,Player,Team,Pos,G,MP,GS,FG,FGA,FG%,3P,...,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Rachel Banham,TOT,G,37,586,9,71,202,0.351,52,...,20,0.800,7,53,40,14,7,20,58,210
1,Morgan Bertsch,TOT,F,11,100,0,6,30,0.200,2,...,10,0.800,8,17,2,1,1,13,14,22
2,Monique Billings,TOT,F,37,843,14,107,230,0.465,3,...,71,0.803,65,213,53,27,19,65,84,274
3,Crystal Dangerfield,TOT,G,33,441,5,37,131,0.282,17,...,17,0.882,2,34,62,11,6,26,25,106
4,Queen Egbo,TOT,F-C,8,29,0,5,12,0.417,0,...,2,1.000,3,10,0,3,1,0,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Kiana Williams,SEA,G,13,45,0,4,16,0.250,3,...,0,,0,2,5,0,0,2,3,11
153,A'ja Wilson,LVA,C,38,1308,38,385,743,0.518,19,...,275,0.844,79,451,88,68,98,48,67,1021
154,Jackie Young,LVA,G,37,1205,37,201,467,0.430,70,...,128,0.867,26,161,197,37,8,78,92,583
155,Li Yueru,LAS,C,38,546,2,68,147,0.463,4,...,59,0.898,55,140,22,12,9,33,80,193


Adding Defensive Rebounds

In [29]:
def move_column_before(frame: pd.DataFrame, column: str, anchor: str) -> pd.DataFrame:
    """Return ``frame`` with ``column`` placed immediately before ``anchor``.

    All other columns keep their relative order.

    Args:
        frame: Table whose columns are to be reordered.
        column: Name of the column to relocate.
        anchor: Name of the column that ``column`` should precede.

    Returns:
        A DataFrame selected from ``frame`` with the new column order.

    Raises:
        ValueError: If ``column`` or ``anchor`` is not among the columns.
    """
    ordered = frame.columns.tolist()
    # Take the column out first so the anchor's index is computed on the
    # remaining columns.
    ordered.remove(column)
    ordered.insert(ordered.index(anchor), column)
    return frame[ordered]


# Defensive rebounds are derived as total rebounds minus offensive rebounds.
df2023['DRB'] = df2023['TRB'] - df2023['ORB']
df2024['DRB'] = df2024['TRB'] - df2024['ORB']

# Define target and reference columns
col_to_move = 'DRB'
before_col = 'TRB'

# The new column is appended at the end; move it so it sits right before TRB.
df2023 = move_column_before(df2023, col_to_move, before_col)
df2024 = move_column_before(df2024, col_to_move, before_col)

# Persist the cleaned season tables.
df2023.to_csv(cleaned_data_dir / '2023Data.csv', index=False, sep=',', encoding='utf-8', lineterminator='\n')
df2024.to_csv(cleaned_data_dir / '2024Data.csv', index=False, sep=',', encoding='utf-8', lineterminator='\n')