In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
# Clean the tables:
# change the column names to lowercase and replace spaces with underscores
# replace 'Player (<number>)' in column names with 'player'
# remove empty columns
# remove empty rows
# replace '-' cells with an empty string (parsed as NaN in columns that are converted to numeric)
# remove dollar signs, percent signs and commas from columns that should be numeric ('(' becomes a minus sign)
# convert columns to numeric
# remove extra underscores from column names
# IMPORTANT: some of these steps depend on the previous cleaning steps (h2_to_str)

def is_string_dtype(dtype):
    """Report whether pandas classifies ``dtype`` as a string dtype.

    Delegates to ``pd.api.types.is_string_dtype`` and returns its result
    unchanged.
    """
    pandas_check = pd.api.types.is_string_dtype
    return pandas_check(dtype)

# Find the columns that hold dollar-sign or percent-sign values
def columns_with_dollar_or_percent(df):
    """Return the labels of columns containing a '$' or '%' in any cell.

    Every column is rendered as text (``astype(str)``) before searching, so
    non-string columns are inspected through their string representation.
    Labels are returned in the order they appear in ``df.columns``.
    """
    def has_symbol(label):
        as_text = df[label].astype(str)
        return as_text.str.contains(r'\$|%').any()

    return [label for label in df.columns if has_symbol(label)]

# Single-pass character map for money/percent cells: '$', '%', ',' and ')' are
# dropped, while '(' becomes '-' so that '(1,234)' parses as -1234.
_NUMERIC_SYMBOLS = str.maketrans({'$': None, '%': None, ',': None, '(': '-', ')': None})


def _normalize_column_name(name):
    """Return ``name`` as a lowercase, underscore-separated column label."""
    # The 'Player (<n>)' rewrite has to run before spaces become underscores,
    # because the pattern matches the literal space before the parenthesis.
    label = re.sub(r'Player \(\d+\)|player \(\d+\)', 'player', str(name))
    label = label.lower().replace(' ', '_')
    return re.sub(r'_+', '_', label)


def _collapse_repeated_name(value):
    """Rewrite 'Carter Michael Carter' as 'Michael Carter'; pass non-strings through."""
    if isinstance(value, str):
        return re.sub(r'^(.*?)\s+(.*?)\s+(\1)', r'\2 \1', value)
    return value


def _strip_numeric_symbols(value):
    """Remove currency/percent formatting from a string cell; pass non-strings through."""
    if isinstance(value, str):
        return value.translate(_NUMERIC_SYMBOLS)
    return value


def clean_table(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop columns whose label contains 'Unnamed'.
      2. Normalize column labels: 'Player (<n>)' -> 'player', lowercase,
         spaces to underscores, repeated underscores collapsed.
      3. Drop columns and rows that are entirely NaN.
      4. Replace cells equal to '-' with an empty string.
      5. Ensure a 'player' column exists (inserted first, filled with NaN) and
         collapse names of the form 'Carter Michael Carter' to 'Michael Carter'.
      6. For every column reported by ``columns_with_dollar_or_percent``,
         strip '$', '%', ',' and ')' from string cells, turn '(' into '-',
         and convert the column with ``pd.to_numeric``.

    Parameters:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        ValueError: propagated from ``pd.to_numeric`` when a cell in a
            '$'/'%' column still cannot be parsed as a number after stripping.
    """
    # Labels are compared via str() so non-string labels do not break the filter.
    keep = np.array(['Unnamed' not in str(col) for col in df.columns], dtype=bool)
    # Explicit copy: every later assignment works on our own frame.
    df = df.loc[:, keep].copy()

    df.columns = [_normalize_column_name(col) for col in df.columns]
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')
    df = df.replace('-', '')

    if 'player' not in df.columns:
        df.insert(0, 'player', np.nan)

    df['player'] = [_collapse_repeated_name(value) for value in df['player']]

    for col in columns_with_dollar_or_percent(df):
        # Literal, per-cell stripping: no dependence on the regex default of
        # Series.str.replace, and non-string cells (e.g. a bare 100 next to
        # '$200') are kept instead of being turned into NaN.
        df[col] = pd.to_numeric(df[col].map(_strip_numeric_symbols))

    return df