In [5]:
import pandas as pd
import numpy as np
import re
import os

In [11]:
# clean the tables
# change the column names to lowercase and replace spaces with underscores
# replace "Player (N)" / "player (N)" column names with "player"
# remove empty columns
# remove empty rows
# replace cells that are exactly "-" with an empty string
# remove dollar signs, percent signs, commas and parentheses from columns that should be numeric
# convert columns to numeric
# remove extra underscores from column names
# IMPORTANT: some of these steps are dependent on the previous cleaning steps (h2_to_str)

def is_string_dtype(dtype):
    """Return whether pandas classifies *dtype* as a string dtype.

    Delegates to ``pd.api.types.is_string_dtype`` and returns its result
    unchanged.
    """
    pandas_check = pd.api.types.is_string_dtype
    return pandas_check(dtype)

# Find the columns that hold at least one value containing '$' or '%'
def columns_with_dollar_or_percent(df):
    """Return the labels of columns with a '$' or '%' in any value.

    Each column is cast to ``str`` before searching, so the check can be run
    on columns of any dtype. Labels are returned in the order they appear
    in ``df.columns``.
    """
    symbol_pattern = r'\$|%'

    def has_symbol(label):
        # Stringify first so the .str accessor is available for every dtype.
        as_text = df[label].astype(str)
        return as_text.str.contains(symbol_pattern).any()

    return [label for label in df.columns if has_symbol(label)]

def format_table(df):
    """Clean a raw table: tidy headers, drop empty data, fix names, parse numbers.

    Steps, in order:
      1. Drop columns whose name contains 'Unnamed'.
      2. Rename 'Player (N)' / 'player (N)' headers to 'player', lowercase
         every header, turn spaces into underscores and collapse repeated
         underscores.
      3. Drop columns, then rows, that are entirely NaN.
      4. Replace cells that are exactly '-' with ''.
      5. Insert an all-NaN 'player' column at position 0 if none exists.
      6. Rewrite player names shaped like 'Carter Michael Carter' as
         'Michael Carter'; non-string entries are left untouched.
      7. For every column reported by ``columns_with_dollar_or_percent``,
         strip '$', '%', ',' and ')', turn '(' into '-', and convert the
         column with ``pd.to_numeric``.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: from ``pd.to_numeric`` when a column containing '$' or
            '%' still holds non-numeric text after the symbols are stripped.
    """
    # remove columns with names that contain 'Unnamed'
    df = df.loc[:, ~df.columns.str.contains('Unnamed')]

    # Normalise headers. The 'Player (N)' substitution must run before
    # spaces become underscores, because its pattern contains a space.
    renamed = []
    for col in df.columns:
        col = re.sub(r'Player \(\d+\)|player \(\d+\)', 'player', col)
        col = col.lower().replace(' ', '_')
        renamed.append(re.sub(r'_+', '_', col))
    df.columns = renamed

    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')
    df = df.replace('-', '')

    if 'player' not in df.columns:
        df.insert(0, 'player', np.nan)

    # Example: 'Carter Michael Carter' -> 'Michael Carter'
    duplicated_surname = re.compile(r'^(.*?)\s+(.*?)\s+(\1)')
    df['player'] = [
        duplicated_surname.sub(r'\2 \1', player) if isinstance(player, str) else player
        for player in df['player']
    ]

    # Every symbol below is also a regex metacharacter ('$', '(', ')'), so the
    # replacements are forced to literal matching with regex=False instead of
    # relying on the pandas default, which differs between versions.
    literal_replacements = (
        ('$', ''),
        ('%', ''),
        (',', ''),
        ('(', '-'),  # a leading '(' is turned into a minus sign
        (')', ''),
    )
    for col in columns_with_dollar_or_percent(df):
        cleaned = df[col]
        for symbol, substitute in literal_replacements:
            cleaned = cleaned.str.replace(symbol, substitute, regex=False)
        df[col] = pd.to_numeric(cleaned)

    return df

# attaches position metadata (e.g. unit: offense, defense, special teams) to each row
def add_features(df):
    """Left-join the position lookup table onto *df*.

    Reads 'data/positions.csv' (relative to the current working directory) on
    every call and merges it with ``df['pos']`` matched against the lookup's
    'pos_short' column. Rows of *df* without a match are kept, as in any
    left merge.
    """
    position_lookup = pd.read_csv('data/positions.csv')
    return pd.merge(
        df,
        position_lookup,
        how='left',
        left_on='pos',
        right_on='pos_short',
    )

In [None]:
# Loop through every team's raw CSV files and write cleaned copies.
# Paths are resolved relative to the current working directory:
#   data/team_codes.csv                       (needs 'team_name' and 'team_code' columns)
#   data/teams/<team_name>/<year>/raw/*.csv   input tables
#   data/teams/<team_name>/<year>/clean/      output folder, created here
base_dir = os.getcwd() + '/data/teams/'
team_codes = pd.read_csv(os.getcwd() + '/data/team_codes.csv')
team_codes = team_codes.set_index('team_name')
team_codes = team_codes.to_dict()['team_code']

for team_name in team_codes:
    print(f'Cleaning data for {team_name}...')
    team_dir = os.path.join(base_dir, team_name)
    year_folders = [folder for folder in os.listdir(team_dir) if os.path.isdir(os.path.join(team_dir, folder))]
    for year in year_folders:
        raw_folder = os.path.join(team_dir, year, 'raw')
        clean_folder = os.path.join(team_dir, year, 'clean')
        # An existing clean folder marks the year as already processed.
        if os.path.exists(clean_folder):
            continue
        raw_files = [f for f in os.listdir(raw_folder) if f.endswith('.csv')]
        os.mkdir(clean_folder)
        for file in raw_files:
            # skip cap_totals files for now -- checked before reading so the
            # file is never parsed just to be thrown away
            if 'cap_totals' in file:
                continue
            df = pd.read_csv(os.path.join(raw_folder, file))
            df = format_table(df)
            df = add_features(df)
            file_new = file.replace('raw', 'clean')
            df.to_csv(os.path.join(clean_folder, file_new), index=False)
            print('Cleaned ' + file)