In [None]:
import pandas as pd
import numpy as np
from scipy import stats 
from scipy.stats import zscore

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly

import seaborn as sns

import ipywidgets

import json
import os
import sys
import io
import datetime

# Start by mapping 'games' data in original raw format into dataframe
Data copied manually from the website arrives as a single-column list in which each game occupies 4 consecutive rows:
1. Date
2. (team A) vs. (team B) - in home vs. away order
3. (score) to (score)
4. game_id



In [None]:
# Parse the raw score exports into df_games, one row per game.
#
# Each raw file is a single-column listing in which every game occupies four
# consecutive lines: date, "<home> vs. <away>", score string, game id.
GAME_COLUMNS = ['GAME_ID', 'DATE', 'SCORE_STR', 'TEAMS_STR', 'TOWN_TEAM_HOME', 'TOWN_TEAM_AWAY',
                'GOALS_HOME', 'GOALS_AWAY', 'PTS_HOME', 'PTS_AWAY']

fp_games = ['../data/Grade 6 Girls Soccer - G6G_FALL2022_SCORES_RAW.csv', 
            '../data/Grade 6 Girls Soccer - G6G_SPRING2022_SCORES_RAW.csv']

# Points awarded as (home, away) for each kind of forfeit: the forfeiting team
# gets -3, the other team 0; a double forfeit costs both teams 3 points.
# The markers are checked in this order against the lower-cased score string.
FORFEIT_POINTS = {
    'away forfeit': (0, -3),
    'home forfeit': (-3, 0),
    'double forfeit': (-3, -3),
}


def _score_to_result(score_str, line_no):
    """Translate a raw score string into (goals_home, goals_away, pts_home, pts_away).

    A played game looks like "<home goals> to <away goals>" and is scored
    win = 3, tie = 1, loss = 0.  Forfeits are scored via FORFEIT_POINTS and are
    recorded with 0 goals for both sides, so a forfeited game never inherits
    the goals of the game parsed before it.

    An unrecognised forfeit string is reported with a 'SCORE ERROR' message
    (line_no is the 1-based line number in the file) and recorded as 0 goals /
    0 points for both teams.

    Raises ValueError / IndexError if a non-forfeit score cannot be parsed.
    """
    lowered = score_str.lower()

    if 'forfeit' not in lowered:
        parts = score_str.split('to')
        goals_home, goals_away = int(parts[0]), int(parts[1])
        if goals_home == goals_away:
            return goals_home, goals_away, 1, 1
        if goals_home > goals_away:
            return goals_home, goals_away, 3, 0
        return goals_home, goals_away, 0, 3

    for marker, points in FORFEIT_POINTS.items():
        if marker in lowered:
            return (0, 0) + points

    print(f'SCORE ERROR({line_no}): {score_str}')
    return 0, 0, 0, 0


def _read_games(path):
    """Read one raw score file and return a list of rows ordered like GAME_COLUMNS."""
    # The context manager closes the file even when a record fails to parse.
    with open(path, 'r') as fh:
        lines = [raw.strip() for raw in fh]

    # Blank lines at the end of the file are not part of any record.
    while lines and not lines[-1]:
        lines.pop()

    leftover = len(lines) % 4
    if leftover:
        print(f'INCOMPLETE RECORD: last {leftover} line(s) of {path} ignored')

    rows = []
    for start in range(0, len(lines) - leftover, 4):
        date_str, teams_str, score_str, game_id = lines[start:start + 4]

        game_date = datetime.datetime.strptime(date_str, '%m/%d/%Y')

        # Teams are listed in home vs. away order.
        teams = teams_str.split('vs.')
        team_home = teams[0].strip()
        team_away = teams[1].strip()

        # start is the 0-based index of the date line, so the score line is
        # 1-based line number start + 3.
        goals_home, goals_away, pts_home, pts_away = _score_to_result(score_str, start + 3)

        rows.append([game_id, game_date, score_str, teams_str, team_home, team_away,
                     goals_home, goals_away, pts_home, pts_away])
    return rows


# Collect every row first and build the frame once; appending to a DataFrame
# row by row re-allocates on each insert.
game_rows = []
for fp in fp_games:
    print(fp)
    game_rows.extend(_read_games(fp))

df_games = pd.DataFrame(game_rows, columns=GAME_COLUMNS)

# Convert dataframe columns to desired types
df_games = df_games.astype({
    'DATE': 'datetime64[ns]',
    'GOALS_HOME': 'int',
    'GOALS_AWAY': 'int',
    'PTS_HOME': 'int',
    'PTS_AWAY': 'int',
})

# Numeric season key: games in January-June map to <year>.0, games in
# July-December map to <year>.5.
df_games['SEASON'] = np.where(df_games['DATE'].dt.month.le(6), df_games['DATE'].dt.year + 0, df_games['DATE'].dt.year + 0.5)

df_games


In [None]:
# Break each GAME_ID into its components.  Layout of an id, left to right:
#   - optional season prefix: S = spring / F = fall, followed by YY
#   - 'G' (for Grade) and the grade number
#   - G for girls, B for boys
#   - division id (everything up to the '-')
#   - '-' followed by the game number
game_id_regex = r'(?P<SEASON_IDX>(\w\d\d)?)G(?P<GRADE>\d)(?P<GENDER>\w)(?P<DIVISION>\w\d\w\d)-(?P<GAME_IDX>\d\d\d\d\d)'
id_fields = ['SEASON_IDX', 'GRADE', 'GENDER', 'DIVISION', 'GAME_IDX']

# One column per capture group; grade and game number become integers.
df_temp = df_games['GAME_ID'].str.extract(game_id_regex)
df_temp = df_temp.astype({'GRADE': 'int', 'GAME_IDX': 'int'})

# Attach the parsed components to the games table.
df_games[id_fields] = df_temp[id_fields]

df_games

In [None]:

# Build a table of teams and split each "<town><team>" label into TOWN and
# TEAM columns.
# NOTE(review): only home-side labels are collected here; a team that never
# appears as the home team would be missing - confirm that cannot happen.
uh = df_games['TOWN_TEAM_HOME'].unique()
df_teams = pd.DataFrame({'TOWN_TEAM' : uh}).sort_values(by='TOWN_TEAM')

# Start with empty object-dtype columns so "not matched yet" can be detected.
df_teams['TOWN'] = pd.Series(np.nan, index=df_teams.index, dtype='object')
df_teams['TEAM'] = pd.Series(np.nan, index=df_teams.index, dtype='object')

# Add mapping of town to teams.  Towns are tried longest name first so that a
# town whose name begins with another town's name is matched before the
# shorter one.
df_towns = pd.read_csv('../data/towns.csv', header=None, names=['TOWN'])
df_towns = df_towns.sort_values(by='TOWN', key=lambda x: x.str.len(), ascending=False)

for town in df_towns['TOWN']:
    # Only rows without a town yet are eligible; otherwise a shorter town name
    # tried later would overwrite the longer, more specific match.
    mask = df_teams['TOWN'].isna() & df_teams['TOWN_TEAM'].str.startswith(town, na=False)
    df_teams.loc[mask, 'TOWN'] = town
    # TEAM is whatever follows the town name.
    # NOTE(review): any separator between town and team (e.g. a leading space)
    # is kept as-is - confirm whether it should be stripped.
    df_teams.loc[mask, 'TEAM'] = df_teams.loc[mask, 'TOWN_TEAM'].str[len(town):]

df_teams.to_csv('../data/teams_export.csv', index=False)

df_teams


# Create standings dataframe per season

In [None]:
# Empty standings table with one column per tracked field; it holds no rows yet.
df_standings = pd.DataFrame(columns = ['SEASON', 'DIVISION', 'TEAM', 'TOWN', 'PTS', 'GOALS_FOR', 'GOALS_AGAINST'])
df_standings