## Get Summaries for Visualizing Segment Race Times (and proportions)

#### Categorize And Clean Data

In [1]:
# Try to use Seaborn/Matplotlib to replicate another cool viz from Cole Nussbaumer Knaflic's book Storytelling with Data.
# Inspiration from :
# https://github.com/adamribaudo/storytelling-with-data-ggplot
# Using code from the excellent:
# https://github.com/empathy87/storytelling-with-data

## Import libraries
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
from matplotlib import transforms
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FixedLocator, FixedFormatter
from pandas import DataFrame

%matplotlib inline

In [2]:
# Load the two race result files and tag every row with the race month.
# Forward slashes are used because the backslash form relied on invalid escape
# sequences ("\D", "\A", "\O") and only resolved on Windows; forward slashes
# work on Windows and POSIX alike.
df_Apr = pd.read_csv('../Data/April_2022.csv')
df_Apr['Month'] = "Apr"

df_Oct = pd.read_csv('../Data/October_2022.csv')
df_Oct['Month'] = "Oct"

In [3]:
# Stack both races into one frame. ignore_index=True gives every row a unique
# label; without it the row labels of the April frame are repeated by the
# October frame, which makes label-based selection and alignment ambiguous.
df = pd.concat([df_Apr, df_Oct], ignore_index=True)

#### Calculate Section Times

In [71]:
# Time spent in each 5 km section: the first section is the 5K checkpoint
# column itself, every later section is the difference between two
# consecutive checkpoint columns.
checkpoint_pairs = {
    '5-10km':  ('5K_MINS',  '10K_MINS'),
    '10-15km': ('10K_MINS', '15K_MINS'),
    '15-20km': ('15K_MINS', '20K_MINS'),
}

df['0-5km'] = df['5K_MINS']
for section_name, (start_col, end_col) in checkpoint_pairs.items():
    df[section_name] = df[end_col] - df[start_col]

In [72]:
# Sanity check: number of rows per race month.
df['Month'].value_counts()

Oct    13158
Apr    11572
Name: Month, dtype: int64

In [73]:
# Per-month summary: count of non-null RACE_NUMBER values and mean chip time
# (in minutes).
(
    df.groupby('Month')
      .agg(
          num_runners=('RACE_NUMBER', 'count'),
          time_mins=('CHIP_TIME_MINS', 'mean'),
      )
      .reset_index()
)

Unnamed: 0,Month,num_runners,time_mins
0,Apr,11572,130.399274
1,Oct,13158,130.940266


In [74]:
# Rows without a CATEGORY are first marked 'NOT_GIVEN' and then removed,
# together with any row that already carried that marker.
# (The original note says there are two such rows.)
missing_category = df['CATEGORY'].isnull()
df.loc[missing_category, 'CATEGORY'] = 'NOT_GIVEN'

df = df[df['CATEGORY'].ne('NOT_GIVEN')]

#### Columns

In [75]:
# Display the column labels currently present on the combined frame.
df.columns

Index(['RACE_NUMBER', 'FIRST_NAME', 'SURNAME', 'GENDER', 'GENDER_POSITION',
       'CATEGORY', 'CATEGORY_POSITION', 'CLUB', '5K', '10K', '15K', '20K',
       'OVERALL_POSITION', 'CHIP_TIME', 'CHIP_POSITION', 'CHIP_TIME_MINS',
       '5K_MINS', '10K_MINS', '15K_MINS', '20K_MINS', 'min_per_km_course',
       'min_per_km_0_5', 'min_per_km_6_10', 'min_per_km_11_15',
       'min_per_km_16_20', 'min_per_km_21_FINISH', '5K_10K_SECTION_TIME',
       '10K_15K_SECTION_TIME', '15K_20K_SECTION_TIME', 'Month',
       '0K_5K_SECTION_TIME', 'ANALYSIS_CATEGORY', 'Label', '0-5km', '5-10km',
       '10-15km', '15-20km'],
      dtype='object')

#### Utility function

In [76]:
def flattenHierarchicalCol(col, sep='_'):
    """Collapse a hierarchical (tuple) column label into a single string.

    Intended for use with ``columns.map`` after a pivot or multi-function
    aggregation, where column labels are tuples such as ``('time', 'mean')``.

    Parameters
    ----------
    col : tuple or any
        A column label. Tuples are flattened; anything else is returned
        unchanged.
    sep : str, default '_'
        Text placed between the non-empty levels.

    Returns
    -------
    str or the original label
        The non-empty levels joined by ``sep``. Empty-string levels are
        skipped entirely, so no leading, trailing or doubled separator is
        produced. Non-string levels (e.g. integers) are converted with ``str``.
    """
    if not isinstance(col, tuple):
        return col
    # Join only the non-empty levels: a separator appears strictly between
    # two kept levels, never before the first one.
    return sep.join(str(level) for level in col if level != '')

#df_piv.columns = df_piv.columns.map(flattenHierarchicalCol)
#df_piv.head(10)


#### Recategorize

In [77]:
# Every row starts out in the default analysis category.
default_category = "Normal"
df['ANALYSIS_CATEGORY'] = default_category

In [78]:
# A runner counts as 'Elite' when the chip time is at or below a
# gender-specific cut-off, expressed in minutes.
male_elite_max_mins = 60 + 30     # 1 h 30 min
female_elite_max_mins = 60 + 40   # 1 h 40 min

is_elite_male = (df['CHIP_TIME_MINS'] <= male_elite_max_mins) & (df['GENDER'] == 'Male')
is_elite_female = (df['CHIP_TIME_MINS'] <= female_elite_max_mins) & (df['GENDER'] == 'Female')

df.loc[is_elite_male | is_elite_female, 'ANALYSIS_CATEGORY'] = 'Elite'

#### Extract our rows from  the dataset.



In [79]:

# Tag our own results as 'Us' in both races.
# One data-driven loop replaces four copy-pasted mask blocks: each runner is
# matched by first-name and surname substrings, restricted to the listed months.
us_runners = [('Ellena', 'Green'), ('John', 'Steedman')]
us_months = ['Apr', 'Oct']

in_us_months = df['Month'].isin(us_months)
for first_name, surname in us_runners:
    # regex=False treats the names as plain text; na=False makes rows with a
    # missing name an explicit non-match instead of propagating NaN into the mask.
    is_runner = (
        df['FIRST_NAME'].str.contains(first_name, regex=False, na=False)
        & df['SURNAME'].str.contains(surname, regex=False, na=False)
        & in_us_months
    )
    df.loc[is_runner, 'ANALYSIS_CATEGORY'] = 'Us'


In [80]:
# Check how many rows ended up in each analysis category.
df['ANALYSIS_CATEGORY'].value_counts()

Normal    23261
Elite      1465
Us            4
Name: ANALYSIS_CATEGORY, dtype: int64

#### Improve the row labelling

In [81]:
# The plot label defaults to the analysis category; rows tagged 'Us' get a
# personal name instead, chosen by the GENDER value of the row.
us_label_by_gender = {'Male': 'John', 'Female': 'Ellena'}

df['Label'] = df['ANALYSIS_CATEGORY']
is_us_row = df['ANALYSIS_CATEGORY'] == 'Us'
for gender, display_name in us_label_by_gender.items():
    df.loc[is_us_row & (df['GENDER'] == gender), 'Label'] = display_name

#### Extract the rows for us.

In [102]:
# Keep only the rows tagged 'Us'.
is_us = df['ANALYSIS_CATEGORY'].eq('Us')
df_us = df[is_us]

#### Unpivot df_us  in order to do a line plot

In [107]:
# Reshape from one column per section (wide) to one row per
# runner / month / section (long), which is the shape a line plot needs.
lstSections = ['0-5km', '5-10km', '10-15km', '15-20km']
id_columns = ['Label', 'CHIP_TIME_MINS', 'Month']

df_us2 = df_us[id_columns + lstSections]

df_unpiv = df_us2.melt(
    id_vars=id_columns,
    value_vars=lstSections,
    var_name='SECTION',
    value_name='time_mins',
)
df_unpiv.head(5)

Unnamed: 0,Label,CHIP_TIME_MINS,Month,SECTION,time_mins
0,John,117.4,Apr,0-5km,24.5
1,Ellena,163.25,Apr,0-5km,36.416667
2,John,109.066667,Oct,0-5km,23.816667
3,Ellena,134.9,Oct,0-5km,29.833333
4,John,117.4,Apr,5-10km,27.416667


In [108]:
# Write the long-format data for the chart to the staging folder.
# Forward slashes are used because the backslash form relied on invalid escape
# sequences ("\D", "\S", "\c") and, outside Windows, would be taken as a single
# file name rather than a path into the staging folder.
df_unpiv.to_csv('../Data/Staging/chart2_data.csv', index=False)