# Relative Age Effect in Cricket

1. Under-19 World Cup data
2. Australia senior team data

In [None]:
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import datetime
from dateutil.relativedelta import relativedelta
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import calendar

def get_soup(url):
    """Download a web page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when the page could not be fetched
        or its bytes could not be decoded as UTF-8.

    Notes
    -----
    Only fetch/decode failures are turned into ``None``.  Interrupts
    (Ctrl-C) and parser configuration errors are allowed to propagate so
    that they are not silently hidden.
    """
    # Local import keeps this function self-contained.
    from http.client import HTTPException

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as page:
            html = page.read().decode("utf-8")
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError and socket problems; ValueError
        # covers malformed URLs and UnicodeDecodeError.
        return None

    return BeautifulSoup(html, "lxml")

    
def expected_perc(month):
    """Return the share of a year taken up by the given calendar month.

    The share is the month's length in days divided by 365.25.  Months
    with 31 and 30 days use those lengths; every other value (February
    included) is treated as 28.25 days.
    """
    months_with_31_days = (1, 3, 5, 7, 8, 10, 12)
    months_with_30_days = (4, 6, 9, 11)

    if month in months_with_31_days:
        return 31 / 365.25
    if month in months_with_30_days:
        return 30 / 365.25
    # Anything else falls through to the February average (leap years included).
    return 28.25 / 365.25

# Lookup table: full team name -> short team code.
team_dict = {
    "Afghanistan": "AFG",
    "Australia": "AUS",
    "Bangladesh": "BDESH",
    "England": "ENG",
    "India": "INDIA",
    "Ireland": "IRE",
    "New Zealand": "NZ",
    "Pakistan": "PAK",
    "South Africa": "SA",
    "Sri Lanka": "SL",
    "West Indies": "WI",
    "Zimbabwe": "ZIM",
    "ICC World XI": "ICC",
}

## 1. Under-19 World Cup data

In [None]:
# Read squad information: one squad-index page per Under-19 World Cup edition
squad_urls = {
    '2019/20': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=1204639',
    '2017/18': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=1116872',
    '2015/16': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=949357',
    '2013/14': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=700273',
    '2012': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=526969',
    '2009/10': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=418157',
    '2007/08': 'https://www.espncricinfo.com/ci/content/squad/index.html?object=288278'
}

# p1 captures the team name in front of " Under-19s Squad";
# p2 captures the text inside parentheses, which is parsed as a date below.
# Raw strings avoid invalid-escape warnings for the backslashes.
p1 = re.compile(r"(.*) Under-19s Squad")
p2 = re.compile(r"\((.*)\)")

squad_columns = ['tournament_year', 'tournament_url', 'team', 'squad_url', 'squad_extract_date']

# Collect plain dicts and build the dataframe once at the end.  Growing a
# dataframe row by row copies it on every step, and DataFrame.append is no
# longer available in recent pandas releases.
squad_records = []

for tournament_year, tournament_url in squad_urls.items():
    # Import tournament
    soup = get_soup(tournament_url)
    if soup is None:
        # get_soup returns None when the page cannot be fetched; fail with a
        # clear message instead of an AttributeError on None.
        raise RuntimeError('Could not load squad index for ' + tournament_year
                           + ': ' + tournament_url)

    # Get metadata of each squad
    for span in soup.find_all('span'):
        if 'Under-19s Squad' not in span.text:
            continue

        squad_records.append({
            'tournament_year': tournament_year,
            'tournament_url': tournament_url,
            'team': p1.search(span.text).group(1),
            'squad_url': 'https://www.cricinfo.com' + span.find('a')['href'],
            'squad_extract_date': p2.search(span.text).group(1)
        })

# Initialize squads dataframe (passing columns keeps the schema when no rows were found)
squads = pd.DataFrame(squad_records, columns=squad_columns)

# Format the extraction date
squads['squad_extract_date'] = pd.to_datetime(squads['squad_extract_date'])

In [None]:
# Read player information
player_columns = list(squads.columns) + ['player_id', 'age']

# Rows are collected in a list and turned into a dataframe once, instead of
# appending to the dataframe inside the loop (slow, and DataFrame.append is
# no longer available in recent pandas releases).
player_records = []

# Loop through each squad
for _, row in squads.iterrows():
    # Read squad list
    soup = get_soup(row['squad_url'])
    if soup is None:
        # get_soup returns None on a failed download; stop with a clear message.
        raise RuntimeError('Could not load squad page: ' + str(row['squad_url']))

    # player_id is a running counter that restarts for every squad
    player_id = 0

    # Loop through players: every span mentioning "Age" is treated as one person
    for span in soup.find_all('span'):
        if 'Age' not in span.text:
            continue

        player_id += 1
        player_records.append({
            'tournament_year': row['tournament_year'],
            'tournament_url': row['tournament_url'],
            'team': row['team'],
            'squad_url': row['squad_url'],
            'squad_extract_date': row['squad_extract_date'],
            'player_id': player_id,
            'age': span.text.replace('Age: ', '')
        })

players = pd.DataFrame(player_records, columns=player_columns)

# Cast age variables as integer: first number -> years, optional second -> days
players[['years', 'days']] = players['age'].str.extract(r'(\d+)\D+(\d+)?').fillna(0)
for var in ['years', 'days']:
    players[var] = players[var].astype(int)

# Exclude coaches: anyone listed as older than 20 is flagged as not a player
players['is_valid'] = np.where(players['years'] <= 20, 1, 0)

# Calculate birth date and birth month.
# Years are subtracted as calendar years via DateOffset.  Year-unit numpy
# timedeltas stand for an average year length rather than calendar years
# (NOTE(review): recent pandas versions appear to reject them outright -
# confirm against the installed version).
players['birth_date'] = pd.to_datetime([
    extract_date - pd.DateOffset(years=int(years)) - pd.Timedelta(days=int(days))
    for extract_date, years, days in zip(players['squad_extract_date'],
                                         players['years'],
                                         players['days'])
])

players['birth_month'] = players['birth_date'].dt.month

In [None]:
def melt_summary(df, counts_df=None):
    """Reshape a team-by-month summary table into long format for plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table indexed by ``team`` with one column per birth month.
    counts_df : pandas.DataFrame, optional
        Wide table of player counts used to build the ``"<team> (n=<total>)"``
        labels.  When omitted, the module-level ``counts`` table is used, so
        existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per team/month with the value in ``players`` and the
        display label in ``team_label``.

    Raises
    ------
    KeyError
        If ``df`` contains a team that is missing from the counts table.
    """
    if counts_df is None:
        # Fall back to the global so percentages are still labelled with
        # the number of players rather than with their own row sum.
        counts_df = counts

    # Labels for plot
    totals = counts_df.sum(axis=1).astype(int)
    labels = {team: '{} (n={})'.format(team, total) for team, total in totals.items()}

    # Melt dataframe
    df_melt = df.reset_index().melt(id_vars='team', value_name='players')
    df_melt['team_label'] = [labels[team] for team in df_melt['team']]
    return df_melt

# Keep only rows flagged as players (is_valid == 1); the flag was computed
# to exclude coaches, so it has to be applied before counting.
valid_players = players.loc[players['is_valid'] == 1]

# Counts by team and month
counts = valid_players.pivot_table(index='team',
                                   columns='birth_month',
                                   values='player_id',
                                   aggfunc='count').fillna(0)

# Share of each team's players born in each month (rows sum to 1)
percs = counts.div(counts.sum(axis=1), axis=0)

# Melt them
counts_melt = melt_summary(counts)
percs_melt = melt_summary(percs)

# Calculate difference to expected percentage
percs_melt['diff_vs_expected'] = percs_melt['players'] - percs_melt['birth_month'].apply(expected_perc)

In [None]:
# Only teams with at least this many counted players are plotted
min_players = 50
_players_per_team = counts.sum(axis=1)
teams_to_plot = _players_per_team[_players_per_team >= min_players].index

In [None]:
# Keep the selected teams and give the columns presentation-friendly names.
# rename() returns a new frame, so nothing is assigned onto a .loc slice.
plot_df = (
    percs_melt
    .loc[percs_melt['team'].isin(teams_to_plot),
         ['team_label', 'birth_month', 'diff_vs_expected']]
    .rename(columns={
        'team_label': 'Team',
        'birth_month': 'Month of Birth',
        'diff_vs_expected': 'Excess % of Players',
    })
)

facet_name = 'Team'

# One small bar chart per team, three charts per row
fig = px.bar(plot_df, x='Month of Birth', y='Excess % of Players',
             facet_col=facet_name, facet_col_wrap=3, height=1600,
             title='Excess % of Players born in each calendar month at U/19 Cricket World Cup (since 2007)')

# Facet titles come out as "Team=<label>"; strip the prefix
fig.for_each_annotation(lambda a: a.update(text=a.text.replace(facet_name + '=', '')))

# The plotted values are fractions; show them as percentages on every facet's
# y-axis, matching the axis title and the Australia comparison chart.
fig.update_yaxes(tickformat='%')

fig.show()

In [None]:
# Save the current figure as a standalone HTML file
_u19_html_path = 'relative_age_u19.html'
fig.write_html(_u19_html_path)

## 2. Australia senior team

In [None]:
# Read player list
soup = get_soup('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;filter=advanced;orderby=matches;qualmin1=20;qualval1=matches;size=200;spanmin1=01+jan+2007;spanval1=span;team=2;template=results;type=batting')
if soup is None:
    # get_soup returns None on a failed download; stop with a clear message.
    raise RuntimeError('Could not load the Australia player list')

# Compiled once, outside the loop
dob_pattern = re.compile(r'(.*,\s(19|20)\d\d)')   # text up to and including ", <4-digit year>"
id_pattern = re.compile(r'\d+')                   # first run of digits in the player URL

# Maps the digits taken from each player URL to the extracted date-of-birth text
d = {}
# Player pages on which no date of birth could be located
skipped = []

# Loop through players
for s in soup.find_all('a', {'class': 'data-link'}):
    # Read player
    p_ref = 'https://www.espncricinfo.com/australia/' + s['href'].replace('/ci/', '')
    p_soup = get_soup(p_ref)
    if p_soup is None:
        raise RuntimeError('Could not load player page: ' + p_ref)

    # Extract date of birth from the span inside the second information paragraph
    info = p_soup.find_all('p', {'class': 'ciPlayerinformationtxt'})
    born = info[1].find('span') if len(info) > 1 else None
    match = dob_pattern.search(born.text) if born is not None else None
    if match is None:
        # One unparseable page should not abort the whole scrape; record it
        # and report below so the omission is visible.
        skipped.append(p_ref)
        continue

    # Add to dictionary
    d[id_pattern.search(p_ref)[0]] = match[0]

if skipped:
    print('Skipped ' + str(len(skipped))
          + ' player page(s) without a parseable date of birth: '
          + ', '.join(skipped))

In [None]:
# Calendar dictionary: month name -> month number (calendar.month_name[0] is '')
cal_dict = {name: number for number, name in enumerate(calendar.month_name)}

# Create data frame of birth dates
aus = pd.DataFrame.from_dict(d, orient='index',
                             columns=['dob'])

# Extract month: the first word of the date text is the month name
# (strip first so leading whitespace cannot produce an empty first word)
aus['dob_month'] = aus['dob'].str.strip().str.partition(' ')[0]
aus['birth_month'] = aus['dob_month'].map(cal_dict)

# Calculate percentages.  Reindexing to months 1-12 keeps months in which no
# player was born, so they show up as a deficit instead of being left out.
month_counts = aus.groupby('birth_month').size().reindex(range(1, 13), fill_value=0)
aus_percs = month_counts.rename_axis('birth_month').reset_index(name='players')
aus_percs['players_pct'] = aus_percs['players'] / aus_percs['players'].sum()

# Calculate difference vs expected.  The expected share must come from this
# frame's own birth_month column; using the U/19 frame's column would align
# on row position and compare against the wrong months.
aus_percs['diff_vs_expected'] = aus_percs['players_pct'] - aus_percs['birth_month'].apply(expected_perc)

In [None]:
# Stack the Australia U/19 rows on top of the senior-team rows, tagging each
# set with a Team label so the two can be plotted side by side
_shared_cols = ['birth_month', 'diff_vs_expected']

_is_aus_u19 = percs_melt['team'] == 'Australia'
df1 = percs_melt.loc[_is_aus_u19, _shared_cols].assign(Team='Australia U/19')
df2 = aus_percs.loc[:, _shared_cols].assign(Team='Australia Senior')

df = pd.concat([df1, df2]).rename(columns={
    'birth_month': 'Month of Birth',
    'diff_vs_expected': 'Excess % of Players',
})

In [None]:
# Grouped bar chart: one bar per team for every birth month
_bar_options = dict(
    x='Month of Birth',
    y='Excess % of Players',
    color='Team',
    barmode='group',
    title="Excess % of Players born in each calendar month: Australian men's cricket teams",
)
fig = px.bar(df, **_bar_options)

# Show the y-axis ticks as percentages
fig.update_layout(yaxis=dict(tickformat='%'))

fig.show()

In [None]:
# Save the current figure as a standalone HTML file
_australia_html_path = 'relative_age_australia.html'
fig.write_html(_australia_html_path)