In [12]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

* I will only create CSV files for the years 2015 through 2021 (inclusive), because metres-gained data is not available before 2015.

In [13]:
# Seasons to export: 2015 through 2021 inclusive (range's upper bound is exclusive).
years = list(range(2015, 2022))

* Function that returns the full URL for a given year and statistic code.

In [14]:
def get_url(year, stat):
    """Return the footywire player-rankings URL for one season and statistic.

    ``year`` is converted with ``str``; ``stat`` must already be a string
    (it is appended unchanged as the ``st`` query parameter).
    """
    pieces = (
        "https://www.footywire.com/afl/footy/ft_player_rankings?year=",
        str(year),
        "&rt=LA&st=",
        stat,
    )
    return "".join(pieces)

* Function that creates a pandas DataFrame for a given year and statistic code.

In [15]:
def create_table(year, stat, current_season=2022, timeout=30):
    """Scrape one footywire player-ranking page into a DataFrame.

    Parameters
    ----------
    year : int or str
        Season to fetch. It is passed to ``get_url`` and compared, as an
        int, against ``current_season``.
    stat : str
        Statistic code sent in the URL (e.g. ``"KI"``). It is also looked up
        in ``reversed_dictionary`` to name the statistic column.
    current_season : int, optional
        Season whose page holds the statistic in cell 5 instead of cell 4 and
        whose games-played cut-off is 1 instead of 5. Defaults to 2022.
    timeout : float, optional
        Seconds to wait for the HTTP response. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Team``, ``Games Played`` and the statistic's
        display name, restricted to rows with more than 5 games played
        (more than 1 for ``current_season``).

    Raises
    ------
    KeyError
        If ``stat`` is not a key of ``reversed_dictionary``.
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a games-played cell cannot be parsed as an integer.
    """
    # Resolve the column label first so an unknown code fails before any request is sent.
    stat_column = reversed_dictionary[stat]

    # Decide "current season" once, on the int value, so the cell index and the
    # games-played cut-off always agree (even when ``year`` arrives as a string).
    is_current_season = int(year) == int(current_season)
    # Tables on footywire are formatted differently for the current season.
    stat_cell = 5 if is_current_season else 4
    min_games = 1 if is_current_season else 5

    r = requests.get(get_url(year, stat), timeout=timeout)
    # Fail with a clear HTTP error rather than an obscure parsing error further down.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # NOTE(review): `'width' == 688` and `'border'== 0` are ordinary comparisons that
    # evaluate to False, so False (not a width/border attribute filter) is what gets
    # passed as the second positional argument. The expression is deliberately left
    # untouched because the hard-coded index [8] was chosen against this behaviour;
    # re-validate the index against the live page before changing the filters.
    table = soup.find('table', 'width' == 688).find_all('table', 'border'== 0)[8]

    names_list = []
    teams_list = []
    stats_list = []
    # number of games played
    games_list = []
    # The first row is skipped (presumably the header row - confirm against the page).
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')  # parse the row's cells once instead of per field
        names_list.append(cells[1].text)
        teams_list.append(cells[2].text)
        games_list.append(int(cells[3].text))
        try:
            stat_value = float(cells[stat_cell].text)
        except ValueError:
            # Cells that are not numeric are recorded as 0.
            stat_value = 0
        stats_list.append(stat_value)

    df = pd.DataFrame(data={
        "Name": names_list,
        "Team": teams_list,
        "Games Played": games_list,
        stat_column: stats_list,
    })

    return df[df["Games Played"] > min_games]
        


In [16]:
# Display name -> footywire statistic code.
# Insertion order is significant: list(dictionary.values()) decides the order in
# which statistics are fetched and merged.
dictionary = dict([
    ("Kicks", "KI"),
    ("Handballs", "HB"),
    ("Disposals", "DI"),
    ("Marks", "MA"),
    ("Goals", "GO"),
    ("Behinds", "BE"),
    ("Tackles", "TA"),
    ("Hitouts", "HO"),
    ("Inside 50s", "I5"),
    ("Goal Assists", "GA"),
    ("Contested Possessions", "CP"),
    ("Uncontested Possessions", "UP"),
    ("Effective Disposals", "ED"),
    ("Disposal Efficiency", "EP"),
    ("Clangers", "CG"),
    ("Contested Marks", "CM"),
    ("Marks Inside 50", "M5"),
    ("Clearances", "CL"),
    ("Centre Clearances", "CC"),
    ("Stoppage Clearances", "SP"),
    ("Rebound 50s", "R5"),
    ("One Percenters", "1P"),
    ("Bounces", "BO"),
    ("Metres Gained", "MG"),
    ("Score Involvements", "SI"),
    ("Turnovers", "TO"),
    ("Intercepts", "IT"),
    ("Tackles Inside 50", "T5"),
    ("Time On Ground", "TG"),
])

In [17]:
# Inverse lookup: footywire statistic code -> display name.
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

* Recursively merges every statistic DataFrame for a specific year and returns the final DataFrame

In [18]:
def create_year_table(year, arr):
    """Build one season's combined table by merging every per-statistic table.

    The sequence of statistic codes is halved recursively. A single code is
    fetched with ``create_table``; the two halves are then outer-merged on
    the shared ``Name``, ``Team`` and ``Games Played`` columns.

    Parameters
    ----------
    year : int
        Season passed through to ``create_table``.
    arr : sequence of str
        Statistic codes to include; must contain at least one entry.

    Returns
    -------
    pandas.DataFrame
        The outer merge of all per-statistic tables for ``year``.

    Raises
    ------
    ValueError
        If ``arr`` is empty. Without this guard the halving never reaches the
        single-element base case and recursion continues until
        ``RecursionError``.
    """
    if len(arr) == 0:
        raise ValueError("arr must contain at least one statistic code")
    # base case: a single statistic needs no merging
    if len(arr) == 1:
        return create_table(year, arr[0])
    # otherwise split in two, build each half, and merge the results
    middle = len(arr) // 2
    left_df = create_year_table(year, arr[:middle])
    right_df = create_year_table(year, arr[middle:])
    return left_df.merge(right_df, how="outer", on=["Name", "Team", "Games Played"])

In [12]:
# Write one zipped CSV per season listed in `years`.
for year in years:
    # Merge every statistic for the season; gaps left by the outer merge become 0.
    df = create_year_table(year, list(dictionary.values())).fillna(0)
    file_name = f"{year}_stats.csv"
    zip_name = f"{year}.zip"
    # <year>.zip contains a single member named <year>_stats.csv.
    compression_opts = {"method": "zip", "archive_name": file_name}
    df.to_csv(zip_name, index=False, compression=compression_opts)
    print(f"File for {year} created!")

File for 2015 created!
File for 2016 created!
File for 2017 created!
File for 2018 created!
File for 2019 created!
File for 2020 created!
File for 2021 created!


In [19]:
def _export_season_csv(year):
    """Build the merged table for ``year`` and write it to ``<year>.zip``.

    Every statistic in ``dictionary`` is fetched and merged through
    ``create_year_table``; missing values are replaced with 0. The archive
    contains a single member named ``<year>_stats.csv``.
    """
    df = create_year_table(year, list(dictionary.values())).fillna(0)
    file_name = str(year) + "_stats.csv"
    zip_name = str(year) + ".zip"
    compression_opts = dict(method='zip', archive_name=file_name)
    df.to_csv(zip_name, index=False, compression=compression_opts)
    print("File for " + str(year) + " created!")


# create 2022 table
def create_2022_table():
    """Write the 2022 season to ``2022.zip`` (member ``2022_stats.csv``).

    Takes no arguments and returns None; prints a confirmation line when the
    file has been written.
    """
    # The season is named once here instead of being repeated in every string.
    _export_season_csv(2022)

In [20]:
# Run the 2022 export now: fetches every statistic from footywire and writes 2022.zip.
create_2022_table()

File for 2022 created!


In [21]:
# Display name -> footywire statistic code, identical to `dictionary` except that
# "Metres Gained" is left out.
# NOTE(review): presumably intended for seasons before 2015, where metres-gained
# data is unavailable - confirm against where this mapping is used.
dictionary2 = dict([
    ("Kicks", "KI"),
    ("Handballs", "HB"),
    ("Disposals", "DI"),
    ("Marks", "MA"),
    ("Goals", "GO"),
    ("Behinds", "BE"),
    ("Tackles", "TA"),
    ("Hitouts", "HO"),
    ("Inside 50s", "I5"),
    ("Goal Assists", "GA"),
    ("Contested Possessions", "CP"),
    ("Uncontested Possessions", "UP"),
    ("Effective Disposals", "ED"),
    ("Disposal Efficiency", "EP"),
    ("Clangers", "CG"),
    ("Contested Marks", "CM"),
    ("Marks Inside 50", "M5"),
    ("Clearances", "CL"),
    ("Centre Clearances", "CC"),
    ("Stoppage Clearances", "SP"),
    ("Rebound 50s", "R5"),
    ("One Percenters", "1P"),
    ("Bounces", "BO"),
    ("Score Involvements", "SI"),
    ("Turnovers", "TO"),
    ("Intercepts", "IT"),
    ("Tackles Inside 50", "T5"),
    ("Time On Ground", "TG"),
])

In [None]:
# NOTE(review): rebinds `years` to an empty list, replacing the 2015-2021 list
# defined earlier; presumably a placeholder for further seasons - confirm
# before re-running the export loop.
years = []