In [3]:
import pandas as pd
from sqlalchemy import create_engine

## Extract CSVs and Excel Sheets into DataFrames

#### 1. Extract game.csv for AFL Game Results

In [4]:
game_file = "games.csv"
game_df = pd.read_csv(game_file)
game_df.head()

Unnamed: 0,gameId,year,round,date,venue,startTime,attendance,homeTeam,homeTeamScore,awayTeam,awayTeamScore,rainfall
0,2021R101,2021,R1,18-Mar-2021,M.C.G.,7:25 PM,49218,Richmond,105,Carlton,80,0.0
1,2021R102,2021,R1,19-Mar-2021,M.C.G.,7:50 PM,46051,Collingwood,53,Western Bulldogs,69,0.0
2,2021R103,2021,R1,20-Mar-2021,M.C.G.,1:45 PM,21365,Melbourne,80,Fremantle,58,0.0
3,2021R104,2021,R1,20-Mar-2021,Adelaide Oval,4:05 PM,26985,Adelaide,103,Geelong,91,0.0
4,2021R105,2021,R1,20-Mar-2021,Docklands,7:25 PM,25128,Essendon,91,Hawthorn,92,0.0


#### 2. Extract status.csv for AFL team player's Performance Report

In [5]:
stats_file = "stats.csv"
stats_df = pd.read_csv(stats_file)
stats_df.head()

Unnamed: 0,gameId,team,year,round,playerId,displayName,gameNumber,Disposals,Kicks,Marks,...,Brownlow Votes,Contested Possessions,Uncontested Possessions,Contested Marks,Marks Inside 50,One Percenters,Bounces,Goal Assists,% Played,Subs
0,2021R104,Adelaide,2021,R1,2021661124,"Berry, Sam",1,8,6,1,...,0,4,5,0,0,0,0,1,80,-
1,2021R104,Adelaide,2021,R1,2012662083,"Brown, Luke",168,5,2,0,...,0,2,3,0,0,0,0,0,23,Off
2,2021R104,Adelaide,2021,R1,2020665315,"Butts, Jordon",3,10,5,3,...,0,5,5,1,0,8,0,0,93,-
3,2021R104,Adelaide,2021,R1,2018689604,"Doedee, Tom",31,13,9,4,...,0,8,6,0,0,7,0,0,84,-
4,2021R104,Adelaide,2021,R1,2018703883,"Frampton, Billy",9,14,10,8,...,0,5,9,3,4,2,0,0,90,-


#### 3. Extract the sheet1 of AFL_Stadiums.xlsx for Venues

In [6]:
venue_file = "AFL_Stadiums.xlsx"
venue_df = pd.read_excel(io=venue_file,sheet_name=0,header=3)
venue_df.columns=['venue_name', 'in_use', 'games', 'goals', 'behinds', 'points', 'ave_sore', 'over_100']
venue_df.head()

Unnamed: 0,venue_name,in_use,games,goals,behinds,points,ave_sore,over_100
0,M.C.G.,1897-2022,2984,76566,74346,533742,89.43,2106
1,Princes Park,1897-2005,1277,31318,33362,221270,86.64,854
2,Docklands,2000-2022,1021,28445,23440,194110,95.06,860
3,Victoria Park,1897-1999,880,19679,22139,140213,79.67,443
4,Junction Oval,1897-1984,734,15692,18378,112530,76.66,319


#### 4. Extract the sheet2 of AFL_Stadiums.xlsx for AFL Stadiums

In [7]:
stadium_file = "AFL_Stadiums.xlsx"
stadium_df = pd.read_excel(io=stadium_file,sheet_name=1,header=2)
stadium_df.columns=['stadium_name', 'city_name', 'state_name', 'capacity']
stadium_df.head()

Unnamed: 0,stadium_name,city_name,state_name,capacity
0,Adelaide Oval,Adelaide,South Australia,53500.0
1,Alberton Oval,Adelaide,South Australia,11000.0
2,Albury Sports Ground,Albury,New South Wales,8000.0
3,Allinsure Park,Queanbeyan,New South Wales,8000.0
4,Arden Street Oval,Melbourne,Victoria,4000.0


## Transform premise DataFrame

### 1. Transform stats DataFrame to player table

In [8]:
player_cols = ["playerId", "displayName"]
player_transformed = stats_df[player_cols].copy()

# split First name and Last name from full name 
player_name = player_transformed['displayName'].str.split(',', expand=True)
player_transformed['first_name']=player_name[0]
player_transformed['last_name']=player_name[1]

player_transformed.drop('displayName',axis=1,inplace=True)

# Rename the column headers
player_transformed = player_transformed.rename(columns={"playerId": "player_id"})

# Clean the data by dropping duplicates and setting the index
player_transformed.drop_duplicates("player_id", inplace=True)
player_transformed.set_index("player_id", inplace=True)

player_transformed.head()

Unnamed: 0_level_0,first_name,last_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2021661124,Berry,Sam
2012662083,Brown,Luke
2020665315,Butts,Jordon
2018689604,Doedee,Tom
2018703883,Frampton,Billy


### 2.Transform stats DataFrame to team table

In [9]:
team_transformed = stats_df["team"].copy()
team_transformed.drop_duplicates(inplace=True)

team_transformed = team_transformed.reset_index()

# set auto-increment ID as team_id
team_transformed['team_id'] = range(1,len(team_transformed)+1)

team_transformed.rename(columns = {'team':'team_name'}, inplace = True)
team_transformed.set_index("team_id", inplace=True)
team_transformed.drop('index',axis=1,inplace=True)

team_transformed.head()

Unnamed: 0_level_0,team_name
team_id,Unnamed: 1_level_1
1,Adelaide
2,Brisbane Lions
3,Carlton
4,Collingwood
5,Essendon


### 3. Transform stadium DataFrame to city table

In [10]:
city_cols = ['city_name', 'state_name']
city_transformed = stadium_df[city_cols].copy()

city_transformed.drop_duplicates("city_name", inplace=True)
city_transformed['city_id'] = range(1,len(city_transformed)+1)

city_transformed.set_index("city_id", inplace=True)

city_transformed.head()

Unnamed: 0_level_0,city_name,state_name
city_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adelaide,South Australia
2,Albury,New South Wales
3,Queanbeyan,New South Wales
4,Melbourne,Victoria
5,Ballarat,Victoria


In [11]:
venue_cols = ['venue_name', 'in_use']
venue = venue_df[venue_cols].copy()

venue['start_year'] = venue['in_use'].astype(str).str[0:4].astype(int)
venue['end_year'] = venue['in_use'].astype(str).str[-4:].astype(int)
venue['stadium_id'] = range(1,len(venue)+1)

# venue.set_index("venue_id", inplace=True)

venue.head()

Unnamed: 0,venue_name,in_use,start_year,end_year,stadium_id
0,M.C.G.,1897-2022,1897,2022,1
1,Princes Park,1897-2005,1897,2005,2
2,Docklands,2000-2022,2000,2022,3
3,Victoria Park,1897-1999,1897,1999,4
4,Junction Oval,1897-1984,1897,1984,5


### 4. Transform stadium DataFrame

In [12]:
stadium_df.replace('Melbourne Cricket Ground','M.C.G.', inplace=True)
stadium_df.replace('Sydney Cricket Ground','S.C.G.', inplace=True)


stadium_transformed = pd.merge(stadium_df, venue, left_on='stadium_name', right_on='venue_name')


def get_city_id(x):
    city_rows = city_transformed.loc[city_transformed['city_name'] == x]
    if len(city_rows)>0:
        return city_rows.index.values[0]
    else:
        return ''
stadium_transformed['city_id'] = stadium_transformed['city_name'].map(get_city_id)


def is_stadium_active(x):
    if x < 2022:
        return False
    else:
        return True
stadium_transformed['active_ind'] = stadium_transformed['end_year'].map(is_stadium_active)


stadium_transformed.drop('city_name', axis=1, inplace=True)
stadium_transformed.drop('state_name', axis=1, inplace=True)
stadium_transformed.drop('venue_name', axis=1, inplace=True)
stadium_transformed.drop('in_use', axis=1, inplace=True)
# stadium_transformed.drop('start_year', axis=1, inplace=True)
# stadium_transformed.drop('end_year', axis=1, inplace=True)


stadium_transformed.drop_duplicates("stadium_id", inplace=True)
stadium_transformed.set_index("stadium_id", inplace=True)



stadium_transformed.head()

Unnamed: 0_level_0,stadium_name,capacity,start_year,end_year,city_id,active_ind
stadium_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23,Adelaide Oval,53500.0,2011,2022,1,True
18,Gabba,41974.0,1981,2022,11,True
16,Glenferrie Oval,10000.0,1925,1973,4,False
29,Manuka Oval,16000.0,1998,2022,29,True
1,M.C.G.,100024.0,1897,2022,4,True


### 5.Transform stats DataFrame adding player performance

In [13]:
stats_cols = ["gameId", "team", "playerId", "Rebounds","Inside 50s","Clearances","Contested Possessions"]
stats_transformed = stats_df[stats_df['year'] >= 2018][stats_cols].copy()


stats_transformed = stats_transformed.rename(columns={"gameId": "game_id",
                                                        "playerId": "player_id",
                                                        "Rebounds": "rebound",
                                                        "Clearances": "clearance",
                                                        "Inside 50s": "inside_50s",
                                                        "Contested Possessions": "contested_position",
                                                     })

def get_team_id(x):
    team_rows = team_transformed.loc[team_transformed['team_name'] == x]
    if len(team_rows)>0:
        return team_rows.index.values[0]
    else:
        return ''
stats_transformed['team_id'] = stats_transformed['team'].map(get_team_id)


stats_transformed['performance'] = stats_transformed[["rebound","inside_50s","clearance","contested_position"]].sum(axis=1)

stats_transformed.drop('team', axis=1, inplace=True)

stats_transformed.drop_duplicates("game_id", inplace=True)
stats_transformed.set_index("game_id", inplace=True)

stats_transformed.tail()


Unnamed: 0_level_0,player_id,rebound,inside_50s,clearance,contested_position,team_id,performance
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018R109,2012674793,0,1,1,7,16,9
2018R404,2016655002,0,2,0,2,16,4
2018R1302,2016655002,4,1,0,5,16,10
2018R207,2015662872,2,1,0,6,17,9
2018R1808,2016676071,0,0,0,2,17,2


### 6. Transform game DataFrame

In [14]:
game_transformed = game_df[game_df['year'] >= 2018]


game_transformed = game_transformed.rename(columns={"gameId": "game_id",
                                                        "startTime": "start_time",
                                                        "homeTeamScore": "home_team_score",
                                                        "awayTeamScore": "away_team_score",
                                                        "rainfall": "rain_fall"
                                                     })

game_transformed['home_team_id'] = game_transformed['homeTeam'].map(get_team_id)
game_transformed['away_team_id'] = game_transformed['awayTeam'].map(get_team_id)
game_transformed.drop('homeTeam', axis=1, inplace=True)
game_transformed.drop('awayTeam', axis=1, inplace=True)


game_transformed = pd.merge(game_transformed, venue, left_on='venue', right_on='venue_name')
game_transformed.drop('venue', axis=1, inplace=True)
game_transformed.drop('start_year', axis=1, inplace=True)
game_transformed.drop('end_year', axis=1, inplace=True)
game_transformed.drop('in_use', axis=1, inplace=True)


game_transformed['date'] = pd.to_datetime(game_transformed['date'], format='%d-%b-%Y')
game_transformed['start_time'] = pd.to_datetime(game_transformed['start_time']).dt.strftime('%H:%M')

game_transformed.drop_duplicates("game_id", inplace=True)
game_transformed.set_index("game_id", inplace=True)


game_transformed.head()

Unnamed: 0_level_0,year,round,date,start_time,attendance,home_team_score,away_team_score,rain_fall,home_team_id,away_team_id,venue_name,stadium_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021R101,2021,R1,2021-03-18,19:25,49218,105,80,0.0,14,3,M.C.G.,1
2021R102,2021,R1,2021-03-19,19:50,46051,53,69,0.0,4,18,M.C.G.,1
2021R103,2021,R1,2021-03-20,13:45,21365,80,58,0.0,11,6,M.C.G.,1
2021R201,2021,R2,2021-03-25,19:20,51723,85,106,3.8,3,4,M.C.G.,1
2021R207,2021,R2,2021-03-28,13:10,41051,49,78,0.1,10,14,M.C.G.,1


## Create database connection

## Load DataFrames into database