In [1]:
import pandas as pd
from sqlalchemy import create_engine
import openpyxl
from math import isnan
import psycopg2 as pg2

## Extract CSVs and Excel Sheets into DataFrames

#### 1. Extract game.csv for AFL Game Results

In [2]:
game_file = "Resources/games.csv"
game_df = pd.read_csv(game_file)
game_df.head()

Unnamed: 0,gameId,year,round,date,venue,startTime,attendance,homeTeam,homeTeamScore,awayTeam,awayTeamScore,rainfall
0,2021SF02,2021,SF,4-Sep-21,Gabba,7:20 PM,30647,Brisbane Lions,78,Western Bulldogs,79,2.2
1,2021SF01,2021,SF,3-Sep-21,Perth Stadium,5:50 PM,44091,Geelong,103,Greater Western Sydney,68,4.6
2,2021R909,2021,R9,16-May-21,Perth Stadium,2:40 PM,43427,West Coast,106,Adelaide,76,0.0
3,2021R908,2021,R9,16-May-21,M.C.G.,3:20 PM,38581,Melbourne,94,Carlton,68,0.1
4,2021R907,2021,R9,16-May-21,Docklands,1:10 PM,26357,Essendon,68,Fremantle,61,0.1


#### 2. Extract stats.csv for AFL team player's Performance

In [3]:
stats_file = "Resources/stats.csv"
stats_df = pd.read_csv(stats_file)
stats_df.head()

Unnamed: 0,gameId,team,year,round,playerId,displayName,gameNumber,Disposals,Kicks,Marks,...,Brownlow Votes,Contested Possessions,Uncontested Possessions,Contested Marks,Marks Inside 50,One Percenters,Bounces,Goal Assists,% Played,Subs
0,2021R104,Adelaide,2021,R1,2021661124,"Berry, Sam",1,8,6,1,...,0,4,5,0,0,0,0,1,80,-
1,2021R104,Adelaide,2021,R1,2012662083,"Brown, Luke",168,5,2,0,...,0,2,3,0,0,0,0,0,23,Off
2,2021R104,Adelaide,2021,R1,2020665315,"Butts, Jordon",3,10,5,3,...,0,5,5,1,0,8,0,0,93,-
3,2021R104,Adelaide,2021,R1,2018689604,"Doedee, Tom",31,13,9,4,...,0,8,6,0,0,7,0,0,84,-
4,2021R104,Adelaide,2021,R1,2018703883,"Frampton, Billy",9,14,10,8,...,0,5,9,3,4,2,0,0,90,-


#### 3. Extract the sheet1 of AFL_Stadiums.xlsx for Venues

In [4]:
venue_file = "Resources/AFL_Stadiums.xlsx"
venue_df = pd.read_excel(io=venue_file,sheet_name=0,header=3)
venue_df.columns=['venue_name', 'in_use', 'games', 'goals', 'behinds', 'points', 'ave_sore', 'over_100']
venue_df.head()

Unnamed: 0,venue_name,in_use,games,goals,behinds,points,ave_sore,over_100
0,M.C.G.,1897-2022,2984,76566,74346,533742,89.43,2106
1,Princes Park,1897-2005,1277,31318,33362,221270,86.64,854
2,Docklands,2000-2022,1021,28445,23440,194110,95.06,860
3,Victoria Park,1897-1999,880,19679,22139,140213,79.67,443
4,Junction Oval,1897-1984,734,15692,18378,112530,76.66,319


#### 4. Extract the sheet2 of AFL_Stadiums.xlsx for AFL Stadiums

In [5]:
stadium_file = "Resources/AFL_Stadiums.xlsx"
stadium_df = pd.read_excel(io=stadium_file,sheet_name=1,header=2)
stadium_df.columns=['name', 'city_name', 'state_name', 'capacity']
stadium_df.head()

Unnamed: 0,name,city_name,state_name,capacity
0,Adelaide Oval,Adelaide,South Australia,53500.0
1,Alberton Oval,Adelaide,South Australia,11000.0
2,Albury Sports Ground,Albury,New South Wales,8000.0
3,Allinsure Park,Queanbeyan,New South Wales,8000.0
4,Arden Street Oval,Melbourne,Victoria,4000.0


### 5. Extract team venue of AFL_team_venues.csv for team or club

In [6]:
team_file = "Resources/AFL_team_venues.csv"
team_df = pd.read_csv(team_file)
team_df.head(20)

Unnamed: 0,team,ask_as,city,stadiums
0,Adelaide,Adelaide Crows,Adelaide,Adelaide Oval
1,Brisbane Lions,Brisbane Lions,Brisbane,Gabba
2,Carlton,Carlton,Melbourne,Marvel Stadium
3,Collingwood,Collingwood,Melbourne,M.C.G.
4,Essendon,Essendon,Melbourne,Marvel Stadium
5,Fremantle,Fremantle,Perth,Optus Stadium
6,Geelong,Geelong Cats,Geelong,GMHBA Stadium
7,Gold Coast,Gold Coast Suns,Gold Coast,Metricon Stadium
8,Greater Western Sydney,GWS Giants,Sydney,"Giants Stadium, Manuka Oval"
9,Hawthorn,Hawthorn,Melbourne,"M.C.G., UTAS Stadium"


## Transform premise DataFrame

### 1. Transform stats DataFrame to player table

In [7]:
# Create a filtered dataframe from specific columns above 2018 as latest data
stats_transformed = stats_df[stats_df['year'] >= 2018]

player_cols = ["playerId", "displayName"]
player_transformed = stats_df[player_cols].copy()

# split First name and Last name from full name 
player_name = player_transformed['displayName'].str.split(',', expand=True)
player_transformed['first_name']=player_name[1]
player_transformed['last_name']=player_name[0]

player_transformed.drop('displayName',axis=1,inplace=True)

# Rename the column headers
player_transformed = player_transformed.rename(columns={"playerId": "player_id"})

# Clean the data by dropping duplicates and setting the index
player_transformed.drop_duplicates("player_id", inplace=True)
player_transformed.set_index("player_id", inplace=True)

player_transformed.head()

Unnamed: 0_level_0,first_name,last_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2021661124,Sam,Berry
2012662083,Luke,Brown
2020665315,Jordon,Butts
2018689604,Tom,Doedee
2018703883,Billy,Frampton


### 2. Transform stadium DataFrame to city table

In [8]:
# Create a filtered dataframe from specific columns we need
city_cols = ['city_name', 'state_name']
city_transformed = stadium_df[city_cols].copy()

# Clean the data by dropping duplicates and setting the index
city_transformed.drop_duplicates("city_name", inplace=True)

#set auto-increment ID as city_id" column with range function by lenth 
city_transformed['city_id'] = range(1,len(city_transformed)+1)
#set index for "city_id"
city_transformed.set_index("city_id", inplace=True)

city_transformed.head()

Unnamed: 0_level_0,city_name,state_name
city_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adelaide,South Australia
2,Albury,New South Wales
3,Queanbeyan,New South Wales
4,Melbourne,Victoria
5,Ballarat,Victoria


### 3.Transform stats DataFrame to team table

In [9]:

# Create a filtered dataframe from specific columns "team"
team_cols = ['team', 'stadiums']
team_transformed = team_df[team_cols].copy()
team_transformed = team_transformed.reset_index()

# set auto-increment ID as team_id
team_transformed['team_id'] = range(1,len(team_transformed)+1)
# Rename the column headers
team_transformed.rename(columns = {'team':'name'}, inplace = True)
# Clean the data by dropping duplicates and setting the index
team_transformed.set_index("team_id", inplace=True)
team_transformed.drop('index',axis=1,inplace=True)

team_transformed.head(20)

Unnamed: 0_level_0,name,stadiums
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adelaide,Adelaide Oval
2,Brisbane Lions,Gabba
3,Carlton,Marvel Stadium
4,Collingwood,M.C.G.
5,Essendon,Marvel Stadium
6,Fremantle,Optus Stadium
7,Geelong,GMHBA Stadium
8,Gold Coast,Metricon Stadium
9,Greater Western Sydney,"Giants Stadium, Manuka Oval"
10,Hawthorn,"M.C.G., UTAS Stadium"


### 4. Transform stadium DataFrame to stadium table

In [10]:

#replacae the name for the same stadium with abbreviation
stadium_df.replace('Melbourne Cricket Ground','M.C.G.', inplace=True)
stadium_df.replace('Sydney Cricket Ground','S.C.G.', inplace=True)
stadium_df.replace('Jiangwan Stadium (CHN)','Jiangwan Stadium', inplace=True)

#get unique names with unique function and union with the stadium name in stadium df and venue df in the same list
stadium_name = set(game_df["venue"].unique().tolist()).union(set(stadium_df['name'].unique().tolist())).union(set(venue_df['venue_name'].unique().tolist()))

stadium_name = pd.DataFrame(list(stadium_name),columns=['name'])
stadium_name['stadium_id'] = range(1,len(stadium_name)+1)


venue_cols = ['venue_name', 'in_use']
venue = venue_df[venue_cols].copy()
venue.rename(columns = {'venue_name':'name'}, inplace = True)

# split the in_use data into start year data as int and end year data as int
venue['start_year'] = venue['in_use'].astype(str).str[0:4].astype(int)
venue['end_year'] = venue['in_use'].astype(str).str[-4:].astype(int)

#left join with dataframe 'stadium_name'
venue = pd.merge(stadium_name, venue, on='name', how='left')
# venue.fillna(0, inplace=True)
venue.sort_values('end_year', ascending=False, inplace=True)

venue.head()


Unnamed: 0,name,stadium_id,in_use,start_year,end_year
107,M.C.G.,108,1897-2022,1897.0,2022.0
88,Stadium Australia,89,2002-2022,2002.0,2022.0
101,Perth Stadium,102,2018-2022,2018.0,2022.0
112,Eureka Stadium,113,2017-2022,2017.0,2022.0
71,Bellerive Oval,72,2012-2022,2012.0,2022.0


In [11]:

stadium_transformed = pd.merge(stadium_df, venue, on='name', how='left')

# to connect same "city_name" in city_transformed and "stadium_transformed" to get the city_id 

def get_city_id(x):
    city_rows = city_transformed.loc[city_transformed['city_name'] == x]
    if len(city_rows)>0: # if the len greater than 0 , return the index as the city_id
        return city_rows.index.values[0]
    else:
        return ''
# stadium_transformed = stadium_df
stadium_transformed['city_id'] = stadium_df['city_name'].map(get_city_id)

#  set criteria as if "end_year" is < 2022,return the boolean value true or false the stadium is active

def is_stadium_active(x):
    if x < 2022:
        return False
    elif isnan(x):
        return False
    else:
        return True
stadium_transformed['active_ind'] = stadium_transformed['end_year'].map(is_stadium_active)#use map to link active_ind back to the end_year list from stadium_transformed


stadium_transformed = stadium_transformed[['stadium_id', 'name', 'city_id', 'start_year', 'end_year', 'capacity', 'active_ind']]

#drop the duplicates and set the index
stadium_transformed.drop_duplicates("stadium_id", inplace=True)
stadium_transformed.set_index("stadium_id", inplace=True)
stadium_transformed.sort_values(by=['end_year', 'city_id'], ascending=[False, True], inplace=True)
# stadium_transformed.dropna(inplace=True)
stadium_transformed.head()

Unnamed: 0_level_0,name,city_id,start_year,end_year,capacity,active_ind
stadium_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
66,Adelaide Oval,1,2011.0,2022.0,53500.0,True
108,M.C.G.,4,1897.0,2022.0,100024.0,True
126,S.C.G.,6,1903.0,2022.0,48000.0,True
41,Gabba,11,1981.0,2022.0,41974.0,True
148,Manuka Oval,29,1998.0,2022.0,16000.0,True


### 5.Transform stats DataFrame adding player performance to Game Player Status table

In [12]:

# Create a filtered dataframe from specific columns above 2018 as latest data
stats_cols = ["gameId", "year", "team", "playerId", "Rebounds","Inside 50s","Clearances","Contested Possessions"]
stats_transformed = stats_df[stats_df['year'] >= 2018][stats_cols].copy()

# Rename the column headers
stats_transformed = stats_transformed.rename(columns={"gameId": "game_id",
                                                        "year": "year",
                                                        "playerId": "player_id",
                                                        "Rebounds": "rebounds",
                                                        "Clearances": "clearances",
                                                        "Inside 50s": "inside_50s",
                                                        "Contested Possessions": "contested_possessions"
                                                     })
# by loc function, to find the 'team_name' from team_transformed df and find its lenth of row and set the index as team_id
def get_team_id(x):
    team_rows = team_transformed.loc[team_transformed['name'] == x]
    if len(team_rows)>0:
        return team_rows.index.values[0]
    else:
        return ''
stats_transformed['team_id'] = stats_transformed['team'].map(get_team_id)

stats_transformed.drop('team', axis=1, inplace=True)
# Clean the data by dropping duplicates and setting the index
stats_transformed.drop_duplicates("game_id", inplace=True)
stats_transformed.set_index("game_id", inplace=True)

stats_transformed.tail()


Unnamed: 0_level_0,year,player_id,rebounds,inside_50s,clearances,contested_possessions,team_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018R109,2018,2012674793,0,1,1,7,16
2018R404,2018,2016655002,0,2,0,2,16
2018R1302,2018,2016655002,4,1,0,5,16
2018R207,2018,2015662872,2,1,0,6,17
2018R1808,2018,2016676071,0,0,0,2,17


### 6. Transform game DataFrame to game table

In [13]:
#select the data from dataframe above 2018 as latest data, we skip data of 2021&2022 due to data incomplete in covid19 
game_transformed = game_df[game_df['year'] >= 2018]

# Rename the column headers
game_transformed = game_transformed.rename(columns={"gameId": "game_id",
                                                        "startTime": "start_time",
                                                        "homeTeamScore": "home_team_score",
                                                        "awayTeamScore": "away_team_score",
                                                        "rainfall": "rain_fall"
                                                     })
#use 'homeTeam' to map to connect with 'home_team_id'
game_transformed['home_team_id'] = game_transformed['homeTeam'].map(get_team_id)
game_transformed['away_team_id'] = game_transformed['awayTeam'].map(get_team_id)
game_transformed.drop('homeTeam', axis=1, inplace=True)
game_transformed.drop('awayTeam', axis=1, inplace=True)

#left merge the game_transformend df with the venue
game_transformed = pd.merge(game_transformed, venue, left_on='venue', right_on='name', how='left')
game_transformed.drop('venue', axis=1, inplace=True)
game_transformed.drop('start_year', axis=1, inplace=True)
game_transformed.drop('end_year', axis=1, inplace=True)
game_transformed.drop('in_use', axis=1, inplace=True)

#str type transfer to date type
game_transformed['date'] = pd.to_datetime(game_transformed['date'], format='%d-%b-%y')
# str type transfered to 24-hr time type
game_transformed['start_time'] = pd.to_datetime(game_transformed['start_time']).dt.strftime('%H:%M')
# drop the duplicates and set index
game_transformed.drop_duplicates("game_id", inplace=True)
game_transformed.set_index("game_id", inplace=True)


game_transformed.head()

Unnamed: 0_level_0,year,round,date,start_time,attendance,home_team_score,away_team_score,rain_fall,home_team_id,away_team_id,name,stadium_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021SF02,2021,SF,2021-09-04,19:20,30647,78,79,2.2,2,18,Gabba,41
2021SF01,2021,SF,2021-09-03,17:50,44091,103,68,4.6,7,9,Perth Stadium,102
2021R909,2021,R9,2021-05-16,14:40,43427,106,76,0.0,17,1,Perth Stadium,102
2021R908,2021,R9,2021-05-16,15:20,38581,94,68,0.1,11,3,M.C.G.,108
2021R907,2021,R9,2021-05-16,13:10,26357,68,61,0.1,5,6,Docklands,116


### 7. Transform TVS DataFrame with team_id, TVS, year 2020

In [23]:
# tvs extract and calucate data only for year 2020
tvs_df1 = stats_transformed[stats_transformed['year'] == 2020]
tvs_df1

Unnamed: 0_level_0,year,player_id,rebounds,inside_50s,clearances,contested_possessions,team_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020R104,2020,2015652891,2,1,0,8,1
2020R205,2020,2015652891,1,4,1,4,1
2020R307,2020,2015652891,0,0,0,1,1
2020R407,2020,2012662083,8,0,0,4,1
2020R507,2020,2012662083,2,1,0,1,1
...,...,...,...,...,...,...,...
2020R1701,2020,2017662715,2,1,0,2,15
2020EF01,2020,2014668660,1,0,1,6,15
2020R401,2020,2016655002,2,3,0,3,16
2020R503,2020,2016655002,1,0,2,3,16


In [24]:
# Sum each four factors base on team_id
tvs_df2 = tvs_df1.groupby(['team_id', 'year']).agg({'rebounds':'sum','inside_50s':'sum', 'clearances':'sum','contested_possessions':'sum'})

# Calculate total vital statistics by total all four factors base on team_id
tvs_df2['tvs'] = tvs_df2.iloc[:,0:4].sum(axis=1)
tvs_df2.reset_index(inplace = True)

# Save tvs data
tvs_cols = ["team_id", "year", "tvs"]
tvs_transformed = tvs_df2[tvs_cols].copy()

TVS_transformed=tvs_transformed.sort_values(['tvs'],ascending=False)
TVS_transformed

Unnamed: 0,team_id,year,tvs
3,4,2020,313
1,2,2020,156
0,1,2020,138
2,3,2020,125
6,7,2020,124
5,6,2020,122
7,8,2020,110
4,5,2020,92
11,12,2020,88
10,11,2020,76


#### Export transformed csv files for checking data quality

In [25]:
player_transformed.to_csv('outputs/player_transformed_result.csv',index=0)
stadium_transformed.to_csv('outputs/stadium_transformed_result.csv',index=0)
city_transformed.to_csv('outputs/city_transformed_result.csv',index=0)
team_transformed.to_csv('outputs/team_transformed_result.csv',index=0)
game_transformed.to_csv('outputs/game_transformed_result.csv',index=0)
stats_transformed.to_csv('outputs/stats_transformed_result.csv',index=0)
tvs_transformed.to_csv('outputs/tvs_transformed_result.csv',index=0)

## Create database connection

In [26]:
rsd_connection_string = "postgres:Claudia@localhost:5432/AFLGame_db"

In [27]:
engine = create_engine(f'postgresql://{rsd_connection_string}')

In [28]:
connection_string = "postgres:Claudia@localhost:5432/AFLGame_db"
engine = create_engine(f'postgresql://{connection_string}')

## Load DataFrames into database

In [21]:
# check table names in DB 
engine.table_names()

  engine.table_names()


OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  password authentication failed for user "postgres"

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
# tiding the table 
player_transformed

In [None]:
player_transformed.reset_index(inplace = True)

In [None]:
#tiding the table 
player_transformed

In [None]:
#uploading data to DB 
player_transformed.to_sql('player', engine)

In [None]:
stadium_transformed

In [None]:
stadium_transformed.reset_index(inplace = True)
stadium_transformed

In [None]:
#uploading data to DB 
stadium_transformed.to_sql('stadium', engine)

In [None]:
city_transformed.reset_index(inplace = True)
city_transformed

In [None]:
#uploading data to DB 
city_transformed.to_sql('city', engine)

In [None]:
team_transformed.reset_index(inplace = True)
team_transformed

In [None]:
#uploading data to DB 
team_transformed.to_sql('team', engine)

In [None]:
game_transformed.reset_index(inplace = True)
game_transformed

In [None]:
#uploading data to DB 
game_transformed.to_sql('game', engine)

In [None]:
stats_transformed.reset_index(inplace = True)
stats_transformed

In [None]:
#uploading data to DB 
stats_transformed.to_sql('stats', engine)

In [None]:
tvs_transformed.reset_index(inplace = True)
tvs_transformed

In [None]:
#uploading data to DB 
tvs_transformed.to_sql('tvs', engine)