In [None]:
import os
from math import isnan

import openpyxl
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [None]:
!pip install psycopg2

In [None]:
import psycopg2 as pg2

Extract CSVs and Excel Sheets into DataFrames
1. Extract games.csv for AFL game results

In [None]:
# Read the games CSV into a DataFrame and preview the first rows.
game_file = "games.csv"
game_df = pd.read_csv(filepath_or_buffer=game_file)
game_df.head(5)

2. Extract stats.csv for the AFL players' performance report

In [None]:
# Read the player statistics CSV into a DataFrame and preview the first rows.
stats_file = "stats.csv"
stats_df = pd.read_csv(filepath_or_buffer=stats_file)
stats_df.head(5)

3. Extract the first sheet of AFL_Stadiums.xlsx for venues

In [None]:
# Read the first worksheet of the stadium workbook; the header row is the
# fourth row of the sheet (header=3 is zero-based).
venue_file = "AFL_Stadiums.xlsx"
venue_df = pd.read_excel(venue_file, sheet_name=0, header=3)

# Replace the sheet's own headers with fixed snake_case column names.
venue_df = venue_df.set_axis(
    ['venue_name', 'in_use', 'games', 'goals', 'behinds', 'points', 'ave_sore', 'over_100'],
    axis=1,
)
venue_df.head(5)

4. Extract the second sheet of AFL_Stadiums.xlsx for AFL stadiums

In [None]:
# Read the second worksheet of the stadium workbook; the header row is the
# third row of the sheet (header=2 is zero-based).
stadium_file = "AFL_Stadiums.xlsx"
stadium_df = pd.read_excel(stadium_file, sheet_name=1, header=2)

# Replace the sheet's own headers with fixed snake_case column names.
stadium_df = stadium_df.set_axis(['name', 'city_name', 'state_name', 'capacity'], axis=1)
stadium_df.head(5)

In [None]:
# Build the player table: one row per playerId, indexed by player_id,
# with first_name / last_name derived from displayName.
player_cols = ["playerId", "displayName"]
player_transformed = (
    stats_df[player_cols]
    .drop_duplicates("playerId")                 # keep the first row seen for each player
    .rename(columns={"playerId": "player_id"})
    .set_index("player_id")
)

# displayName is split as "<last>,<first>" (part 0 -> last_name, part 1 -> first_name).
# n=1 splits on the first comma only, so any further comma stays inside the first name,
# and strip() removes the blanks that surround the comma.
# NOTE(review): assumes every displayName contains a comma - confirm against stats.csv.
player_name = player_transformed["displayName"].str.split(",", n=1, expand=True)
player_transformed["first_name"] = player_name[1].str.strip()
player_transformed["last_name"] = player_name[0].str.strip()

player_transformed = player_transformed.drop(columns="displayName")

player_transformed.head()

In [None]:
# Build the team table from the distinct values of stats_df["team"],
# in order of first appearance.
unique_teams = stats_df["team"].drop_duplicates()

# team_id is a running number starting at 1 and becomes the index;
# the team label is stored under team_name.
team_transformed = pd.DataFrame(
    {
        "team_id": range(1, len(unique_teams) + 1),
        "team_name": unique_teams.to_numpy(),
    }
).set_index("team_id")

team_transformed.head(20)

In [None]:
# Build the city table from the stadium sheet: one row per distinct city_name
# (the first state_name seen for a city is the one kept).
city_cols = ['city_name', 'state_name']
city_transformed = stadium_df[city_cols].drop_duplicates(subset="city_name")

# city_id is a running number starting at 1 and becomes the index.
city_transformed = city_transformed.assign(
    city_id=range(1, len(city_transformed) + 1)
).set_index("city_id")

city_transformed.head()

In [None]:
# Replace long stadium names with the abbreviated forms so that the same
# ground carries one label before names are combined and merged below.
stadium_df.replace(
    {
        'Melbourne Cricket Ground': 'M.C.G.',
        'Sydney Cricket Ground': 'S.C.G.',
        'Jiangwan Stadium (CHN)': 'Jiangwan Stadium',
    },
    inplace=True,
)

# Collect every distinct stadium name seen in the games, stadium and venue data.
# The names are sorted before stadium_id is assigned: iterating a set of strings
# has no stable order across interpreter runs, which would hand out different
# ids on every kernel restart. Missing names are dropped so they get no id.
all_names = pd.concat(
    [game_df["venue"], stadium_df['name'], venue_df['venue_name']],
    ignore_index=True,
)
unique_names = sorted(all_names.dropna().unique(), key=str)

stadium_name = pd.DataFrame({'name': unique_names})
stadium_name['stadium_id'] = range(1, len(stadium_name) + 1)

venue_cols = ['venue_name', 'in_use']
venue = venue_df[venue_cols].rename(columns={'venue_name': 'name'})

# in_use is read as text whose first four characters are the start year and
# whose last four characters are the end year.
in_use_text = venue['in_use'].astype(str)
venue['start_year'] = in_use_text.str[0:4].astype(int)
venue['end_year'] = in_use_text.str[-4:].astype(int)

# Left join: keep every known stadium name, with in_use / years where the venue sheet has them.
venue = pd.merge(stadium_name, venue, on='name', how='left')
venue = venue.sort_values('end_year', ascending=False)

venue.head()

In [None]:
# Outer join of the stadium sheet with the venue table on the stadium name,
# keeping rows that appear on either side.
stadium_transformed = stadium_df.merge(venue, how='outer', on='name')


# Look up a city's id by its name in city_transformed.

def get_city_id(x):
    """Return the index label of the first ``city_transformed`` row whose
    ``city_name`` equals ``x``, or an empty string when no row matches."""
    is_match = (city_transformed['city_name'] == x).to_numpy()
    matching_ids = city_transformed.index.values[is_match]
    if matching_ids.size == 0:
        return ''
    return matching_ids[0]
# Resolve each stadium's city_name to its city_id ('' where the city is not found).
stadium_transformed['city_id'] = stadium_transformed['city_name'].apply(get_city_id)

# A stadium counts as active when its end_year is at least the current year.

def is_stadium_active(x, current_year=2022):
    """Return True when ``x`` (an end year) is not earlier than ``current_year``.

    Parameters
    ----------
    x : number or None
        Last year in which the stadium was in use; NaN or None means unknown.
    current_year : int, default 2022
        Year a stadium must still be in use in to count as active.

    Returns
    -------
    bool
        False for a missing end year or one before ``current_year``, else True.
    """
    # Missing values are checked first: None cannot be compared with an int,
    # and NaN compares False against everything.
    if x is None or isnan(x):
        return False
    return bool(x >= current_year)
# Flag each stadium as active / inactive from its end_year.
stadium_transformed['active_ind'] = stadium_transformed['end_year'].map(is_stadium_active)

# Keep the output columns, one row per stadium_id (used as the index), ordered by
# most recent end_year and then city_id; rows with any missing value are removed.
stadium_output_cols = ['stadium_id', 'name', 'city_id', 'start_year', 'end_year', 'capacity', 'active_ind']
stadium_transformed = (
    stadium_transformed[stadium_output_cols]
    .drop_duplicates(subset="stadium_id")
    .set_index("stadium_id")
    .sort_values(by=['end_year', 'city_id'], ascending=[False, True])
    .dropna()
)
stadium_transformed.head(50)

In [None]:
# Take the statistics columns of interest for rows whose year is 2018 or later,
# then switch the headers to snake_case.
stats_cols = ["gameId", "team", "playerId", "Rebounds","Inside 50s","Clearances","Contested Possessions"]
stats_column_names = {
    "gameId": "game_id",
    "playerId": "player_id",
    "Rebounds": "rebound",
    "Clearances": "clearance",
    "Inside 50s": "inside_50s",
    "Contested Possessions": "contested_possessions",
}
from_2018_onwards = stats_df['year'] >= 2018
stats_transformed = stats_df.loc[from_2018_onwards, stats_cols].rename(columns=stats_column_names)
# Look up a team's id by its name in team_transformed.
def get_team_id(x):
    """Return the index label of the first ``team_transformed`` row whose
    ``team_name`` equals ``x``, or an empty string when no row matches."""
    name_matches = team_transformed['team_name'] == x
    candidate_ids = team_transformed.index.values[name_matches.to_numpy()]
    return candidate_ids[0] if len(candidate_ids) > 0 else ''
# Resolve each row's team name to its team_id ('' where the team is not found).
stats_transformed['team_id'] = stats_transformed['team'].map(get_team_id)

# performance = rebound + inside_50s + clearance + contested_possessions for the row.
performance_cols = ["rebound", "inside_50s", "clearance", "contested_possessions"]
stats_transformed['performance'] = stats_transformed[performance_cols].sum(axis=1)

# De-duplicate on (game_id, player_id). De-duplicating on game_id alone, as before,
# kept a single player's row per game and discarded the statistics of everyone else.
# game_id stays the index (it is not unique: there is one row per player per game).
stats_transformed = (
    stats_transformed
    .drop(columns='team')
    .drop_duplicates(subset=["game_id", "player_id"])
    .set_index("game_id")
)

stats_transformed.tail()

In [None]:
# Keep games whose year is 2018 or later.
# NOTE(review): the earlier note on this cell said 2021 and 2022 are skipped because
# of incomplete COVID-19 data, but this filter keeps them - confirm the intended range.
recent_games = game_df[game_df['year'] >= 2018]

# Switch the headers to snake_case.
game_column_names = {
    "gameId": "game_id",
    "startTime": "start_time",
    "homeTeamScore": "home_team_score",
    "awayTeamScore": "away_team_score",
    "rainfall": "rain_fall",
}
game_transformed = recent_games.rename(columns=game_column_names)

# Swap the home / away team names for their team ids.
game_transformed['home_team_id'] = game_transformed['homeTeam'].map(get_team_id)
game_transformed['away_team_id'] = game_transformed['awayTeam'].map(get_team_id)
game_transformed = game_transformed.drop(columns=['homeTeam', 'awayTeam'])

# Left join with the venue table on the venue name to pick up stadium_id, then
# drop the game's own venue column and the venue-only year columns.
game_transformed = game_transformed.merge(venue, how='left', left_on='venue', right_on='name')
game_transformed = game_transformed.drop(columns=['venue', 'start_year', 'end_year', 'in_use'])

# Parse the date text (day-month abbreviation-year) and render start_time as 24-hour HH:MM text.
game_transformed = game_transformed.assign(
    date=lambda games: pd.to_datetime(games['date'], format='%d-%b-%Y'),
    start_time=lambda games: pd.to_datetime(games['start_time']).dt.strftime('%H:%M'),
)

# One row per game_id, used as the index.
game_transformed = game_transformed.drop_duplicates(subset="game_id").set_index("game_id")

game_transformed.head()

In [None]:
# Re-read the statistics CSV and keep only the rows whose year is 2020.
TVS_df = pd.read_csv(filepath_or_buffer=stats_file)
df1 = TVS_df
df1_2 = df1[df1["year"] == 2020]

In [None]:
# Per-team totals of the four statistics.
# The columns are selected with a list: selecting several groupby columns with a
# bare tuple (['a','b'] without the inner brackets) is deprecated and raises in pandas 2.x.
df2 = df1_2.groupby(['team'])[['Rebounds', 'Inside 50s', 'Clearances', 'Contested Possessions']].sum()

In [None]:
# Add the per-team totals across columns into a single value per team,
# stored in a one-column frame named TVS.
df3 = df2.sum(axis=1)
df4 = df3.to_frame(name='TVS')

# Rank teams from highest to lowest TVS.
TVS_transformed = df4.sort_values(by='TVS', ascending=False)
TVS_transformed

In [None]:
# Write every transformed table to output/<name>_transformed_result.csv.
# - The directory is created first so a fresh checkout does not fail on a missing folder.
# - The index is written: at this point each table keeps its key in the index
#   (set_index above; team for TVS), so index=0 dropped it from the files.
# - TVS gets its own file; it previously overwrote the stats export.
os.makedirs('output', exist_ok=True)

csv_exports = {
    'player': player_transformed,
    'stadium': stadium_transformed,
    'city': city_transformed,
    'team': team_transformed,
    'game': game_transformed,
    'stats': stats_transformed,
    'TVS': TVS_transformed,
}
for table_name, table in csv_exports.items():
    table.to_csv(f'output/{table_name}_transformed_result.csv', index=True)

Create database connection

In [None]:
# "<user>:<password>@<host>:<port>/<database>" part of the PostgreSQL URL.
# Taken from the AFL_DB_CONNECTION environment variable when it is set.
# SECURITY: the fallback below embeds a password in the notebook; set the
# environment variable and remove the literal before sharing or committing.
rsd_connection_string = os.environ.get(
    "AFL_DB_CONNECTION", "postgres:Claudia@localhost:5432/AFLGame_db"
)

In [None]:
# Create the SQLAlchemy engine for the PostgreSQL URL built from the connection string.
engine = create_engine("postgresql://" + rsd_connection_string)

In [None]:
# "<user>:<password>@<host>:<port>/<database>" part of the PostgreSQL URL.
# Taken from the AFL_DB_CONNECTION environment variable when it is set.
# SECURITY: the fallback below embeds a password in the notebook; set the
# environment variable and remove the literal before sharing or committing.
connection_string = os.environ.get(
    "AFL_DB_CONNECTION", "postgres:Claudia@localhost:5432/AFLGame_db"
)
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# List the tables currently present in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector is the supported way to read table names.
inspect(engine).get_table_names()

In [None]:
# Preview the first rows only instead of rendering the whole table.
player_transformed.head()

In [None]:
# Move player_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if player_transformed.index.name == "player_id":
    player_transformed.reset_index(inplace=True)

In [None]:
# Preview the first rows only instead of rendering the whole table.
player_transformed.head()

In [None]:
# Load the player table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
player_transformed.to_sql('player', engine, if_exists='replace', index=False)

In [None]:
# Preview the first rows only instead of rendering the whole table.
stadium_transformed.head()

In [None]:
# Move stadium_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if stadium_transformed.index.name == "stadium_id":
    stadium_transformed.reset_index(inplace=True)
stadium_transformed.head()

In [None]:
# Load the stadium table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
stadium_transformed.to_sql('stadium', engine, if_exists='replace', index=False)

In [None]:
# Move city_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if city_transformed.index.name == "city_id":
    city_transformed.reset_index(inplace=True)
city_transformed.head()

In [None]:
# Load the city table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
city_transformed.to_sql('city', engine, if_exists='replace', index=False)

In [None]:
# Move team_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if team_transformed.index.name == "team_id":
    team_transformed.reset_index(inplace=True)
team_transformed.head()

In [None]:
# Load the team table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
team_transformed.to_sql('team', engine, if_exists='replace', index=False)

In [None]:
# Move game_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if game_transformed.index.name == "game_id":
    game_transformed.reset_index(inplace=True)
game_transformed.head()

In [None]:
# Load the game table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
game_transformed.to_sql('game', engine, if_exists='replace', index=False)

In [None]:
# Move game_id out of the index into a regular column ahead of the database load.
# The guard keeps the cell re-runnable: a second reset_index would add a stray "index" column.
if stats_transformed.index.name == "game_id":
    stats_transformed.reset_index(inplace=True)
stats_transformed.head()

In [None]:
# Load the stats table into PostgreSQL.
# index=False keeps the positional row index out of the table (the key was moved into a
# regular column by the preceding reset_index cell).
# if_exists="replace" drops and recreates the table, so the notebook can be re-run;
# the default ("fail") raises when the table already exists.
stats_transformed.to_sql('stats', engine, if_exists='replace', index=False)