# Original ETL for FOOTBALL API
## Author: Ayan Ashkenov
## Date: 12/05/2022
## Description:
Originally, the data warehouse was populated by data from 5 full seasons,  
from 2017 to 2021. The data covered top 5 European leagues: English Premier League,  
Spanish La Liga, Italian Serie A, German Bundesliga, and French Ligue 1.  
  
The code below queries the FOOTBALL API to form fct_fixtures and its dimension  
tables.  
  
# Imports and settings
Necessary packages and options for displaying data

In [1]:
import pandas as pd
import numpy as np
import requests
import json

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Global variables
league_ids list and countries list are in order, meaning that  
league id 39 corresponds to country England and so on

In [2]:
league_ids: list = ['39', '140', '135', '78', '61']
seasons: list = ['2021', '2020', '2019', '2018', '2017']
countries: list = ['England', 'Spain', 'Italy', 'Germany', 'France']

# Get fixture data function
### Needed for fct and dim tables

In [3]:
# This function makes API calls to FOOTBALL API to get fixtures data.
# A single API call retrieves fixture data for 1 league and 1 year/season only.
# This is why a nested for loop is used to cover all 5 leagues and years/seasons.
def get_fixture_data(league_ids: list, seasons: list):
    url_fixtures = "https://api-football-v1.p.rapidapi.com/v3/fixtures"
    fixtures_headers = {
        "X-RapidAPI-Key": "##############################",
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }
    fixtures_data = pd.DataFrame()
    for league in league_ids:
        for year in seasons:
            fixtures_querystring = {"league": league, "season": year}
            fixtures_response = requests.request(
                method="GET", 
                url=url_fixtures, 
                headers=fixtures_headers, 
                params=fixtures_querystring,
            )
            fixtures_response_json = json.loads(fixtures_response.text)
            league_season_fixtures = pd.json_normalize(fixtures_response_json['response'])
            fixtures_data = pd.concat(objs=[fixtures_data, league_season_fixtures])
    return fixtures_data

fixtures_raw: pd.DataFrame = get_fixture_data(league_ids=league_ids, seasons=seasons)
# fixtures_raw: pd.DataFrame = pd.read_csv(filepath_or_buffer='fixture_data.csv')

In [4]:
# Column fixture.referee holds referee names for each fixture (a row is a fixture).
# However, the format differs depending on the year of the data. Sometimes a referee could
# be recorded as M. Oliver, but other times he is recorded as Michael Oliver, England.
# The code below brings this column to one universal standard, which is:
#[first name initial][period][space][last name]
# Ex. M. Oliver
fixtures_raw['referee'] = np.where(
    fixtures_raw['fixture.referee'].str.contains(
        pat=',',
    ), 
    fixtures_raw['fixture.referee'].str.split(',').str[0].str.split(' ').str[0].str[0] + '. ' + fixtures_raw['fixture.referee'].str.split(',').str[0].str.split(' ').str[1],
    fixtures_raw['fixture.referee'],
)

In [5]:
fixtures_raw.head()

Unnamed: 0,fixture.id,fixture.referee,fixture.timezone,fixture.date,fixture.timestamp,fixture.periods.first,fixture.periods.second,fixture.venue.id,fixture.venue.name,fixture.venue.city,fixture.status.long,fixture.status.short,fixture.status.elapsed,league.id,league.name,league.country,league.logo,league.flag,league.season,league.round,teams.home.id,teams.home.name,teams.home.logo,teams.home.winner,teams.away.id,teams.away.name,teams.away.logo,teams.away.winner,goals.home,goals.away,score.halftime.home,score.halftime.away,score.fulltime.home,score.fulltime.away,score.extratime.home,score.extratime.away,score.penalty.home,score.penalty.away,referee
0,710556,M. Oliver,UTC,2021-08-13T19:00:00+00:00,1628881200,1628881000.0,1628885000.0,10503.0,Brentford Community Stadium,"Brentford, Middlesex",Match Finished,FT,90.0,39,Premier League,England,https://media.api-sports.io/football/leagues/3...,https://media.api-sports.io/flags/gb.svg,2021,Regular Season - 1,55,Brentford,https://media.api-sports.io/football/teams/55.png,True,42,Arsenal,https://media.api-sports.io/football/teams/42.png,False,2.0,0.0,1.0,0.0,2.0,0.0,,,,,M. Oliver
1,710557,D. Coote,UTC,2021-08-14T14:00:00+00:00,1628949600,1628950000.0,1628953000.0,512.0,Turf Moor,Burnley,Match Finished,FT,90.0,39,Premier League,England,https://media.api-sports.io/football/leagues/3...,https://media.api-sports.io/flags/gb.svg,2021,Regular Season - 1,44,Burnley,https://media.api-sports.io/football/teams/44.png,False,51,Brighton,https://media.api-sports.io/football/teams/51.png,True,1.0,2.0,1.0,0.0,1.0,2.0,,,,,D. Coote
2,710558,J. Moss,UTC,2021-08-14T14:00:00+00:00,1628949600,1628950000.0,1628953000.0,519.0,Stamford Bridge,London,Match Finished,FT,90.0,39,Premier League,England,https://media.api-sports.io/football/leagues/3...,https://media.api-sports.io/flags/gb.svg,2021,Regular Season - 1,49,Chelsea,https://media.api-sports.io/football/teams/49.png,True,52,Crystal Palace,https://media.api-sports.io/football/teams/52.png,False,3.0,0.0,2.0,0.0,3.0,0.0,,,,,J. Moss
3,710559,A. Madley,UTC,2021-08-14T14:00:00+00:00,1628949600,1628950000.0,1628953000.0,8560.0,Goodison Park,Liverpool,Match Finished,FT,90.0,39,Premier League,England,https://media.api-sports.io/football/leagues/3...,https://media.api-sports.io/flags/gb.svg,2021,Regular Season - 1,45,Everton,https://media.api-sports.io/football/teams/45.png,True,41,Southampton,https://media.api-sports.io/football/teams/41.png,False,3.0,1.0,0.0,1.0,3.0,1.0,,,,,A. Madley
4,710560,C. Pawson,UTC,2021-08-14T14:00:00+00:00,1628949600,1628950000.0,1628953000.0,547.0,King Power Stadium,"Leicester, Leicestershire",Match Finished,FT,90.0,39,Premier League,England,https://media.api-sports.io/football/leagues/3...,https://media.api-sports.io/flags/gb.svg,2021,Regular Season - 1,46,Leicester,https://media.api-sports.io/football/teams/46.png,True,39,Wolves,https://media.api-sports.io/football/teams/39.png,False,1.0,0.0,1.0,0.0,1.0,0.0,,,,,C. Pawson


In [57]:
fixtures_raw.to_sql(name='fixtures_raw_og', con=engine, index=False, if_exists='replace')

142

# Get venue data function
### Needed for dim_location table

In [5]:
# This functino also makes calls to the FOOTBALL API, but to a different endpoint.
# It retrieves venue/location/stadium data.
# A single API call gets venue data for a single country (all stadiums in a country).
def get_venue_data(countries: list):
    url = "https://api-football-v1.p.rapidapi.com/v3/venues"
    headers = {
        "X-RapidAPI-Key": "#######################################",
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }
    venues_data = pd.DataFrame()
    for country in countries:
        querystring = {"country": country}
        response = requests.request(
            method="GET", 
            url=url, 
            headers=headers, 
            params=querystring,
        )
        response_json = json.loads(response.text)
        country_venues = pd.json_normalize(response_json['response'])
        venues_data = pd.concat(objs=[venues_data, country_venues])
    return venues_data

venues_raw: pd.DataFrame = get_venue_data(countries=countries)
# venues_raw: pd.DataFrame = pd.read_csv(filepath_or_buffer='venues_data.csv')

In [6]:
venues_raw.head()

Unnamed: 0,id,name,address,city,country,capacity,surface,image
0,489,Wembley Stadium,"Stadium Way, Wembley, Brent",London,England,90000,grass,https://media.api-sports.io/football/venues/48...
1,556,Old Trafford,Sir Matt Busby Way,Manchester,England,76212,grass,https://media.api-sports.io/football/venues/55...
2,562,St. James' Park,St. James&apos; Street,Newcastle upon Tyne,England,52389,grass,https://media.api-sports.io/football/venues/56...
3,504,Vitality Stadium,"Dean Court, Kings Park","Bournemouth, Dorset",England,12000,grass,https://media.api-sports.io/football/venues/50...
4,535,Craven Cottage,Stevenage Road,London,England,25700,grass,https://media.api-sports.io/football/venues/53...


# -----------------------------------------------------------------------------

# dim_location
## SCD type 2

In [7]:
dim_location_raw_columns: list = [
    'fixture.venue.id',
    'league.country',
    'fixture.venue.city',
    'fixture.venue.name',
    'teams.home.name',
    'fixture.date',
]
dim_location_raw: pd.DataFrame = fixtures_raw[dim_location_raw_columns]

dim_location_grouped: pd.DataFrame = dim_location_raw.groupby(by=['fixture.venue.name']).agg({'fixture.date': [np.min,np.max]}).reset_index()
dim_location_grouped.columns = dim_location_grouped.columns.get_level_values(1)
dim_location_min_max_date: pd.DataFrame = dim_location_grouped.rename(columns={'': 'venue_name', 'amin': 'start_date', 'amax': 'end_date'})

dim_location_date_raw_merge: pd.DataFrame = dim_location_min_max_date.merge(
    right=dim_location_raw.drop_duplicates(subset=['fixture.venue.name'], keep='first'), 
    left_on='venue_name', 
    right_on='fixture.venue.name', 
    how='left',
)
dim_location_date_raw_merge.head()

Unnamed: 0,venue_name,start_date,end_date,fixture.venue.id,league.country,fixture.venue.city,fixture.venue.name,teams.home.name,fixture.date
0,Abanca-Balaídos,2019-08-17T15:00:00+00:00,2022-05-15T17:30:00+00:00,1467.0,Spain,Vigo,Abanca-Balaídos,Celta Vigo,2021-08-15T15:30:00+00:00
1,Allianz Arena,2017-08-18T18:30:00+00:00,2022-05-08T15:30:00+00:00,700.0,Germany,München,Allianz Arena,Bayern Munich,2021-08-22T15:30:00+00:00
2,Allianz Arena (München),2019-05-04T13:30:00+00:00,2019-05-04T13:30:00+00:00,,Germany,,Allianz Arena (München),Bayern Munich,2019-05-04T13:30:00+00:00
3,Allianz Riviera,2017-08-11T17:00:00+00:00,2022-05-14T19:00:00+00:00,663.0,France,Nice,Allianz Riviera,Nice,2021-08-08T13:00:00+00:00
4,Allianz Stadium,2017-08-19T16:00:00+00:00,2022-05-16T18:45:00+00:00,909.0,Italy,Torino,Allianz Stadium,Juventus,2021-08-28T18:45:00+00:00


In [8]:
dim_location_date_raw_venues_merge: pd.DataFrame = dim_location_date_raw_merge.merge(
    right=venues_raw, how='left', left_on='fixture.venue.id', right_on='id',
)
# When a venue becomes out of date, its ID is dropped in the API.
# Using that, it is possible to filter out what venues are not current.
dim_location_date_raw_venues_merge['is_current'] = np.where(
    dim_location_date_raw_venues_merge['id'].isna(), False, True,
)
dim_location_date_raw_venues_merge['end_date'] = np.where(
    dim_location_date_raw_venues_merge['is_current'] == True, '9999-01-01T01:00:00+00:00', dim_location_date_raw_venues_merge.end_date,
)

dim_location_columns: list = [
    'league.country',
    'fixture.venue.city',
    'address',
    'venue_name',
    'teams.home.name',
    'capacity',
    'surface',
    'start_date',
    'end_date',
    'is_current',
]
dim_location_cleaned: pd.DataFrame = dim_location_date_raw_venues_merge[dim_location_columns]
dim_location_prefinal: pd.DataFrame = dim_location_cleaned.rename(
    columns={
        'league.country': 'country',
        'fixture.venue.city': 'city',
        'teams.home.name': 'team_name',
    },
)
dim_location: pd.DataFrame = dim_location_prefinal.reset_index().rename(columns={'index': 'location_key'})

# SCD type 2 requires 2 separate keys/ids.
# The first is the dimension key, which is called location_key in this design.
# The second is venue_id, which identifies the venue (unlike the location key,
# it is not unique and can be repeated)
# Since non-current venues do not have an id, team names are used to assign those
# records the same ids as their current equivalents.
team_names: np.ndarray = fixtures_raw['teams.home.name'].unique()
team_encoding: pd.DataFrame = pd.DataFrame(data={'team_name': team_names})
team_encoding['venue_id'] = team_encoding.index + 100

dim_location: pd.DataFrame = dim_location.merge(right=team_encoding, how='left', left_on='team_name', right_on='team_name')

In [9]:
dim_location.head()

Unnamed: 0,location_key,country,city,address,venue_name,team_name,capacity,surface,start_date,end_date,is_current,venue_id
0,0,Spain,Vigo,Avenida de Balaídos,Abanca-Balaídos,Celta Vigo,31800.0,grass,2019-08-17T15:00:00+00:00,9999-01-01T01:00:00+00:00,True,130
1,1,Germany,München,Werner-Heisenberg-Allee 25,Allianz Arena,Bayern Munich,75000.0,grass,2017-08-18T18:30:00+00:00,9999-01-01T01:00:00+00:00,True,193
2,2,Germany,,,Allianz Arena (München),Bayern Munich,,,2019-05-04T13:30:00+00:00,2019-05-04T13:30:00+00:00,False,193
3,3,France,Nice,Boulevard des Jardiniers,Allianz Riviera,Nice,35624.0,grass,2017-08-11T17:00:00+00:00,9999-01-01T01:00:00+00:00,True,216
4,4,Italy,Torino,Strada Comunale di Altessano 131,Allianz Stadium,Juventus,45666.0,grass,2017-08-19T16:00:00+00:00,9999-01-01T01:00:00+00:00,True,169


In [10]:
dim_location.shape

(196, 12)

In [11]:
# Saving dim_location table to a .csv file to avoid API calls
dim_location.to_csv(path_or_buf='dim_location_og.csv', index=False)
# dim_location: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_location_og.csv')

# -----------------------------------------------------------------------------

# dim_date

In [70]:
# Ignore warnings.
# They do not impact the code in any way.
dim_date_raw_columns: list = ['fixture.date', 'fixture.timezone']
dim_date_raw: pd.DataFrame = fixtures_raw[dim_date_raw_columns]

dim_date_raw['datetime'] = pd.to_datetime(dim_date_raw['fixture.date'])
dim_date_raw['year'] = dim_date_raw['datetime'].dt.year
dim_date_raw['month'] = dim_date_raw['datetime'].dt.month
dim_date_raw['day'] = dim_date_raw['datetime'].dt.day
dim_date_raw['hour'] = dim_date_raw['datetime'].dt.hour
dim_date_raw['minute'] = dim_date_raw['datetime'].dt.minute
dim_date_raw['second'] = dim_date_raw['datetime'].dt.second
dim_date_raw['date'] = dim_date_raw['datetime'].dt.date
dim_date_raw['time'] = dim_date_raw['datetime'].dt.time

dim_date_prefinal: pd.DataFrame = dim_date_raw.drop_duplicates().reset_index(drop=True).rename(
    columns={
        'fixture.date': 'og_datetime', 
        'fixture.timezone': 'timezone',
    },
)
dim_date_prefinal['date_key'] = dim_date_prefinal.index

dim_date_columns: list = [
    'date_key',
    'og_datetime',
    'datetime',
    'year',
    'month',
    'day',
    'hour',
    'minute',
    'second',
    'date',
    'time',
    'timezone',
]
dim_date: pd.DataFrame = dim_date_prefinal[dim_date_columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dim_date_raw['datetime'] = pd.to_datetime(dim_date_raw['fixture.date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dim_date_raw['year'] = dim_date_raw['datetime'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dim_date_raw['month'] = dim_date_raw['datetime'].dt.month
A value is trying t

In [71]:
dim_date.head()

Unnamed: 0,date_key,og_datetime,datetime,year,month,day,hour,minute,second,date,time,timezone
0,0,2021-08-13T19:00:00+00:00,2021-08-13 19:00:00+00:00,2021,8,13,19,0,0,2021-08-13,19:00:00,UTC
1,1,2021-08-14T14:00:00+00:00,2021-08-14 14:00:00+00:00,2021,8,14,14,0,0,2021-08-14,14:00:00,UTC
2,2,2021-08-14T11:30:00+00:00,2021-08-14 11:30:00+00:00,2021,8,14,11,30,0,2021-08-14,11:30:00,UTC
3,3,2021-08-15T13:00:00+00:00,2021-08-15 13:00:00+00:00,2021,8,15,13,0,0,2021-08-15,13:00:00,UTC
4,4,2021-08-14T16:30:00+00:00,2021-08-14 16:30:00+00:00,2021,8,14,16,30,0,2021-08-14,16:30:00,UTC


In [72]:
dim_date.shape

(5145, 12)

In [15]:
# Saving dim_date table to a .csv file to avoid API calls
dim_date.to_csv(path_or_buf='dim_date_og.csv', index=False)
# dim_date: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_date_og.csv')

# -----------------------------------------------------------------------------

# dim_referee

In [16]:
# The string formatting was performed earlier, making this step very straightforward.
dim_referee_raw_columns: list = ['referee']
dim_referee_raw: pd.DataFrame = fixtures_raw[dim_referee_raw_columns]
dim_referee_no_dup: pd.DataFrame = dim_referee_raw.drop_duplicates(subset=['referee']).reset_index(drop=True)
dim_referee_no_dup['referee_key'] = dim_referee_no_dup.index
dim_referee_columns: list = ['referee_key', 'referee']
dim_referee: pd.DataFrame = dim_referee_no_dup[dim_referee_columns]

In [17]:
dim_referee.head()

Unnamed: 0,referee_key,referee
0,0,M. Oliver
1,1,D. Coote
2,2,J. Moss
3,3,A. Madley
4,4,C. Pawson


In [18]:
dim_referee.shape

(223, 2)

In [19]:
# Saving dim_referee table to a .csv file to avoid API calls
dim_referee.to_csv(path_or_buf='dim_referee_og.csv', index=False)
# dim_referee: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_referee_og.csv')

# -----------------------------------------------------------------------------

# dim_team
## SCD type 3
### Note!
dim_team is shared by the 2 fact tables.  
In other words, it requires input from the FOOTBALL API and the Transfer Market API.  
The initial dim_team (raw version) is made in this notebook. The result is then saved  
to a .csv file and passed on to the other ETL notebook.  
Once the table is completed there, it is saved to another .csv file and passed on back  
to this notebook.

In [20]:
dim_team_og_columns: list = ['teams.home.name', 'league.name', 'league.country', 'teams.home.logo']
dim_team_og_raw: pd.DataFrame = fixtures_raw[dim_team_og_columns]
dim_team_og_no_dup: pd.DataFrame = dim_team_og_raw.drop_duplicates(subset=['teams.home.name']).reset_index(drop=True)
dim_team_og_no_dup['team_key'] = dim_team_og_no_dup.index

dim_team_og: pd.DataFrame = dim_team_og_no_dup.rename(columns={
    'teams.home.name': 'team_name',
    'league.name': 'league_name',
    'league.country': 'league_country',
    'teams.home.logo': 'team_logo_url',
})

In [21]:
dim_team_og.head()

Unnamed: 0,team_name,league_name,league_country,team_logo_url,team_key
0,Brentford,Premier League,England,https://media.api-sports.io/football/teams/55.png,0
1,Burnley,Premier League,England,https://media.api-sports.io/football/teams/44.png,1
2,Chelsea,Premier League,England,https://media.api-sports.io/football/teams/49.png,2
3,Everton,Premier League,England,https://media.api-sports.io/football/teams/45.png,3
4,Leicester,Premier League,England,https://media.api-sports.io/football/teams/46.png,4


In [22]:
dim_team_og.shape

(138, 5)

In [32]:
# # Saving dim_team_og table to a .csv file to use in the other ETL notebook
dim_team_og.to_csv(path_or_buf='dim_team_og.csv', index=False)
# dim_team_final: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_final.csv')

In [49]:
dim_team_final: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_final_og.csv')

In [50]:
dim_team_final.head()

Unnamed: 0,team_key,team_name,league_name,league_country,team_logo_url,tm_api_club_id,2017_status,2018_status,2019_status,2020_status,2021_status
0,0,Brentford,Premier League,England,https://media.api-sports.io/football/teams/55.png,1148.0,,,,,
1,1,Burnley,Premier League,England,https://media.api-sports.io/football/teams/44.png,1132.0,Europa League second qualifying round,,,,Relegated
2,2,Chelsea,Premier League,England,https://media.api-sports.io/football/teams/49.png,631.0,UEFA Europa League,UEFA Champions Legue,UEFA Champions Legue,UEFA Champions Legue,UEFA Champions Legue
3,3,Everton,Premier League,England,https://media.api-sports.io/football/teams/45.png,29.0,,,,,
4,4,Leicester,Premier League,England,https://media.api-sports.io/football/teams/46.png,1003.0,,,UEFA Europa League,UEFA Europa League,


# ---------------------------------------------------------------------------------------------

# fct_fixtures

In [36]:
fct_dim_location_merge: pd.DataFrame = fixtures_raw.merge(right=dim_location, how='left', left_on='fixture.venue.name', right_on='venue_name')
fct_dim_date_merge: pd.DataFrame = fct_dim_location_merge.merge(right=dim_date, how='left', left_on='fixture.date', right_on='og_datetime')
fct_dim_referee_merge: pd.DataFrame = fct_dim_date_merge.merge(right=dim_referee, how='left', left_on='fixture.referee', right_on='referee')
fct_dim_team_merge: pd.DataFrame = fct_dim_referee_merge.merge(right=dim_team_final, how='left', left_on='teams.home.name', right_on='team_name')
fct_dim_team_merge_2: pd.DataFrame = fct_dim_team_merge.merge(right=dim_team_final, how='left', left_on='teams.away.name', right_on='team_name')

In [37]:
fct_dim_team_merge_2['fixture_key'] = fct_dim_team_merge_2.index

fct_fixtures_columns: list = [
    'fixture_key',
    'location_key',
    'date_key',
    'referee_key',
    'team_key_x',
    'team_key_y',
    'goals.home',
    'goals.away',
    'teams.home.winner',
    'teams.away.winner',
]

fct_fixtures: pd.DataFrame = fct_dim_team_merge_2[fct_fixtures_columns].rename(
    columns={
        'team_key_x': 'home_team_key',
        'team_key_y': 'away_team_key',
        'goals.home': 'home_goals', 
        'goals.away': 'away_goals', 
        'teams.home.winner': 'home_team_win', 
        'teams.away.winner': 'away_team_win',
    }
)
fct_fixtures['total_goals'] = fct_fixtures['home_goals'] + fct_fixtures['away_goals']

In [38]:
fct_fixtures.head()

Unnamed: 0,fixture_key,location_key,date_key,referee_key,home_team_key,away_team_key,home_goals,away_goals,home_team_win,away_team_win,total_goals
0,0,14.0,0,0.0,0,10,2.0,0.0,True,False,2.0
1,1,178.0,1,1.0,1,12,1.0,2.0,False,True,3.0
2,2,174.0,1,2.0,2,13,3.0,0.0,True,False,3.0
3,3,57.0,1,3.0,3,17,3.0,1.0,True,False,4.0
4,4,67.0,1,4.0,4,19,1.0,0.0,True,False,1.0


In [39]:
fct_fixtures.shape

(9142, 11)

# Load to PostgreSQL

In [73]:
import psycopg2

conn = psycopg2.connect(
    database="cs689_term_project",
    user="postgres",
    password="#######################",
    host="localhost",
    port="5432"
)

cursor = conn.cursor()

sql1 = '''DROP TABLE IF EXISTS dim_location CASCADE'''
cursor.execute(sql1)

sql2 = '''DROP TABLE IF EXISTS dim_date CASCADE'''
cursor.execute(sql2)

sql3 = '''DROP TABLE IF EXISTS dim_referee CASCADE'''
cursor.execute(sql3)

sql4 = '''DROP TABLE IF EXISTS dim_team CASCADE'''
cursor.execute(sql4)

sql5 = '''DROP TABLE IF EXISTS fct_fixtures CASCADE'''
cursor.execute(sql5)

conn.commit()

conn.close()

In [30]:
from sqlalchemy import create_engine

In [74]:
engine = create_engine('postgresql://postgres:############@localhost:5432/cs689_term_project')

dim_location.to_sql(name='dim_location', con=engine, index=False, if_exists='replace')
dim_date.to_sql(name='dim_date', con=engine, index=False, if_exists='replace')
dim_referee.to_sql(name='dim_referee', con=engine, index=False, if_exists='replace')
dim_team_final.rename(columns={'dim_team_id': 'team_key'}).to_sql(name='dim_team', con=engine, index=False, if_exists='replace')
fct_fixtures.to_sql(name='fct_fixtures', con=engine, index=False, if_exists='replace')

145