# Original ETL for Transfer Market API
## Author: Ayan Ashkenov
## Date: 12/05/2022
## Description:
Originally, the data warehouse was populated by data from 5 full seasons,
from 2017 to 2021. The data covered top 5 European leagues: English Premier League,
Spanish La Liga, Italian Serie A, German Bundesliga, and French Ligue 1.

The code below queries the Transfer Market API to form fct_transfers and its
dimension tables.

# Imports and settings

In [12]:
import pandas as pd
import numpy as np
import requests
import json
import time

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Global variables

In [2]:
league_ids = ['GB1', 'ES1', 'IT1', 'L1', 'FR1']
seasons = ['2021', '2020', '2019', '2018', '2017']

# Club name mapping to connect 2 APIs
### Names in Football API and Transfer Market API do not always match.

In [3]:
club_mappings: dict = {'1. FC Köln': 'FC Koln',
 '1.FC Nuremberg': 'FC Nurnberg',
 '1.FSV Mainz 05': 'FSV Mainz 05',
 'Alavés': 'Alaves',
 'Amiens SC': 'Amiens',
 'Angers SCO': 'Angers',
 'Arm. Bielefeld': 'Arminia Bielefeld',
 'Atalanta BC': 'Atalanta',
 'Athletic': 'Athletic Club',
 'Atlético Madrid': 'Atletico Madrid',
 'B. Leverkusen': 'Bayer Leverkusen',
 'Bor. Dortmund': 'Borussia Dortmund',
 "Bor. M'gladbach": 'Borussia Monchengladbach',
 'CA Osasuna': 'Osasuna',
 'CD Leganés': 'Leganes',
 'Cagliari Calcio': 'Cagliari',
 'Celta de Vigo': 'Celta Vigo',
 'Chievo Verona': 'Chievo',
 'Cádiz CF': 'Cadiz',
 'Dep. La Coruña': 'Deportivo La Coruna',
 'E. Frankfurt': 'Eintracht Frankfurt',
 'Elche CF': 'Elche',
 'F. Düsseldorf': 'Fortuna Dusseldorf',
 'FC Empoli': 'Empoli',
 'FC Lorient': 'Lorient',
 'FC Metz': 'Metz',
 'FC Nantes': 'Nantes',
 'G. Bordeaux': 'Bordeaux',
 'Greuther Fürth': 'SpVgg Greuther Furth',
 'Hellas Verona': 'Verona',
 'Hertha BSC': 'Hertha Berlin',
 'LOSC Lille': 'Lille',
 'Man City': 'Manchester City',
 'Man Utd': 'Manchester United',
 'Málaga CF': 'Malaga',
 'Nîmes Olympique': 'Nimes',
 'OGC Nice': 'Nice',
 'Olympique Lyon': 'Lyon',
 'Paris SG': 'Paris Saint Germain',
 'R. Strasbourg': 'Strasbourg',
 'RCD Mallorca': 'Mallorca',
 'Real Valladolid': 'Valladolid',
 'SC Paderborn': 'SC Paderborn 07',
 'SD Eibar': 'Eibar',
 'SD Huesca': 'Huesca',
 'SM Caen': 'Caen',
 'SPAL': 'Spal',
 'SSC Napoli': 'Napoli',
 'Saint-Étienne': 'Saint Etienne',
 'Sevilla FC': 'Sevilla',
 'Sheff Utd': 'Sheffield Utd',
 'Spezia Calcio': 'Spezia',
 'Spurs': 'Tottenham',
 'Stade Brestois': 'Stade Brestois 29',
 'Stade Reims': 'Reims',
 'TSG Hoffenheim': '1899 Hoffenheim',
 'Troyes': 'Estac Troyes',
 'UD Las Palmas': 'Las Palmas',
 'Udinese Calcio': 'Udinese',
 'VfL Bochum': 'VfL BOCHUM',
}

# with open("club_mappings.json", "w") as outfile:
#     json.dump(club_mappings, outfile)

# ---------------------------------------------------------------------------------------------

# dim_team

### Get dict with league data from Transfer Market API

In [None]:
def get_clubs_data(league_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/competitions/get-table"
    headers = {
        "X-RapidAPI-Key": "##############################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    container: dict = {}
    for year in seasons:
        clubs_data = pd.DataFrame()
        
        for league in league_ids:
            querystring = {"id": league, "seasonID": year, "domain":"com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            league_season_clubs = pd.json_normalize(response_json['table'])
            league_season_clubs['season'] = year
            
            if league == 'GB1':
                league_season_clubs['league'] = 'Premier League'
            elif league == 'ES1':
                league_season_clubs['league'] = 'La Liga'
            elif league == 'IT1':
                league_season_clubs['league'] = 'Serie A'
            elif league == 'L1':
                league_season_clubs['league'] = 'Bundesliga'
            else:
                league_season_clubs['league'] = 'Ligue 1'
            clubs_data = pd.concat(objs=[clubs_data, league_season_clubs])
            
        columns: list = ['id', 'clubName', 'rank', 'markDescription', 'season', 'league']
        clubs_data_cleaned: pd.DataFrame = clubs_data[columns]
        clubs_data_renamed: pd.DataFrame = clubs_data_cleaned.rename(
            columns={'id': 'club_id', 'clubName': 'club_name', 'rank': 'league_position', 'markDescription': f'{year}_status'}
        )
        clubs_data_final: pd.DataFrame = clubs_data_renamed.reset_index(drop=True)
        clubs_data_final.loc[clubs_data_final[f'{year}_status'] == '', f'{year}_status'] = 'UEFA Champions Legue'
        clubs_data_final.loc[clubs_data_final['league_position'] == 1, f'{year}_status'] = 'UEFA Champions Legue'
        container[year] = clubs_data_final
    return container

container: dict = get_clubs_data(league_ids=league_ids, seasons=seasons)

# container['2017'].to_csv(path_or_buf='2017_season_data.csv', index=False)
# container['2018'].to_csv(path_or_buf='2018_season_data.csv', index=False)
# container['2019'].to_csv(path_or_buf='2019_season_data.csv', index=False)
# container['2020'].to_csv(path_or_buf='2020_season_data.csv', index=False)
# container['2021'].to_csv(path_or_buf='2021_season_data.csv', index=False)

In [4]:
# To avoid API calls (which cost money and take some time to execute),
# I saved dataframes to csv files for easy access.
container: dict = {}
for season in seasons:
    file_path: str = season + '_season_data.csv'
    season_data: pd.DataFrame = pd.read_csv(filepath_or_buffer=file_path)
    container[season] = season_data

In [5]:
container['2017'].head()

Unnamed: 0,club_id,club_name,league_position,2017_status,season,league
0,281,Man City,1,UEFA Champions Legue,2017,Premier League
1,985,Man Utd,2,UEFA Champions Legue,2017,Premier League
2,148,Spurs,3,UEFA Champions Legue,2017,Premier League
3,31,Liverpool,4,UEFA Champions Legue,2017,Premier League
4,631,Chelsea,5,UEFA Europa League,2017,Premier League


# ---------------------------------------------------------------------------------------------

### Get unique clubs from Transfer Market API

In [5]:
def get_unique_clubs(league_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/competitions/get-table"
    headers = {
        "X-RapidAPI-Key": "#################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    clubs_data = pd.DataFrame()
    for year in seasons:
        for league in league_ids:
            querystring = {"id": league, "seasonID": year, "domain":"com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            league_season_clubs = pd.json_normalize(response_json['table'])
            
            if league == 'GB1':
                league_season_clubs['league'] = 'Premier League'
            elif league == 'ES1':
                league_season_clubs['league'] = 'La Liga'
            elif league == 'IT1':
                league_season_clubs['league'] = 'Serie A'
            elif league == 'L1':
                league_season_clubs['league'] = 'Bundesliga'
            else:
                league_season_clubs['league'] = 'Ligue 1'
                
            clubs_data = pd.concat(objs=[clubs_data, league_season_clubs])
    columns: list = [
        'id',
        'clubName',
        'league',
    ]
    clubs_data_prefinal: pd.DataFrame = clubs_data[columns].rename(columns={'league': 'league_name'})
    clubs_data_final: pd.DataFrame = clubs_data_prefinal.drop_duplicates(subset=['clubName'])
    return clubs_data_final

unique_clubs_data: pd.DataFrame = get_unique_clubs(league_ids=league_ids, seasons=seasons)

In [6]:
# unique_clubs_data.to_csv(path_or_buf='unique_clubs_from_transfers_api.csv', index=False)
unique_clubs_data: pd.DataFrame = pd.read_csv(filepath_or_buffer='unique_clubs_from_transfers_api.csv')
unique_clubs_data.head()

Unnamed: 0,id,clubName,league_name
0,281,Man City,Premier League
1,31,Liverpool,Premier League
2,631,Chelsea,Premier League
3,148,Spurs,Premier League
4,11,Arsenal,Premier League


In [7]:
for season in seasons:
    unique_clubs_data: pd.DataFrame = unique_clubs_data.merge(
        right=container[season], 
        left_on=['id'], 
        right_on=['club_id'], 
        how='left',
    )

  unique_clubs_data: pd.DataFrame = unique_clubs_data.merge(


In [8]:
unique_clubs_data.head()

Unnamed: 0,id,clubName,league_name,club_id_x,club_name_x,league_position_x,2021_status,season_x,league_x,club_id_y,club_name_y,league_position_y,2020_status,season_y,league_y,club_id_x.1,club_name_x.1,league_position_x.1,2019_status,season_x.1,league_x.1,club_id_y.1,club_name_y.1,league_position_y.1,2018_status,season_y.1,league_y.1,club_id,club_name,league_position,2017_status,season,league
0,281,Man City,Premier League,281.0,Man City,1.0,UEFA Champions Legue,2021.0,Premier League,281.0,Man City,1.0,UEFA Champions Legue,2020.0,Premier League,281.0,Man City,2.0,UEFA Champions Legue,2019.0,Premier League,281.0,Man City,1.0,UEFA Champions Legue,2018.0,Premier League,281.0,Man City,1.0,UEFA Champions Legue,2017.0,Premier League
1,31,Liverpool,Premier League,31.0,Liverpool,2.0,UEFA Champions Legue,2021.0,Premier League,31.0,Liverpool,3.0,UEFA Champions Legue,2020.0,Premier League,31.0,Liverpool,1.0,UEFA Champions Legue,2019.0,Premier League,31.0,Liverpool,2.0,UEFA Champions Legue,2018.0,Premier League,31.0,Liverpool,4.0,UEFA Champions Legue,2017.0,Premier League
2,631,Chelsea,Premier League,631.0,Chelsea,3.0,UEFA Champions Legue,2021.0,Premier League,631.0,Chelsea,4.0,UEFA Champions Legue,2020.0,Premier League,631.0,Chelsea,4.0,UEFA Champions Legue,2019.0,Premier League,631.0,Chelsea,3.0,UEFA Champions Legue,2018.0,Premier League,631.0,Chelsea,5.0,UEFA Europa League,2017.0,Premier League
3,148,Spurs,Premier League,148.0,Spurs,4.0,UEFA Champions Legue,2021.0,Premier League,148.0,Spurs,7.0,UEFA Europa Conference League Qualifikation,2020.0,Premier League,148.0,Spurs,6.0,UEFA Europa League,2019.0,Premier League,148.0,Spurs,4.0,UEFA Champions Legue,2018.0,Premier League,148.0,Spurs,3.0,UEFA Champions Legue,2017.0,Premier League
4,11,Arsenal,Premier League,11.0,Arsenal,5.0,UEFA Europa League,2021.0,Premier League,11.0,Arsenal,8.0,,2020.0,Premier League,11.0,Arsenal,8.0,,2019.0,Premier League,11.0,Arsenal,5.0,UEFA Europa League,2018.0,Premier League,11.0,Arsenal,6.0,UEFA Europa League,2017.0,Premier League


# dim_team
## SCD type 3

In [9]:
dim_team_columns: list = [
    'id', 
    'clubName', 
    'league_name', 
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
]
dim_team: pd.DataFrame = unique_clubs_data[dim_team_columns].rename(columns={'clubName': 'club_name'})

# with open('club_mappings.json') as json_file:
#     club_mappings = json.load(json_file)
    
dim_team['new_club_name'] = dim_team['club_name'].map(club_mappings).fillna(dim_team['club_name'])

In [10]:
columns_dim_team_part1: list = [
    'id',
    'new_club_name',
    'league_name', 
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
]
dim_team_part1: pd.DataFrame = dim_team[columns_dim_team_part1].rename(
    columns={'id': 'tm_api_club_id', 'new_club_name': 'club_name'}
)

In [19]:
# dim_team_part1.to_csv(path_or_buf='dim_team_part1.csv', index=False)
# dim_team_part1: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_part1.csv')
dim_team_og: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_og.csv')
dim_team_merged: pd.DataFrame = dim_team_og.merge(right=dim_team_part1, how='left', left_on='team_name', right_on='club_name')

dim_team_final_columns: list = [
    'team_key',
    'team_name',
    'league_name_x',
    'league_country',
    'team_logo_url',
    'tm_api_club_id',
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
]
dim_team_final: pd.DataFrame = dim_team_merged[dim_team_final_columns].rename(columns={'league_name_x': 'league_name'})

# dim_team_final['2022_status'] = ''
# dim_team_final['2023_status'] = ''
# dim_team_final['2024_status'] = ''
# dim_team_final['2025_status'] = ''
# dim_team_final['2026_status'] = ''
# dim_team_final['2027_status'] = ''
# dim_team_final['2028_status'] = ''
# dim_team_final['2029_status'] = ''
# dim_team_final['2030_status'] = ''

dim_team_final.head()

Unnamed: 0,team_key,team_name,league_name,league_country,team_logo_url,tm_api_club_id,2017_status,2018_status,2019_status,2020_status,2021_status
0,0,Brentford,Premier League,England,https://media.api-sports.io/football/teams/55.png,1148.0,,,,,
1,1,Burnley,Premier League,England,https://media.api-sports.io/football/teams/44.png,1132.0,Europa League second qualifying round,,,,Relegated
2,2,Chelsea,Premier League,England,https://media.api-sports.io/football/teams/49.png,631.0,UEFA Europa League,UEFA Champions Legue,UEFA Champions Legue,UEFA Champions Legue,UEFA Champions Legue
3,3,Everton,Premier League,England,https://media.api-sports.io/football/teams/45.png,29.0,,,,,
4,4,Leicester,Premier League,England,https://media.api-sports.io/football/teams/46.png,1003.0,,,UEFA Europa League,UEFA Europa League,


In [20]:
dim_team_final.shape

(138, 11)

In [21]:
dim_team_final.to_csv(path_or_buf='dim_team_final_og.csv', index=False)
# dim_team_final: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_final.csv')

# ---------------------------------------------------------------------------------------------

# Get transfers data for fct_transfers (arrivals and departures)

In [15]:
team_ids: list = unique_clubs_data['id'].tolist()

def get_transfers_arrivals_data(team_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/transfers/list-by-club"
    headers = {
        "X-RapidAPI-Key": "########################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    transfers_arrivals_data = pd.DataFrame()
    for year in seasons:
        time.sleep(2)
        for team in team_ids:
            querystring = {"id": team, "seasonID": year ,"domain": "com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            team_season_transfers_arrivals = pd.json_normalize(response_json['currentSeason']['transferArrivals'])
            team_season_transfers_arrivals['team_id'] = team
            team_season_transfers_arrivals['season'] = year
            transfers_arrivals_data = pd.concat(objs=[transfers_arrivals_data, team_season_transfers_arrivals])
    return transfers_arrivals_data

# transfers_arrivals_data = get_transfers_arrivals_data(team_ids=team_ids, seasons=seasons)
# transfers_arrivals_data.head()

In [23]:
# transfers_arrivals_data.to_csv(path_or_buf='transfers_arrivals_data.csv', index=False)
transfers_arrivals: pd.DataFrame = pd.read_csv(filepath_or_buffer='/Users/ayan/Desktop/BU/Fall 2022/CS689/transfers_arrivals_data.csv')

In [17]:
def get_transfers_departures_data(team_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/transfers/list-by-club"
    headers = {
        "X-RapidAPI-Key": "################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    transfers_departures_data = pd.DataFrame()
    for year in seasons:
        time.sleep(2)
        for team in team_ids:
            querystring = {"id": team, "seasonID": year ,"domain": "com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            team_season_transfers_departures = pd.json_normalize(response_json['currentSeason']['transferDepartures'])
            team_season_transfers_departures['team_id'] = team
            team_season_transfers_departures['season'] = year
            transfers_departures_data = pd.concat(objs=[transfers_departures_data, team_season_transfers_departures])
    return transfers_departures_data

#transfers_departures_data = get_transfers_departures_data(team_ids=team_ids, seasons=seasons)
#transfers_departures_data.head()

In [24]:
# transfers_departures_data.to_csv(path_or_buf='transfers_departures_data.csv', index=False)
transfers_departures: pd.DataFrame = pd.read_csv(filepath_or_buffer='/Users/ayan/Desktop/BU/Fall 2022/CS689/transfers_departures_data.csv')

# fct_transfers_raw

In [25]:
transfers_arrivals['transfer_type'] = 'arrival'
transfers_departures['transfer_type'] = 'departure'

concat_frames: list = [transfers_arrivals, transfers_departures]
fct_transfers_raw: pd.DataFrame = pd.concat(objs=concat_frames, ignore_index=True)

In [26]:
fct_transfers_raw.shape

(30699, 19)

# dim_transfer_type

In [17]:
dim_type_raw_columns: list = ['transfer_type']
dim_type_raw: pd.DataFrame = fct_transfers_raw[dim_type_raw_columns]
dim_type_no_dup: pd.DataFrame = dim_type_raw.drop_duplicates(subset=['transfer_type']).reset_index(drop=True)
dim_type_no_dup['transfer_type_key'] = dim_type_no_dup.index
dim_type_columns: list = ['transfer_type_key', 'transfer_type']
dim_type: pd.DataFrame = dim_type_no_dup[dim_type_columns]

In [None]:
# dim_type.to_csv(path_or_buf='dim_transfer_type.csv', index=False)
dim_type: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_transfer_type.csv')

In [18]:
dim_type.head()

Unnamed: 0,transfer_type_key,transfer_type
0,0,arrival
1,1,departure


# dim_position

In [19]:
dim_position_raw_columns: list = ['positionsdetail']
dim_position_raw: pd.DataFrame = fct_transfers_raw[dim_position_raw_columns]
dim_position_no_dup: pd.DataFrame = dim_position_raw.drop_duplicates(subset=['positionsdetail']).reset_index(drop=True).rename(columns={'positionsdetail': 'position'})
dim_position_no_dup['position_key'] = dim_position_no_dup.index + 10
dim_position_columns: list = ['position_key', 'position']
dim_position: pd.DataFrame = dim_position_no_dup[dim_position_columns]

In [None]:
# dim_position.to_csv(path_or_buf='dim_position.csv', index=False)
dim_position: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_position.csv')

In [20]:
dim_position.head()

Unnamed: 0,position_key,position
0,10,Centre-Back
1,11,Centre-Forward
2,12,Right Winger
3,13,Attacking Midfield
4,14,Left Winger


In [21]:
dim_position.shape

(14, 2)

# dim_details

In [29]:
dim_details_raw_columns: list = ['playerName', 'clubName', 'transfer_type', 'team_id']
dim_details_raw: pd.DataFrame = fct_transfers_raw[dim_details_raw_columns]
dim_details_no_dup: pd.DataFrame = dim_details_raw.drop_duplicates(subset=['playerName']).reset_index(drop=True).rename(columns={'playerName': 'player_name', 'clubName': 'to/from_club_name', 'team_id': 'tm_api_club_id_origin'})
dim_details_no_dup['details_key'] = dim_details_no_dup.index + 500
dim_details_columns: list = ['details_key', 'player_name', 'to/from_club_name', 'transfer_type', 'tm_api_club_id_origin']
dim_details: pd.DataFrame = dim_details_no_dup[dim_details_columns]

In [None]:
# dim_details.to_csv(path_or_buf='dim_details.csv', index=False)
dim_details: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_details.csv')

In [30]:
dim_details.head()

Unnamed: 0,details_key,player_name,to/from_club_name,transfer_type,tm_api_club_id_origin
0,500,E. Palmer-Brown,Troyes,arrival,281
1,501,J. Álvarez,River Plate,arrival,281
2,502,P. Sandler,Troyes,arrival,281
3,503,Patrick Roberts,Troyes,arrival,281
4,504,Luka Ilic,Twente FC,arrival,281


In [31]:
dim_details.shape

(8299, 5)

# dim_transfer_date

In [40]:
dim_transfer_date_raw_columns: list = ['date']
dim_transfer_date_raw: pd.DataFrame = fct_transfers_raw[dim_transfer_date_raw_columns]
dim_transfer_date_no_dup: pd.DataFrame = dim_transfer_date_raw.drop_duplicates(subset=['date']).reset_index(drop=True)
dim_transfer_date_no_dup['transfer_date_key'] = dim_transfer_date_no_dup.index + 1_000

In [41]:
dim_transfer_date_no_dup['datetime'] = pd.to_datetime(arg=dim_transfer_date_no_dup['date'], format='%b %d, %Y', errors='coerce')
dim_transfer_date_no_dup['year'] = dim_transfer_date_no_dup['datetime'].dt.year
dim_transfer_date_no_dup['month'] = dim_transfer_date_no_dup['datetime'].dt.month
dim_transfer_date_no_dup['day'] = dim_transfer_date_no_dup['datetime'].dt.day

In [42]:
dim_transfer_date_columns: list = [
    'transfer_date_key',
    'date',
    'datetime',
    'year',
    'month',
    'day',
]
dim_transfer_date: pd.DataFrame = dim_transfer_date_no_dup[dim_transfer_date_columns]

In [None]:
# dim_transfer_date.to_csv(path_or_buf='dim_transfer_date.csv', index=False)
dim_transfer_date: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_transfer_date.csv')

In [43]:
dim_transfer_date.shape

(1020, 6)

# ---------------------------------------------------------------------------------------------

# fct_transfers

In [36]:
fct_transfers_raw['team_id'] = fct_transfers_raw['team_id'].apply(str)

fct_dim_transfer_type_merge: pd.DataFrame = fct_transfers_raw.merge(right=dim_type, how='left', left_on='transfer_type', right_on='transfer_type')
fct_dim_position_merge: pd.DataFrame = fct_dim_transfer_type_merge.merge(right=dim_position, how='left', left_on='positionsdetail', right_on='position')
fct_dim_details_merge: pd.DataFrame = fct_dim_position_merge.merge(right=dim_details, how='left', left_on='playerName', right_on='player_name')
fct_dim_transfer_date_merge: pd.DataFrame = fct_dim_details_merge.merge(right=dim_transfer_date, how='left', left_on='date', right_on='date')
fct_dim_team_merge: pd.DataFrame = fct_dim_transfer_date_merge.merge(right=dim_team_final, how='left', left_on='team_id', right_on='tm_api_club_id')

fct_dim_team_merge['transfer_key'] = fct_dim_team_merge.index + 10_000

In [37]:
fct_transfers_columns: list = [
    'transfer_key',
    'transfer_type_key',
    'position_key',
    'details_key',
    'transfer_date_key',
    'team_key',
    'transferFeeUnformatted',
]
fct_transfers: pd.DataFrame = fct_dim_team_merge[fct_transfers_columns].rename(
    columns={'dim_team_id': 'team_key', 'transferFeeUnformatted': 'transfer_fee'}
)

In [38]:
fct_transfers.head()

Unnamed: 0,transfer_key,transfer_type_key,position_key,details_key,transfer_date_key,team_key,transfer_fee
0,10000,0,10,500,1000,216,0
1,10001,0,11,501,1001,216,21400000
2,10002,0,10,502,1002,216,0
3,10003,0,12,503,1003,216,0
4,10004,0,13,504,1004,216,0


In [39]:
fct_transfers.shape

(30699, 7)

# ---------------------------------------------------------------------------------------------

# Load to PostgreSQL 

In [33]:
from sqlalchemy import create_engine

In [45]:
engine = create_engine('postgresql://postgres:##############@localhost:5432/cs689_term_project')

dim_type.to_sql(name='dim_transfer_type', con=engine, index=False, if_exists='replace')
dim_position.to_sql(name='dim_position', con=engine, index=False, if_exists='replace')
dim_details.to_sql(name='dim_details', con=engine, index=False, if_exists='replace')
dim_transfer_date.to_sql(name='dim_transfer_date', con=engine, index=False, if_exists='replace')
fct_transfers.to_sql(name='fct_transfers', con=engine, index=False, if_exists='replace')

20

In [44]:
import psycopg2

conn = psycopg2.connect(
    database="cs689_term_project",
    user="postgres",
    password="#########################",
    host="localhost",
    port="5432"
)

cursor = conn.cursor()

# sql1 = '''DROP TABLE IF EXISTS dim_details CASCADE'''
# cursor.execute(sql1)

sql2 = '''DROP TABLE IF EXISTS dim_transfer_date CASCADE'''
cursor.execute(sql2)

# sql3 = '''DROP TABLE IF EXISTS dim_referee CASCADE'''
# cursor.execute(sql3)

# sql4 = '''DROP TABLE IF EXISTS dim_team CASCADE'''
# cursor.execute(sql4)

# sql5 = '''DROP TABLE IF EXISTS fct_fixtures CASCADE'''
# cursor.execute(sql5)

conn.commit()

conn.close()