# Imports and General Settings

In [32]:
import pandas as pd
import numpy as np
import requests
import json
import time
from sqlalchemy import create_engine

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Variables for API Calls

In [4]:
league_ids = ['GB1', 'ES1', 'IT1', 'L1', 'FR1']
seasons = ['2022', '2021', '2020', '2019', '2018', '2017']

# Club Names Mapping

In [3]:
club_mappings: dict = {'1. FC Köln': 'FC Koln',
 '1.FC Nuremberg': 'FC Nurnberg',
 '1.FSV Mainz 05': 'FSV Mainz 05',
 'Alavés': 'Alaves',
 'Amiens SC': 'Amiens',
 'Angers SCO': 'Angers',
 'Arm. Bielefeld': 'Arminia Bielefeld',
 'Atalanta BC': 'Atalanta',
 'Athletic': 'Athletic Club',
 'Atlético Madrid': 'Atletico Madrid',
 'B. Leverkusen': 'Bayer Leverkusen',
 'Bor. Dortmund': 'Borussia Dortmund',
 "Bor. M'gladbach": 'Borussia Monchengladbach',
 'CA Osasuna': 'Osasuna',
 'CD Leganés': 'Leganes',
 'Cagliari Calcio': 'Cagliari',
 'Celta de Vigo': 'Celta Vigo',
 'Chievo Verona': 'Chievo',
 'Cádiz CF': 'Cadiz',
 'Dep. La Coruña': 'Deportivo La Coruna',
 'E. Frankfurt': 'Eintracht Frankfurt',
 'Elche CF': 'Elche',
 'F. Düsseldorf': 'Fortuna Dusseldorf',
 'FC Empoli': 'Empoli',
 'FC Lorient': 'Lorient',
 'FC Metz': 'Metz',
 'FC Nantes': 'Nantes',
 'G. Bordeaux': 'Bordeaux',
 'Greuther Fürth': 'SpVgg Greuther Furth',
 'Hellas Verona': 'Verona',
 'Hertha BSC': 'Hertha Berlin',
 'LOSC Lille': 'Lille',
 'Man City': 'Manchester City',
 'Man Utd': 'Manchester United',
 'Málaga CF': 'Malaga',
 'Nîmes Olympique': 'Nimes',
 'OGC Nice': 'Nice',
 'Olympique Lyon': 'Lyon',
 'Paris SG': 'Paris Saint Germain',
 'R. Strasbourg': 'Strasbourg',
 'RCD Mallorca': 'Mallorca',
 'Real Valladolid': 'Valladolid',
 'SC Paderborn': 'SC Paderborn 07',
 'SD Eibar': 'Eibar',
 'SD Huesca': 'Huesca',
 'SM Caen': 'Caen',
 'SPAL': 'Spal',
 'SSC Napoli': 'Napoli',
 'Saint-Étienne': 'Saint Etienne',
 'Sevilla FC': 'Sevilla',
 'Sheff Utd': 'Sheffield Utd',
 'Spezia Calcio': 'Spezia',
 'Spurs': 'Tottenham',
 'Stade Brestois': 'Stade Brestois 29',
 'Stade Reims': 'Reims',
 'TSG Hoffenheim': '1899 Hoffenheim',
 'Troyes': 'Estac Troyes',
 'UD Las Palmas': 'Las Palmas',
 'Udinese Calcio': 'Udinese',
 'VfL Bochum': 'VfL BOCHUM',
}

# Local PostgreSQL Connection

In [33]:
POSTGRES_ADDRESS = 'localhost'
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'
POSTGRES_PASSWORD = '###############'
POSTGRES_DBNAME = 'cs689_term_project'

postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME))

conn = create_engine(postgres_str)

# Get unique_clubs_data for dim_team

In [5]:
def get_clubs_data(league_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/competitions/get-table"
    headers = {
        "X-RapidAPI-Key": "#################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    container: dict = {}
    for year in seasons:
        clubs_data = pd.DataFrame()
        
        for league in league_ids:
            querystring = {"id": league, "seasonID": year, "domain":"com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            league_season_clubs = pd.json_normalize(response_json['table'])
            league_season_clubs['season'] = year
            
            if league == 'GB1':
                league_season_clubs['league'] = 'Premier League'
            elif league == 'ES1':
                league_season_clubs['league'] = 'La Liga'
            elif league == 'IT1':
                league_season_clubs['league'] = 'Serie A'
            elif league == 'L1':
                league_season_clubs['league'] = 'Bundesliga'
            else:
                league_season_clubs['league'] = 'Ligue 1'
            clubs_data = pd.concat(objs=[clubs_data, league_season_clubs])
            
        columns: list = ['id', 'clubName', 'rank', 'markDescription', 'season', 'league']
        clubs_data_cleaned: pd.DataFrame = clubs_data[columns]
        clubs_data_renamed: pd.DataFrame = clubs_data_cleaned.rename(
            columns={'id': 'club_id', 'clubName': 'club_name', 'rank': 'league_position', 'markDescription': f'{year}_status'}
        )
        clubs_data_final: pd.DataFrame = clubs_data_renamed.reset_index(drop=True)
        clubs_data_final.loc[clubs_data_final[f'{year}_status'] == '', f'{year}_status'] = 'UEFA Champions Legue'
        clubs_data_final.loc[clubs_data_final['league_position'] == 1, f'{year}_status'] = 'UEFA Champions Legue'
        container[year] = clubs_data_final
    return container

container: dict = get_clubs_data(league_ids=league_ids, seasons=seasons)

In [6]:
container['2017'].to_csv(path_or_buf='2017_season_data_new.csv', index=False)
container['2018'].to_csv(path_or_buf='2018_season_data_new.csv', index=False)
container['2019'].to_csv(path_or_buf='2019_season_data_new.csv', index=False)
container['2020'].to_csv(path_or_buf='2020_season_data_new.csv', index=False)
container['2021'].to_csv(path_or_buf='2021_season_data_new.csv', index=False)
container['2022'].to_csv(path_or_buf='2022_season_data_new.csv', index=False)

In [8]:
def get_unique_clubs(league_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/competitions/get-table"
    headers = {
        "X-RapidAPI-Key": "#########################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    clubs_data = pd.DataFrame()
    for year in seasons:
        for league in league_ids:
            querystring = {"id": league, "seasonID": year, "domain":"com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            league_season_clubs = pd.json_normalize(response_json['table'])
            
            if league == 'GB1':
                league_season_clubs['league'] = 'Premier League'
            elif league == 'ES1':
                league_season_clubs['league'] = 'La Liga'
            elif league == 'IT1':
                league_season_clubs['league'] = 'Serie A'
            elif league == 'L1':
                league_season_clubs['league'] = 'Bundesliga'
            else:
                league_season_clubs['league'] = 'Ligue 1'
                
            clubs_data = pd.concat(objs=[clubs_data, league_season_clubs])
    columns: list = [
        'id',
        'clubName',
        'league',
    ]
    clubs_data_prefinal: pd.DataFrame = clubs_data[columns].rename(columns={'league': 'league_name'})
    clubs_data_final: pd.DataFrame = clubs_data_prefinal.drop_duplicates(subset=['clubName'])
    return clubs_data_final

unique_clubs_data: pd.DataFrame = get_unique_clubs(league_ids=league_ids, seasons=seasons)

In [9]:
unique_clubs_data.to_csv(path_or_buf='unique_clubs_from_transfers_api_new.csv', index=False)

In [11]:
for season in seasons:
    unique_clubs_data: pd.DataFrame = unique_clubs_data.merge(
        right=container[season], 
        left_on=['id'], 
        right_on=['club_id'], 
        how='left',
    )
unique_clubs_data.head()

  unique_clubs_data: pd.DataFrame = unique_clubs_data.merge(
  unique_clubs_data: pd.DataFrame = unique_clubs_data.merge(


Unnamed: 0,id,clubName,league_name,club_id_x,club_name_x,league_position_x,2022_status,season_x,league_x,club_id_y,club_name_y,league_position_y,2021_status,season_y,league_y,club_id_x.1,club_name_x.1,league_position_x.1,2020_status,season_x.1,league_x.1,club_id_y.1,club_name_y.1,league_position_y.1,2019_status,season_y.1,league_y.1,club_id_x.2,club_name_x.2,league_position_x.2,2018_status,season_x.2,league_x.2,club_id_y.2,club_name_y.2,league_position_y.2,2017_status,season_y.2,league_y.2
0,11,Arsenal,Premier League,11,Arsenal,1.0,UEFA Champions Legue,2022,Premier League,11,Arsenal,5.0,UEFA Europa League,2021,Premier League,11,Arsenal,8.0,,2020,Premier League,11,Arsenal,8.0,,2019,Premier League,11,Arsenal,5.0,UEFA Europa League,2018,Premier League,11,Arsenal,6.0,UEFA Europa League,2017,Premier League
1,281,Man City,Premier League,281,Man City,2.0,UEFA Champions Legue,2022,Premier League,281,Man City,1.0,UEFA Champions Legue,2021,Premier League,281,Man City,1.0,UEFA Champions Legue,2020,Premier League,281,Man City,2.0,UEFA Champions Legue,2019,Premier League,281,Man City,1.0,UEFA Champions Legue,2018,Premier League,281,Man City,1.0,UEFA Champions Legue,2017,Premier League
2,762,Newcastle,Premier League,762,Newcastle,3.0,UEFA Champions Legue,2022,Premier League,762,Newcastle,11.0,,2021,Premier League,762,Newcastle,12.0,,2020,Premier League,762,Newcastle,13.0,,2019,Premier League,762,Newcastle,13.0,,2018,Premier League,762,Newcastle,10.0,,2017,Premier League
3,148,Spurs,Premier League,148,Spurs,4.0,UEFA Champions Legue,2022,Premier League,148,Spurs,4.0,UEFA Champions Legue,2021,Premier League,148,Spurs,7.0,UEFA Europa Conference League Qualifikation,2020,Premier League,148,Spurs,6.0,UEFA Europa League,2019,Premier League,148,Spurs,4.0,UEFA Champions Legue,2018,Premier League,148,Spurs,3.0,UEFA Champions Legue,2017,Premier League
4,985,Man Utd,Premier League,985,Man Utd,5.0,UEFA Europa League,2022,Premier League,985,Man Utd,6.0,UEFA Europa League,2021,Premier League,985,Man Utd,2.0,UEFA Champions Legue,2020,Premier League,985,Man Utd,3.0,UEFA Champions Legue,2019,Premier League,985,Man Utd,6.0,UEFA Europa League,2018,Premier League,985,Man Utd,2.0,UEFA Champions Legue,2017,Premier League


# dim_team (SCD 3)

In [27]:
dim_team_columns: list = [
    'id', 
    'clubName', 
    'league_name', 
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
    '2022_status',
]
dim_team: pd.DataFrame = unique_clubs_data[dim_team_columns].rename(columns={'clubName': 'club_name'})

# with open('club_mappings.json') as json_file:
#     club_mappings = json.load(json_file)
    
dim_team['new_club_name'] = dim_team['club_name'].map(club_mappings).fillna(dim_team['club_name'])

columns_dim_team_part1: list = [
    'id',
    'new_club_name',
    'league_name', 
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
    '2022_status',
]
dim_team_part1: pd.DataFrame = dim_team[columns_dim_team_part1].rename(
    columns={'id': 'tm_api_club_id', 'new_club_name': 'club_name'}
)

dim_team_og: pd.DataFrame = pd.read_csv(filepath_or_buffer='dim_team_og_new.csv')
dim_team_merged: pd.DataFrame = dim_team_og.merge(right=dim_team_part1, how='left', left_on='team_name', right_on='club_name')

dim_team_final_columns: list = [
    'team_key',
    'team_name',
    'league_name_x',
    'league_country',
    'team_logo_url',
    'tm_api_club_id',
    '2017_status', 
    '2018_status', 
    '2019_status', 
    '2020_status', 
    '2021_status',
    '2022_status',
]
dim_team_final: pd.DataFrame = dim_team_merged[dim_team_final_columns].rename(columns={'league_name_x': 'league_name'})

# dim_team_final['2023_status'] = np.nan
# dim_team_final['2024_status'] = np.nan
# dim_team_final['2025_status'] = np.nan
# dim_team_final['2026_status'] = np.nan
# dim_team_final['2027_status'] = np.nan
# dim_team_final['2028_status'] = np.nan
# dim_team_final['2029_status'] = np.nan
# dim_team_final['2030_status'] = np.nan

dim_team_final.head()

Unnamed: 0,team_key,team_name,league_name,league_country,team_logo_url,tm_api_club_id,2017_status,2018_status,2019_status,2020_status,2021_status,2022_status
0,0,Crystal Palace,Premier League,England,https://media.api-sports.io/football/teams/52.png,873,,,,,,
1,1,Fulham,Premier League,England,https://media.api-sports.io/football/teams/36.png,931,,Relegated,,Relegated,,
2,2,Bournemouth,Premier League,England,https://media.api-sports.io/football/teams/35.png,989,,,Relegated,,,
3,3,Leeds,Premier League,England,https://media.api-sports.io/football/teams/63.png,399,,,,,,
4,4,Leicester,Premier League,England,https://media.api-sports.io/football/teams/46.png,1003,,,UEFA Europa League,UEFA Europa League,,


In [29]:
dim_team_final.to_csv(path_or_buf='dim_team_final_og_new.csv', index=False)

# Arrivals and Departures Data for fct_fixtures_raw

In [19]:
team_ids: list = unique_clubs_data['id'].tolist()

def get_transfers_data(team_ids: list, seasons: list):
    url = "https://transfermarket.p.rapidapi.com/transfers/list-by-club"
    headers = {
        "X-RapidAPI-Key": "####################################",
        "X-RapidAPI-Host": "transfermarket.p.rapidapi.com"
    }
    transfers_arrivals_data = pd.DataFrame()
    transfers_departures_data = pd.DataFrame()
    for year in seasons:
        time.sleep(1)
        for team in team_ids:
            querystring = {"id": team, "seasonID": year ,"domain": "com"}
            response = requests.request(
                method="GET", 
                url=url, 
                headers=headers, 
                params=querystring,
            )
            response_json = json.loads(response.text)
            
            team_season_transfers_arrivals = pd.json_normalize(response_json['currentSeason']['transferArrivals'])
            team_season_transfers_arrivals['team_id'] = team
            team_season_transfers_arrivals['season'] = year
            transfers_arrivals_data = pd.concat(objs=[transfers_arrivals_data, team_season_transfers_arrivals])
            
            team_season_transfers_departures = pd.json_normalize(response_json['currentSeason']['transferDepartures'])
            team_season_transfers_departures['team_id'] = team
            team_season_transfers_departures['season'] = year
            transfers_departures_data = pd.concat(objs=[transfers_departures_data, team_season_transfers_departures])
    return transfers_arrivals_data, transfers_departures_data

transfers_arrivals_data, transfers_departures_data = get_transfers_data(team_ids=team_ids, seasons=seasons)

In [20]:
transfers_arrivals_data.head()

Unnamed: 0,id,playerName,playerImage,age,position,transferFee,transferFeeCurrency,transferFeeNumeral,transferFeeUnformatted,loan,date,positionsdetail,clubID,clubName,clubImage,countryImage,team_id,season
0,285845,A. Maitland-Niles,https://img.a.transfermarkt.technology/portrai...,25.0,Mittelfeld,-,,,0.0,war,"May 31, 2023",Right Midfield,180,Southampton,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
1,389253,Auston Trusty,https://img.a.transfermarkt.technology/portrai...,24.0,Abwehr,-,,,0.0,war,"May 31, 2023",Centre-Back,337,Birmingham,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
2,203853,O. Zinchenko,https://img.a.transfermarkt.technology/portrai...,25.0,Abwehr,3500,€,m,35000000.0,,"Jul 22, 2022",Left-Back,281,Man City,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
3,389253,Auston Trusty,https://img.a.transfermarkt.technology/portrai...,24.0,Abwehr,-,,,0.0,war,"Jul 14, 2022",Centre-Back,1247,Colorado,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
4,363205,Gabriel Jesus,https://img.a.transfermarkt.technology/portrai...,25.0,Sturm,5220,€,m,52200000.0,,"Jul 4, 2022",Centre-Forward,281,Man City,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022


In [22]:
transfers_arrivals_data.to_csv(path_or_buf='transfers_arrivals_data_new.csv', index=False)

In [21]:
transfers_departures_data.head()

Unnamed: 0,id,playerName,playerImage,age,position,transferFee,transferFeeCurrency,transferFeeNumeral,transferFeeUnformatted,loan,date,positionsdetail,clubID,clubName,clubImage,countryImage,team_id,season
0,191217,H. Bellerín,https://img.a.transfermarkt.technology/portrai...,27.0,Abwehr,ablöse- frei,,,0.0,,"Sep 1, 2022",Right-Back,131,Barcelona,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
1,285845,A. Maitland-Niles,https://img.a.transfermarkt.technology/portrai...,25.0,Mittelfeld,?,,,0.0,ist,"Sep 1, 2022",Right Midfield,180,Southampton,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
2,343052,Nicolas Pépé,https://img.a.transfermarkt.technology/portrai...,27.0,Sturm,?,,,0.0,ist,"Aug 25, 2022",Right Winger,417,OGC Nice,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
3,205657,R. Rúnarsson,https://img.a.transfermarkt.technology/portrai...,27.0,Torwart,?,,,0.0,ist,"Aug 15, 2022",Goalkeeper,11282,Alanyaspor,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022
4,210178,Pablo Marí,https://img.a.transfermarkt.technology/portrai...,29.0,Abwehr,?,,,0.0,ist,"Aug 11, 2022",Centre-Back,2919,Monza,https://tmssl.akamaized.net/images/wappen/medi...,https://tmssl.akamaized.net/images/flagge/very...,11,2022


In [23]:
transfers_departures_data.to_csv(path_or_buf='transfers_departures_data_new.csv', index=False)

In [25]:
transfers_arrivals_data['transfer_type'] = 'arrival'
transfers_departures_data['transfer_type'] = 'departure'

concat_frames: list = [transfers_arrivals_data, transfers_departures_data]
fct_transfers_raw: pd.DataFrame = pd.concat(objs=concat_frames, ignore_index=True)

In [48]:
fct_transfers_raw.to_sql(name='transfers_raw_new', con=conn, index=False, if_exists='replace')

367

# dim_details

In [37]:
dim_details_raw_columns: list = ['playerName', 'clubName', 'transfer_type', 'team_id']
dim_details_raw: pd.DataFrame = fct_transfers_raw[dim_details_raw_columns]
dim_details_no_dup: pd.DataFrame = dim_details_raw.drop_duplicates(subset=['playerName']).reset_index(drop=True).rename(
    columns={'playerName': 'player_name', 'clubName': 'to/from_club_name', 'team_id': 'tm_api_club_id_origin'}
)
dim_details_no_dup['details_key'] = dim_details_no_dup.index + 500
dim_details_columns: list = ['details_key', 'player_name', 'to/from_club_name', 'transfer_type', 'tm_api_club_id_origin']
dim_details: pd.DataFrame = dim_details_no_dup[dim_details_columns]

dim_details.head()

Unnamed: 0,details_key,player_name,to/from_club_name,transfer_type,tm_api_club_id_origin
0,500,A. Maitland-Niles,Southampton,arrival,11
1,501,Auston Trusty,Birmingham,arrival,11
2,502,O. Zinchenko,Man City,arrival,11
3,503,Gabriel Jesus,Man City,arrival,11
4,504,Matt Turner,New England,arrival,11


In [41]:
dim_details.shape

(9407, 5)

In [40]:
dim_details.to_sql(name='stg_details', con=conn, index=False, if_exists='replace')

407

# dim_transfer_date

In [45]:
dim_transfer_date_raw_columns: list = ['date']
dim_transfer_date_raw: pd.DataFrame = fct_transfers_raw[dim_transfer_date_raw_columns]
dim_transfer_date_no_dup: pd.DataFrame = dim_transfer_date_raw.drop_duplicates(subset=['date']).reset_index(drop=True)
dim_transfer_date_no_dup['transfer_date_key'] = dim_transfer_date_no_dup.index + 1_000

dim_transfer_date_no_dup['datetime'] = pd.to_datetime(arg=dim_transfer_date_no_dup['date'], format='%b %d, %Y', errors='coerce')
dim_transfer_date_no_dup['year'] = dim_transfer_date_no_dup['datetime'].dt.year
dim_transfer_date_no_dup['month'] = dim_transfer_date_no_dup['datetime'].dt.month
dim_transfer_date_no_dup['day'] = dim_transfer_date_no_dup['datetime'].dt.day

dim_transfer_date_columns: list = [
    'transfer_date_key',
    'date',
    'datetime',
    'year',
    'month',
    'day',
]
dim_transfer_date: pd.DataFrame = dim_transfer_date_no_dup[dim_transfer_date_columns]

print(dim_transfer_date.shape)
dim_transfer_date.head()

(1170, 6)


Unnamed: 0,transfer_date_key,date,datetime,year,month,day
0,1000,"May 31, 2023",2023-05-31,2023.0,5.0,31.0
1,1001,"Jul 22, 2022",2022-07-22,2022.0,7.0,22.0
2,1002,"Jul 14, 2022",2022-07-14,2022.0,7.0,14.0
3,1003,"Jul 4, 2022",2022-07-04,2022.0,7.0,4.0
4,1004,"Jul 1, 2022",2022-07-01,2022.0,7.0,1.0


In [46]:
dim_transfer_date.to_sql(name='stg_transfer_date', con=conn, index=False, if_exists='replace')

170