In [1]:
import pandas as pd
import numpy as np
import duckdb
import http.client
from datetime import datetime, timedelta
import functions.fbref_functions as func
import functions.kdrive_functions as kdrive
import tempfile
from dotenv import load_dotenv
import os
import ssl
# NOTE(security): this swaps the ssl module's default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for everything in this
# kernel that relies on that default. Prefer fixing the local CA bundle over keeping this.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
## Load dotenv variables
load_dotenv()

# Read the Postgres settings up front. os.getenv() returns None for an unset variable,
# and None would otherwise be interpolated as the literal text "None" into the
# connection string below, producing a confusing connection failure later on.
pg_user = os.getenv("PG_SPORTANALYTICS_USER")
pg_password = os.getenv("PG_SPORTANALYTICS_PASSWORD")
pg_host = os.getenv("PG_SPORTANALYTICS_HOST")

missing_settings = [
    name
    for name, value in (
        ("PG_SPORTANALYTICS_USER", pg_user),
        ("PG_SPORTANALYTICS_PASSWORD", pg_password),
        ("PG_SPORTANALYTICS_HOST", pg_host),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_settings)
    )

# Initialize duckdb with postgres connector.
# `cursor` is the DuckDB connection object returned by duckdb.connect(); the name is kept
# because the rest of the notebook refers to it.
cursor = duckdb.connect()
cursor.sql("INSTALL postgres;")
cursor.sql("LOAD postgres;")
# Attach the `football` Postgres database under the alias `postgres_db`.
# NOTE(review): the values are pasted into the connection string unescaped, so a password
# containing spaces or quotes would break it — confirm the credentials are plain tokens.
cursor.sql(f"""
           ATTACH 'dbname=football user={pg_user} password={pg_password} 
           host={pg_host}' AS postgres_db (TYPE POSTGRES);
           """)

In [3]:
# Season-history page of the competition on fbref (the preview below shows 2. Fußball-Bundesliga rows).
competition_url = 'https://fbref.com/en/comps/33/history/2-Bundesliga-Seasons'
# fbrefStats comes from the project module functions.fbref_functions; it is built from the history URL.
fb_stats = func.fbrefStats(competition_url)
# Fixture list; the preview below shows one row per match with match_id, home_id and away_id.
df = fb_stats.get_fixtures()

In [4]:
# Preview the first fixtures as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,round,wk,day,date,time,home,xg,score,xg.1,away,...,referee,match_report,notes,url,match_id,home_id,away_id,competition,season,inserted_timestamp
0,Regular Season,1,Fri,2024-08-02,20:30,Köln,2.6,1–2,1.7,Hamburger SV,...,Max Burda,Match Report,,https://fbref.com/en/matches/ee0af24d/Koln-Ham...,ee0af24d,bc357bf7,26790c6a,2. Fußball-Bundesliga,2024-2025,2024-08-13 21:29:30.587800
1,Regular Season,1,Sat,2024-08-03,13:00,Hertha BSC,0.4,1–2,1.7,Paderborn 07,...,Wolfgang Haslberger,Match Report,,https://fbref.com/en/matches/6f5f7c69/Hertha-B...,6f5f7c69,2818f8bc,d9f93f02,2. Fußball-Bundesliga,2024-2025,2024-08-13 21:29:30.587800
2,Regular Season,1,Sat,2024-08-03,13:00,Karlsruher,1.9,3–2,1.6,Nürnberg,...,Robin Braun,Match Report,,https://fbref.com/en/matches/b85153a5/Karlsruh...,b85153a5,33ba9d7b,6f2c108c,2. Fußball-Bundesliga,2024-2025,2024-08-13 21:29:30.587800
3,Regular Season,1,Sat,2024-08-03,13:00,Hannover 96,2.7,2–0,0.4,Jahn R'burg,...,Michael Bacher,Match Report,,https://fbref.com/en/matches/c1cd58b8/Hannover...,c1cd58b8,60b5e41f,5cb328f2,2. Fußball-Bundesliga,2024-2025,2024-08-13 21:29:30.587800
4,Regular Season,1,Sat,2024-08-03,13:00,Magdeburg,1.5,0–0,0.4,Elversberg,...,Florian Heft,Match Report,,https://fbref.com/en/matches/c9c3d7eb/Magdebur...,c9c3d7eb,e18a73da,fe686760,2. Fußball-Bundesliga,2024-2025,2024-08-13 21:29:30.587800


In [5]:
table_name = 'fixtures'

# Work out once which fixtures are not stored in Postgres yet and keep them in a DuckDB
# temp table. Both the DataFrame handed to later cells and the INSERT read from that one
# snapshot, so they are guaranteed to contain the same rows and the anti-join against
# Postgres is evaluated a single time instead of twice.
# NOTE(review): `NOT IN` matches nothing if the stored match_id column contains a NULL —
# confirm the column is NOT NULL in Postgres.
cursor.sql(f"""
    CREATE OR REPLACE TEMP TABLE new_{table_name} AS
    SELECT * FROM df
    WHERE match_id NOT IN (SELECT match_id FROM postgres_db.{table_name});
""")

# Fixtures that are new in this run.
matches_updates = cursor.sql(f"SELECT * FROM new_{table_name};").df()

# Append exactly those fixtures to the Postgres table.
cursor.sql(f"INSERT INTO postgres_db.{table_name} SELECT * FROM new_{table_name};")

In [6]:
# Print "1" when this run found fixtures that were not stored yet, otherwise "0".
has_new_matches = not matches_updates.empty
print("1" if has_new_matches else "0")

0


# Create Team Table

In [7]:
# Build the teams frame from the fixtures via the project helper; the insert that follows keys on its team_id column.
teams = fb_stats.get_teams(df)

In [8]:
table_name = 'teams'

# Append only the rows of the `teams` frame whose team_id is not in the Postgres table yet.
new_teams_insert = (
    f"INSERT INTO postgres_db.{table_name} "
    f"SELECT * FROM teams "
    f"WHERE team_id NOT IN (SELECT team_id FROM postgres_db.{table_name});"
)
cursor.sql(new_teams_insert)

# Create Scores Table

In [9]:
# Build the scores frame from the fixtures via the project helper; the insert that follows keys on its match_id column.
scores = fb_stats.transform_scores(df)

In [10]:
table_name = 'scores'

# Append only the rows of the `scores` frame whose match_id is not in the Postgres table yet.
new_scores_insert = (
    f"INSERT INTO postgres_db.{table_name} "
    f"SELECT * FROM scores "
    f"WHERE match_id NOT IN (SELECT match_id FROM postgres_db.{table_name});"
)
cursor.sql(new_scores_insert)

# Get Match Details

In [7]:
# Get match details only for the fixtures that were new in this run (matches_updates).
# The result is indexed by position further down: [0] is used as shots, [1] as goalkeeper stats.
match_details = fb_stats.get_match_details(matches_updates)

In [9]:
# Inspect element 1 of match_details (the goalkeeper table); it has no rows when no fixture was new.
match_details[1]

Unnamed: 0,player,age,min,shots,goals,saves,save_perc,psxg,launch_completion,launch_attempts,...,goalkicks_launched_percentage,goalkicks_average_length,crosses,crosses_stopped,crosses_stopped_percentage,actions_outside_penaltyarea,actions_average_distance,match_id,competition,season


# Goalkeeper Match Stats

In [10]:
# Element 1 of match_details is the goalkeeper table (player, saves, psxg, ... per match_id).
gk_stats = match_details[1]

In [11]:
table_name = 'gk_stats'

# One-off bootstrap, kept for reference only:
#   CREATE TABLE IF NOT EXISTS postgres_db.gk_stats AS SELECT * FROM gk_stats;
# Regular runs append only goalkeeper rows of matches that the table does not contain yet.
new_gk_stats_insert = (
    f"INSERT INTO postgres_db.{table_name} "
    f"SELECT * FROM gk_stats "
    f"WHERE match_id NOT IN (SELECT match_id FROM postgres_db.{table_name});"
)
cursor.sql(new_gk_stats_insert)

# Shot Stats

In [12]:
# Element 0 of match_details is the shot table (minute, player, xg, outcome, ... per match_id).
shots = match_details[0]

In [13]:
# Preview the shot rows; the frame has no rows when no fixture was new in this run.
shots.head()

Unnamed: 0,minute,player,squad,xg,psxg,outcome,distance,bodypart,notes,assist_player1,assist1,assist_player2,assist2,team_id,match_id,goalkeeper,competition,season


In [14]:
table_name = 'shots'

# One-off bootstrap, kept for reference only:
#   CREATE TABLE IF NOT EXISTS postgres_db.shots AS SELECT * FROM shots;
# Regular runs append only shot rows of matches that the table does not contain yet.
new_shots_insert = (
    f"INSERT INTO postgres_db.{table_name} "
    f"SELECT * FROM shots "
    f"WHERE match_id NOT IN (SELECT match_id FROM postgres_db.{table_name});"
)
cursor.sql(new_shots_insert)

In [162]:
def _first_value_per_row(table):
    """Collapse a table read with ``extract_links='body'`` to one value per cell.

    With ``extract_links='body'`` pandas returns each body cell as a ``(text, link)``
    tuple. Exploding every column at once spreads the tuple members over consecutive
    rows that share the original index label; taking ``first()`` per label keeps the
    first non-null member of each cell.

    Parameters
    ----------
    table : pandas.DataFrame
        One of the frames returned by ``pd.read_html(..., extract_links='body')``.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``table``, one row per original index label.
    """
    exploded = table.explode(list(table.columns))
    return exploded.groupby(exploded.index).first()


# Flat names intended for the combined goalkeeper frame. They are not applied in this
# cell (TODO: set them via set_axis once the column count is confirmed to match).
gk_columns = ['player', 'age', 'min', 'shots', 'goals',
              'saves', 'save_perc', 'psxg', 'launch_completion',
              'launch_attempts', 'launch_comp_percentage',
              'pass_attempt', 'throws', 'launch_percentage',
              'pass_average_length', 'goalkicks', 'goalkicks_launched_percentage',
              'goalkicks_average_length', 'crosses', 'crosses_stopped',
              'crosses_stopped_percentage', 'actions_outside_penaltyarea', 'actions_average_distance']

# Positions of the home and away goalkeeper tables in the list returned by read_html.
# NOTE(review): positional lookup breaks if the page layout changes — confirm per page.
HOME_GK_TABLE_POS = 9
AWAY_GK_TABLE_POS = 16

# Only index labels up to and including 1 are processed (.loc slicing is inclusive).
for i, row in df.loc[:1].iterrows():
    # Identifiers of the current fixture, read from the row instead of chained df[col][i].
    match_id = row['match_id']
    url = row['url']
    home_team = row['home_id']

    # Read every table of the match page; body cells come back as (text, link) tuples.
    html_output = pd.read_html(url, extract_links='body')

    # The former .drop(["('Unnamed: 1_level_0', 'Nation')"], errors='ignore') calls were
    # removed: they passed a string while the column labels are tuples, so nothing was
    # ever dropped. The Nation column therefore stays and is removed after flattening.
    gk1_output = _first_value_per_row(html_output[HOME_GK_TABLE_POS])  # home goalkeeper
    gk2_output = _first_value_per_row(html_output[AWAY_GK_TABLE_POS])  # away goalkeeper

    # Combine home and away goalkeepers; each iteration overwrites the previous result,
    # so after the loop only the last processed match is kept.
    gk_all_output = pd.concat([gk1_output, gk2_output])

In [55]:
# Flatten the two-level header to its second level. Labels that are already plain strings
# are left untouched, so re-running this cell no longer reduces every name to its second
# character (which is what indexing a string with [1] did).
gk_all_output.columns = [
    label[1] if isinstance(label, tuple) else label
    for label in gk_all_output.columns
]

# Show the combined frame without the Nation column and with a fresh 0..n-1 index;
# reset_index(drop=True) discards the old index instead of adding it as a column first.
gk_all_output.drop(columns=['Nation']).reset_index(drop=True)

In [59]:
# Combined goalkeeper frame without the Nation column. reset_index(drop=True) discards the
# old index directly instead of materialising it as an 'index' column and dropping it again.
gk_all_output.drop(columns=['Nation']).reset_index(drop=True)

Unnamed: 0,Player,Age,Min,SoTA,GA,Saves,Save%,PSxG,Cmp,Att,...,Launch%,AvgLen,Att.1,Launch%.1,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist
0,Hugo Lloris,37-060,90,7,1,6,100.0,1.7,6,20,...,48.4,41.9,8,62.5,42.0,13,1,7.7,0,7.0
1,Andrew Thomas,25-176,90,6,2,4,66.7,1.0,5,11,...,35.7,31.7,5,20.0,27.8,12,1,8.3,0,13.2


In [297]:
# Fixtures in `df` whose match_id has no rows in the Postgres shots table yet.
fixtures_without_shots = cursor.sql(
    "SELECT * FROM df WHERE match_id NOT IN (SELECT match_id FROM postgres_db.shots);"
)
fixtures_without_shots.df()

Unnamed: 0,round,wk,day,date,time,home,xg,score,xg.1,away,...,referee,match_report,notes,url,match_id,home_id,away_id,competition,season,inserted_timestamp


In [11]:
# Column names for the goalkeeper table, in table order.
gk_columns = """
    player age min shots goals
    saves save_perc psxg launch_completion
    launch_attempts launch_comp_percentage
    pass_attempt throws launch_percentage
    pass_average_length goalkicks goalkicks_launched_percentage
    goalkicks_average_length crosses crosses_stopped
    crosses_stopped_percentage actions_outside_penaltyarea actions_average_distance
""".split()

# Empty frame carrying only the goalkeeper header.
# Note: this rebinds gk_stats, which an earlier cell set to match_details[1].
gk_stats = pd.DataFrame(columns=gk_columns)

# Column names for the shot table, in table order.
shot_columns = """
    minute player squad xg psxg outcome
    distance bodypart notes assist_player1
    assist1 assist_player2 assist2 team_id
    match_id goalkeeper competition season
""".split()



In [12]:
# Display the goalkeeper frame; right after the column-definition cell it has the header but no rows.
gk_stats

Unnamed: 0,player,age,min,shots,goals,saves,save_perc,psxg,launch_completion,launch_attempts,...,launch_percentage,pass_average_length,goalkicks,goalkicks_launched_percentage,goalkicks_average_length,crosses,crosses_stopped,crosses_stopped_percentage,actions_outside_penaltyarea,actions_average_distance
