In [1]:
import pandas as pd
import numpy as np
import duckdb
import http.client
from datetime import datetime, timedelta
import functions.fbref_functions as func
import functions.kdrive_functions as kdrive
import tempfile
from dotenv import load_dotenv
import os
import ssl
# NOTE(review): this replaces the factory for the default HTTPS context with
# the unverified one, so HTTPS requests that rely on the default context in
# this kernel skip certificate verification. Presumably a workaround for
# certificate errors while scraping - confirm it is still needed and prefer
# fixing the local CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
## Load dotenv variables
load_dotenv()

# Fail fast when a connection setting is absent. os.getenv returns None for an
# unset variable, and the f-string below would render that as the literal text
# "None" inside the connection string, which only surfaces later as a
# confusing connection/authentication failure.
pg_env_names = (
    "PG_SPORTANALYTICS_USER",
    "PG_SPORTANALYTICS_PASSWORD",
    "PG_SPORTANALYTICS_HOST",
)
missing_env = [name for name in pg_env_names if os.getenv(name) is None]
if missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env)
    )

# Initialize duckdb with postgres connector: an in-memory DuckDB connection
# with the Postgres database attached under the alias `postgres_db`.
cursor = duckdb.connect()
cursor.sql("INSTALL postgres;")
cursor.sql("LOAD postgres;")
# NOTE(review): the credentials are interpolated verbatim; a value containing
# whitespace or a single quote would break the connection string.
cursor.sql(f"""
           ATTACH 'dbname=football user={os.getenv("PG_SPORTANALYTICS_USER")} password={os.getenv("PG_SPORTANALYTICS_PASSWORD")} 
           host={os.getenv("PG_SPORTANALYTICS_HOST")}' AS postgres_db (TYPE POSTGRES);
           """)

In [3]:
# FBref season-history page for Major League Soccer; it is handed to the
# project's fbrefStats helper, whose get_fixtures() result is kept in `df`
# and reused by the cells below.
competition_url = 'https://fbref.com/en/comps/22/history/Major-League-Soccer-Seasons'
fb_stats = func.fbrefStats(competition_url)
df = fb_stats.get_fixtures()

In [4]:
# Show the last rows of the fixtures frame as a quick sanity check.
df.tail()

Unnamed: 0,round,wk,day,date,time,home,xg,score,xg.1,away,...,referee,match_report,notes,url,match_id,home_id,away_id,competition,season,inserted_timestamp
356,Regular Season,27,Sat,2024-07-20,19:30,Vancouver W'caps,3.3,3–4,1.2,Dynamo FC,...,Fotis Bazakos,Match Report,,https://fbref.com/en/matches/43d2d102/Vancouve...,43d2d102,ab41cb90,0d885416,Major League Soccer,2024,2024-07-24 19:29:58.718922
357,Regular Season,27,Sat,2024-07-20,19:30,Minnesota Utd,1.8,2–0,1.4,SJ Earthquakes,...,Ismir Pekmic,Match Report,,https://fbref.com/en/matches/67f48250/Minnesot...,67f48250,99ea75a6,ca460650,Major League Soccer,2024,2024-07-24 19:29:58.718922
358,Regular Season,27,Sat,2024-07-20,19:30,Rapids,2.2,3–2,0.7,RSL,...,Ramy Touchan,Match Report,,https://fbref.com/en/matches/c065ff91/Colorado...,c065ff91,415b4465,f7d86a43,Major League Soccer,2024,2024-07-24 19:29:58.718922
359,Regular Season,27,Sat,2024-07-20,19:30,Seattle,1.9,0–3,1.6,LAFC,...,Allen Chapman,Match Report,,https://fbref.com/en/matches/ce8fd7d6/Seattle-...,ce8fd7d6,6218ebd4,81d817a3,Major League Soccer,2024,2024-07-24 19:29:58.718922
360,Regular Season,27,Sat,2024-07-20,19:45,LA Galaxy,1.8,3–2,1.2,Portland Timbers,...,Boiko Serhii,Match Report,,https://fbref.com/en/matches/f6e88d1c/LA-Galax...,f6e88d1c,d8b46897,d076914e,Major League Soccer,2024,2024-07-24 19:29:58.718922


In [5]:
# Append only the fixtures whose match_id is not stored in Postgres yet.
# The subquery filters out NULLs on purpose: `x NOT IN (...)` is never true
# when the list contains a NULL, so a single NULL match_id in the target table
# would silently turn this insert into a no-op.
# NOTE(review): SELECT * relies on `df` having the same column order as the
# target table - confirm against the table definition.
table_name = 'fixtures'
cursor.sql(f"""
    INSERT INTO postgres_db.{table_name}
    SELECT * FROM df
    WHERE match_id NOT IN (
        SELECT match_id FROM postgres_db.{table_name} WHERE match_id IS NOT NULL
    );
""")

# Create Team Table

In [6]:
# Build the teams frame from the fixtures via the project helper; the insert
# cell below expects it to carry a `team_id` column.
teams = fb_stats.get_teams(df)

In [7]:
# Append only the teams whose team_id is not stored in Postgres yet.
# NULLs are filtered out of the subquery because `x NOT IN (...)` is never
# true when the list contains a NULL, which would silently skip every row.
# NOTE(review): SELECT * relies on `teams` having the same column order as the
# target table - confirm against the table definition.
table_name = 'teams'
cursor.sql(f"""
    INSERT INTO postgres_db.{table_name}
    SELECT * FROM teams
    WHERE team_id NOT IN (
        SELECT team_id FROM postgres_db.{table_name} WHERE team_id IS NOT NULL
    );
""")

# Create Scores Table

In [8]:
# Derive the scores frame from the fixtures via the project helper; the insert
# cell below expects it to carry a `match_id` column.
scores = fb_stats.transform_scores(df)

In [9]:
# Append only the scores of matches that are not stored in Postgres yet.
# NULLs are filtered out of the subquery because `x NOT IN (...)` is never
# true when the list contains a NULL, which would silently skip every row.
# NOTE(review): SELECT * relies on `scores` having the same column order as
# the target table - confirm against the table definition.
table_name = 'scores'
cursor.sql(f"""
    INSERT INTO postgres_db.{table_name}
    SELECT * FROM scores
    WHERE match_id NOT IN (
        SELECT match_id FROM postgres_db.{table_name} WHERE match_id IS NOT NULL
    );
""")

# Get Match Details

In [291]:
# Fetch per-match details for the fixtures from index label 355 to the end of
# `df`. Later cells read element [0] as the shots frame and element [1] as the
# goalkeeper stats frame.
# NOTE(review): 355 is a hard-coded start label, presumably the first fixture
# not yet loaded at the time of the last run - it has to be adjusted by hand
# before every run. The execution counts from here on are also out of sequence
# with the cells above, so verify the notebook with Restart & Run All.
match_details = fb_stats.get_match_details(df.loc[355:])

# Goalkeeper Match Stats

In [292]:
# Element 1 of the match-details result is used as the goalkeeper stats frame.
gk_stats = match_details[1]

In [293]:
# Append the goalkeeper stats of matches that are not stored in Postgres yet
# (filtering is per match_id, so a match is either loaded completely or not at
# all). NULLs are filtered out of the subquery because `x NOT IN (...)` is
# never true when the list contains a NULL, which would silently skip every
# row.
# NOTE(review): SELECT * relies on `gk_stats` having the same column order as
# the target table - confirm against the table definition.
table_name = 'gk_stats'
# One-time bootstrap, kept for reference (creates the table from the frame):
# cursor.sql(f"CREATE TABLE IF NOT EXISTS postgres_db.{table_name} AS SELECT * FROM gk_stats;")
cursor.sql(f"""
    INSERT INTO postgres_db.{table_name}
    SELECT * FROM gk_stats
    WHERE match_id NOT IN (
        SELECT match_id FROM postgres_db.{table_name} WHERE match_id IS NOT NULL
    );
""")

# Shot Stats

In [294]:
# Element 0 of the match-details result is used as the shots frame.
shots = match_details[0]

In [295]:
# Show the first rows of the shots frame as a quick sanity check.
shots.head()

Unnamed: 0,minute,player,squad,xg,psxg,outcome,distance,bodypart,notes,assist_player1,assist1,assist_player2,assist2,team_id,match_id,goalkeeper,competition,season
0,4,Cedric Teuchert,St. Louis,0.08,,Blocked,22,Right Foot,,,,,,bd97ac1f,3479c662,Tim Melia,Major League Soccer,2024
1,5,Akil Watts,St. Louis,0.02,,Off Target,26,Right Foot,,Tomas Totland,Pass (Dead),Eduard Löwen,Pass (Live),bd97ac1f,3479c662,Tim Melia,Major League Soccer,2024
2,9,Johnny Russell,Sporting KC,0.04,0.11,Saved,17,Right Foot,,Dániel Sallói,Pass (Live),Erik Thommy,Pass (Live),4acb0537,3479c662,Roman Bürki,Major League Soccer,2024
3,9,Tim Leibold,Sporting KC,0.04,0.37,Saved,26,Right Foot,,Johnny Russell,Shot,,,4acb0537,3479c662,Roman Bürki,Major League Soccer,2024
4,19,Robert Castellanos,Sporting KC,0.04,0.11,Saved,7,Head,,Johnny Russell,Pass (Dead),William Agada,Fouled,4acb0537,3479c662,Roman Bürki,Major League Soccer,2024


In [296]:
# Append the shots of matches that are not stored in Postgres yet (filtering
# is per match_id, so a match is either loaded completely or not at all).
# NULLs are filtered out of the subquery because `x NOT IN (...)` is never
# true when the list contains a NULL, which would silently skip every row.
# NOTE(review): SELECT * relies on `shots` having the same column order as the
# target table - confirm against the table definition.
table_name = 'shots'
# One-time bootstrap, kept for reference (creates the table from the frame):
# cursor.sql(f"CREATE TABLE IF NOT EXISTS postgres_db.{table_name} AS SELECT * FROM shots;")
cursor.sql(f"""
    INSERT INTO postgres_db.{table_name}
    SELECT * FROM shots
    WHERE match_id NOT IN (
        SELECT match_id FROM postgres_db.{table_name} WHERE match_id IS NOT NULL
    );
""")

In [162]:
# Prototype: rebuild the goalkeeper tables for the first fixtures directly
# from their match pages.

# Intended snake_case names for the combined goalkeeper table. Only the
# disabled set_axis call at the end of the loop refers to this list; it is
# defined once here because it does not depend on the match being processed.
gk_columns = ['player', 'age', 'min', 'shots', 'goals',
'saves', 'save_perc', 'psxg', 'launch_completion', 
'launch_attempts', 'launch_comp_percentage', 
'pass_attempt', 'throws', 'launch_percentage', 
'pass_average_length', 'goalkicks', 'goalkicks_launched_percentage',
'goalkicks_average_length', 'crosses', 'crosses_stopped', 
'crosses_stopped_percentage', 'actions_outside_penaltyarea', 'actions_average_distance']

# df.loc[:1] is a label-based slice, i.e. every row up to and including index
# label 1.
# NOTE(review): gk_all_output is overwritten on every iteration, so only the
# last processed match is left after the loop.
for i, row in df.loc[:1].iterrows():
    # Identifiers of the fixture being processed, read from the iterated row
    # instead of chained df[col][i] lookups.
    match_id = row['match_id']
    url = row['url']
    home_team = row['home_id']

    # Parse every table on the match page. With extract_links='body' each body
    # cell is returned as a (text, link) tuple.
    html_output = pd.read_html(url, extract_links='body')

    ### Extract goalkeeper statistics from both goalkeepers
    # explode() splits every (text, link) cell into two rows that share the
    # original index label; groupby(index).first() then keeps the first
    # non-null value per column, i.e. the cell text.
    # The earlier .drop("('Unnamed: 1_level_0', 'Nation')", errors='ignore')
    # was removed: its label was a plain string while the column labels are
    # tuples, so it never matched anything. The Nation column is removed by
    # the cells that follow.
    # NOTE(review): positions 9 and 16 are assumed to be the home and away
    # goalkeeper tables - this depends on the page layout; confirm.

    # Create home goalkeeper dataframe
    gk1_exp = html_output[9].explode(list(html_output[9].columns))
    gk1_output = gk1_exp.groupby(gk1_exp.index).first()

    # Create away goalkeeper dataframe
    gk2_exp = html_output[16].explode(list(html_output[16].columns))
    gk2_output = gk2_exp.groupby(gk2_exp.index).first()

    # Combine home and away GK dataframes
    gk_all_output = pd.concat([gk1_output, gk2_output])
    # Disabled: renaming to gk_columns is not applied here.
    # gk_all_output = gk_all_output.set_axis(gk_columns, axis=1)

In [55]:
# Flatten the two-level header to its second level (the stat name).
# Guarded so that re-running the cell is a no-op: indexing x[1] on labels that
# are already flat strings would reduce every name to its second character.
if isinstance(gk_all_output.columns, pd.MultiIndex):
    gk_all_output.columns = gk_all_output.columns.get_level_values(1)

# Preview without the Nation column and with a fresh 0..n-1 index.
# reset_index(drop=True) discards the old index directly instead of
# materialising it as a column named 'index' and dropping that again.
gk_all_output.drop(columns=['Nation']).reset_index(drop=True)

In [59]:
# Preview the combined goalkeeper frame without the Nation column and with a
# fresh 0..n-1 index. reset_index(drop=True) discards the old index directly,
# so the cell no longer depends on the auto-generated 'index' column name.
gk_all_output.drop(columns=['Nation']).reset_index(drop=True)

Unnamed: 0,Player,Age,Min,SoTA,GA,Saves,Save%,PSxG,Cmp,Att,...,Launch%,AvgLen,Att.1,Launch%.1,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist
0,Hugo Lloris,37-060,90,7,1,6,100.0,1.7,6,20,...,48.4,41.9,8,62.5,42.0,13,1,7.7,0,7.0
1,Andrew Thomas,25-176,90,6,2,4,66.7,1.0,5,11,...,35.7,31.7,5,20.0,27.8,12,1,8.3,0,13.2


In [297]:
# List the fixtures that have no rows in the Postgres shots table yet.
# NULLs are filtered out of the subquery because `x NOT IN (...)` is never
# true when the list contains a NULL, which would make this check report an
# empty result no matter what is missing. The statement has no placeholders,
# so it is a plain string rather than an f-string.
cursor.sql("""
    SELECT * FROM df
    WHERE match_id NOT IN (
        SELECT match_id FROM postgres_db.shots WHERE match_id IS NOT NULL
    );
""").df()

Unnamed: 0,round,wk,day,date,time,home,xg,score,xg.1,away,...,referee,match_report,notes,url,match_id,home_id,away_id,competition,season,inserted_timestamp
