In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [21]:
# Connect to the SQLite database
conn = sqlite3.connect("C:/Users/erknud3/fpl-optimization/model/FBRef_DB/master.db")

print("Loading data from the database...")

# Query that pulls every row and column of the fpl_master_24_25 table
fpl_master_query = """
    select *
    from fpl_master_24_25
    """

try:
    # Load the table into a DataFrame
    fpl_master = pd.read_sql_query(fpl_master_query, conn)
finally:
    # Close the connection even when the query fails, so the database
    # handle is never left open by a failed cell run
    conn.close()

Loading data from the database...


In [22]:
import unidecode

def clean_player_name(player_name):
    """Build the URL slug used for a player's name.

    The name is lower-cased, passed through ``unidecode.unidecode`` to strip
    accents and special characters, and every space is replaced by a hyphen
    (e.g. "Joško Gvardiol" becomes "josko-gvardiol").

    Args:
        player_name: Player name as a string.

    Returns:
        The slug string.
    """
    transliterated = unidecode.unidecode(player_name.lower())
    return transliterated.replace(" ", "-")

In [23]:
# One row per distinct ('fbref_name', 'transfermarkt') pair, so each player gets a single URL
unique_players = fpl_master.loc[:, ['fbref_name', 'transfermarkt']].drop_duplicates()

# Build the profile URL for every unique player: name slug + integer Transfermarkt id
transfermarkt_url = [
    "https://www.transfermarkt.com/{}/profil/spieler/{}".format(
        clean_player_name(player), int(transfermarkt_id)
    )
    for player, transfermarkt_id in unique_players.itertuples(index=False, name=None)
]

In [24]:
# Number of profile URLs generated (one per row of unique_players)
len(transfermarkt_url)

594

In [84]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Define headers for the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

# Seconds to wait on a single request before giving up; without a timeout
# one stalled connection would block the whole loop indefinitely
REQUEST_TIMEOUT = 30

# Create an empty list to store the results
data = []

# Total is constant for the whole run, so compute it once for the progress messages
total_players = len(unique_players)

# Loop through each player URL to extract the joined date
for index, (player, transfermarkt_id) in enumerate(zip(unique_players['fbref_name'], unique_players['transfermarkt']), start=1):
    try:
        # Construct the URL inside the try block so that a malformed name or
        # a missing id only skips this player instead of aborting the loop
        url = f"https://www.transfermarkt.com/{clean_player_name(player)}/profil/spieler/{int(transfermarkt_id)}"

        # Send the request
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

        # Players whose page cannot be retrieved are reported and left out of `data`
        if response.status_code != 200:
            print(f"[{index}/{total_players}] Failed to retrieve data for {player}. Status code: {response.status_code}")
            continue

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the joined date using regex. '.' does not match newlines here,
        # so the "Joined:" label and the value span must sit on the same line of
        # the serialized HTML; otherwise no match is found and the date stays None.
        joined_match = re.search(r"Joined:.*__content\">(.*?)</span>", str(soup))

        if joined_match:
            # Strip surrounding whitespace before the strict format parse,
            # then convert to 'YYYY-MM-DD' format
            joined_date = pd.to_datetime(joined_match.group(1).strip(), format='%b %d, %Y').strftime('%Y-%m-%d')
        else:
            joined_date = None

        # Append the data to the list (player name, transfermarkt id, joined date)
        data.append([player, transfermarkt_id, joined_date])

        # Print progress for successful extraction
        print(f"[{index}/{total_players}] Successfully extracted joined date for {player}: {joined_date}")

    except Exception as e:
        # Best effort: report the problem (network error, unparsable date, bad id)
        # and carry on with the next player
        print(f"[{index}/{total_players}] Error occurred for {player}: {e}")

# Convert the collected data into a dataframe
new_joined_date_df = pd.DataFrame(data, columns=['fbref_name', 'transfermarkt', 'joined_date'])

# Display the dataframe
print(new_joined_date_df)

[1/3] Successfully extracted joined date for Marcus Rashford: None
[2/3] Successfully extracted joined date for Jadon Sancho: None
[3/3] Successfully extracted joined date for Joško Gvardiol: None
        fbref_name  transfermarkt joined_date
0  Marcus Rashford         258923        None
1     Jadon Sancho         401173        None
2   Joško Gvardiol         475959        None


In [78]:
# `joined_date_df` accumulates results across scraping runs. On a fresh kernel it
# does not exist yet, so start from the latest scrape alone instead of failing
# with a NameError.
previous_joined_date_df = globals().get("joined_date_df")
frames_to_combine = (
    [new_joined_date_df]
    if previous_joined_date_df is None
    else [previous_joined_date_df, new_joined_date_df]
)

# The newest rows come last, so keep='last' lets a re-scraped player replace
# the older row with the same fbref_name
joined_date_df = pd.concat(frames_to_combine).drop_duplicates(subset=['fbref_name'], keep='last')

In [79]:
# Name of the table that stores one joined date per player
player_joined_date_table = "player_joined_date"

# Open the SQLite database and get a cursor for issuing statements
conn = sqlite3.connect("C:/Users/erknud3/fpl-optimization/model/FBRef_DB/master.db")
cursor = conn.cursor()

def create_or_replace_table(table_name, dataframe, cursor, conn):
    """Replace the contents of a SQLite table with the rows of a DataFrame.

    If the table already exists, all of its rows are deleted and the frame is
    appended to it. Otherwise the table is created from the frame. The
    DataFrame index is not written.

    Args:
        table_name: Name of the target table.
        dataframe: pandas DataFrame holding the rows to store.
        cursor: Cursor used for the existence check and the delete.
        conn: Connection handed to ``DataFrame.to_sql``. This function does
            not call ``conn.commit()`` itself.
    """
    # Bind the table name as a parameter instead of formatting it into the SQL
    # text, so quotes in the name cannot break or alter the lookup
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
        (table_name,),
    )
    table_exists = cursor.fetchone()

    if table_exists:
        # If the table exists, truncate (delete all rows)
        print(f"Table '{table_name}' found. Truncating table...")

        # Identifiers cannot be bound as parameters, so quote the name
        # (doubling embedded double quotes) to support spaces and keywords
        quoted_table_name = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"DELETE FROM {quoted_table_name};")

        # Insert data into the now-empty table
        dataframe.to_sql(table_name, conn, if_exists="append", index=False)
        print(f"Data inserted into table '{table_name}'.")
    else:
        # If the table does not exist, create it and insert data
        print(
            f"Table '{table_name}' not found. Creating table and inserting data..."
        )
        dataframe.to_sql(table_name, conn, if_exists="replace", index=False)
        print(f"Table '{table_name}' created and data inserted.")

# Replace the contents of the player_joined_date table with joined_date_df
try:
    create_or_replace_table(player_joined_date_table, joined_date_df, cursor, conn)

    # Commit the transaction
    conn.commit()
finally:
    # Close the connection even when the write fails, so the database
    # handle is never left open by a failed cell run
    conn.close()

Table 'player_joined_date' found. Truncating table...
Data inserted into table 'player_joined_date'.


In [80]:
# Players for which no joined date is stored
not_found = joined_date_df.loc[joined_date_df['joined_date'].isna()]

In [81]:
# Keep only the fpl_master rows of players that still lack a joined date
fpl_master_missing = pd.merge(
    fpl_master,
    not_found.loc[:, ['fbref_name', 'joined_date']],
    how='inner',
    on='fbref_name',
)

In [82]:
# One row per distinct ('fbref_name', 'transfermarkt') pair among the players without a joined date
unique_players = fpl_master_missing.loc[:, ['fbref_name', 'transfermarkt']].drop_duplicates()

# Build the profile URL for each of those players: name slug + integer Transfermarkt id
transfermarkt_url = [
    "https://www.transfermarkt.com/{}/profil/spieler/{}".format(
        clean_player_name(player), int(transfermarkt_id)
    )
    for player, transfermarkt_id in unique_players.itertuples(index=False, name=None)
]

In [83]:
# Show the profile URLs of the players that still lack a joined date
transfermarkt_url

['https://www.transfermarkt.com/marcus-rashford/profil/spieler/258923',
 'https://www.transfermarkt.com/jadon-sancho/profil/spieler/401173',
 'https://www.transfermarkt.com/josko-gvardiol/profil/spieler/475959']

In [87]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect("C:/Users/erknud3/fpl-optimization/model/FBRef_DB/master.db")

# Hand-entered joined dates as (joined_date, transfermarkt id) pairs, for the
# players whose scraped joined date came back as None
updates = [
    ('2016-01-01', 258923),
    ('2024-08-30', 401173),
    ('2023-08-05', 475959)
]

# Parameterized statement; the values are bound per row by executemany
update_sql = """
    UPDATE player_joined_date
    SET joined_date = ?
    WHERE transfermarkt = ?;
"""

try:
    cursor = conn.cursor()

    # Perform all updates in one call
    cursor.executemany(update_sql, updates)

    # Report how many rows were touched: an id that matches nothing in the
    # table would otherwise pass silently
    print(f"Rows updated: {cursor.rowcount} (from {len(updates)} update statements)")

    # Commit the changes
    conn.commit()
finally:
    # Close the connection even when an update fails
    conn.close()

print("Updates completed successfully!")

Updates completed successfully!
