In [43]:
import sqlite3
import requests
import pandas as pd
from sqlalchemy import create_engine
from unidecode import unidecode
import numpy as np
from fuzzywuzzy import fuzz, process
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from highlight_text import fig_text
from qbstyles import mpl_style
import requests
from bs4 import BeautifulSoup
import re
import time
import os

def extract_tables_from_url(url):
    """Read every HTML table from ``url``, retrying once with a rewritten URL.

    The first attempt reads ``url`` as given.  If that attempt raises, or
    yields no tables, the URL is rewritten ('c9' -> 'c10' and
    'Premier-League' -> 'Championship') and read again.

    Args:
        url: Page URL handed to ``pandas.read_html``.

    Returns:
        The list of DataFrames produced by ``pandas.read_html``.

    Raises:
        Whatever ``pandas.read_html`` raises for the rewritten URL when the
        second attempt fails as well.
    """
    try:
        tables = pd.read_html(url)  # Try the URL as given first
        if not tables:
            # Treat "no tables" like a failed read so the fallback runs.
            raise ValueError(f'no tables found at {url}')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C and
        # SystemExit still abort a long scraping run.
        # NOTE(review): the 'c9' replacement is applied to the whole URL, so
        # it would also rewrite a 'c9' occurring elsewhere in the path.
        url = url.replace('c9', 'c10')
        url = url.replace('Premier-League', 'Championship')
        tables = pd.read_html(url)  # Second (and last) attempt
    return tables

def get_team_name(url):
    """Return the team name taken from the ``<title>`` of the page at ``url``.

    The name is the part of the title before the word 'Scores'.  When the
    title contains a digit, the first whitespace-separated token of that
    part is dropped as well (presumably a season prefix such as
    '2023-2024' -- confirm against live page titles).

    Args:
        url: Page URL to download.

    Returns:
        The extracted name as a string.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title').get_text()

    # Both branches of the original started from the same prefix.
    name = title.split('Scores')[0].strip()

    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    if re.match(r'.*\d.*', title):
        # Drop the leading token.
        name = ' '.join(name.split()[1:])
    return name

def lists_to_dfs(lists):
    """Stack several 18-column tables into one DataFrame.

    Each element of ``lists`` is converted to a NumPy array, flattened to
    two dimensions (keeping the size of its last axis as the column count)
    and labelled with the fixed match-log column names.  The resulting
    frames are concatenated with a fresh 0..n-1 index.

    Args:
        lists: Iterable of array-like tables whose last axis has 18 entries.

    Returns:
        A single DataFrame containing the rows of every table, in order.
    """
    column_names = [
        'date', 'time', 'round', 'day', 'venue', 'result', 'gf', 'ga', 'opponent',
        'xg', 'xga', 'possession', 'attendance', 'captain', 'formation', 'referee',
        'match report', 'notes',
    ]

    def as_frame(table):
        # Collapse any leading axes so every row has ``shape[-1]`` values.
        values = np.array(table)
        flat = values.reshape(-1, values.shape[-1])
        return pd.DataFrame(flat, columns=column_names)

    frames = [as_frame(table) for table in lists]
    return pd.concat(frames, ignore_index=True)

def prepare_df(df):
    """Clean a raw match-log frame and add 10-match rolling xG averages.

    Steps, in order: drop the descriptive columns that are not analysed,
    drop rows with any missing value, parse ``date``, reduce string scores
    in ``gf``/``ga`` to the integer before the first space, cast ``xg`` and
    ``xga`` to float and ``possession`` to int, then add ``xg_roll`` and
    ``xga_roll`` (mean over a full window of 10 rows, NaN before that).

    Args:
        df: DataFrame with the match-log columns built by ``lists_to_dfs``.

    Returns:
        The cleaned DataFrame.
    """
    unused_columns = ['time', 'day', 'captain', 'formation', 'referee',
                      'match report', 'notes', 'attendance']
    df = df.drop(columns=unused_columns).dropna()

    def leading_int(value):
        # String scores keep only the part before the first space;
        # non-string values pass through untouched.
        if isinstance(value, str):
            return int(value.split(' ')[0])
        return value

    df['date'] = pd.to_datetime(df['date'])
    for goals_col in ('gf', 'ga'):
        df[goals_col] = df[goals_col].apply(leading_int)
    for xg_col in ('xg', 'xga'):
        df[xg_col] = df[xg_col].astype(float)
    df['possession'] = df['possession'].astype(int)

    for xg_col in ('xg', 'xga'):
        df[f'{xg_col}_roll'] = df[xg_col].rolling(window=10, min_periods=10).mean()
    return df

def sort_team_df(url):
    """Download one team's match-log page and return it as a cleaned frame.

    A first request is made only to check for HTTP 429; when the server
    answers 429 the function sleeps for the number of seconds given in the
    ``Retry-After`` header (5 if the header is absent).  The tables are then
    read with ``extract_tables_from_url``, combined with ``lists_to_dfs`` and
    cleaned with ``prepare_df``.

    Compared with the earlier version, two requests whose results were never
    used have been removed: the immediate retry after a 429 and the
    ``get_team_name`` lookup.  Each was an extra hit on a rate-limited site.

    Args:
        url: Match-log page URL.

    Returns:
        The DataFrame produced by ``prepare_df``.
    """
    response = requests.get(url)
    if response.status_code == 429:
        # Default to 5 seconds if the header is missing.
        retry_after = int(response.headers.get('Retry-After', 5))
        time.sleep(retry_after)

    table_extraction = extract_tables_from_url(url)
    time.sleep(5)  # Fixed pause after each page read
    return prepare_df(lists_to_dfs(table_extraction))

def replace_with_fuzzy_match(original, choices, scorer=fuzz.token_sort_ratio):
    """Return the entry of ``choices`` that best matches ``original``.

    Args:
        original: Value to look up.
        choices: Candidate values passed to ``process.extractOne``.
        scorer: Similarity function forwarded to ``process.extractOne``.

    Returns:
        The best-matching choice; its score is discarded.
    """
    best_match, _score = process.extractOne(original, choices, scorer=scorer)
    return best_match

def fetch_and_process_fbref_data(url, type=2):
    """Download a page and return one of its HTML tables as a DataFrame.

    The page text has the HTML comment delimiters ``<!--`` and ``-->``
    stripped before parsing (presumably because some tables are shipped
    inside comments -- confirm against the live pages).  Tables are parsed
    with the second header row as column labels.

    Args:
        url: Page URL to download.
        type: Index of the table to return from the parsed page.  When it is
            2, the table is additionally cleaned: repeated header rows are
            removed, ``Nation`` is reduced to its three-letter upper-case
            code, ``Pos`` keeps only the first comma-separated entry, the
            ``Matches`` column is dropped, missing values become 0 and the
            index is reset.  (The name shadows the builtin but is kept for
            backward compatibility with keyword callers.)

    Returns:
        The selected, and for ``type == 2`` cleaned, DataFrame.
    """
    from io import StringIO

    # Fetch the data
    html = requests.get(url).text.replace('<!--', '').replace('-->', '')

    # Passing a literal HTML string to read_html is deprecated in recent
    # pandas; a file-like wrapper works on old and new versions alike.
    df = pd.read_html(StringIO(html), header=1)[type]

    # Clean up the data
    if type == 2:
        # Work on an explicit copy so the assignments below do not write
        # into a slice of the parsed table.
        df = df[~df['Player'].isin(['Player'])].copy()
        df['Nation'] = df['Nation'].str.extract('([A-Z]{3})')
        df['Pos'] = df['Pos'].str.split(',').str[0]
        df = df.drop('Matches', axis=1)
        df = df.fillna(0)

        # Reset the index
        df = df.reset_index(drop=True)
    # Other table indices are returned as parsed.

    return df

def update_names_gen(df):
    """Return a copy of ``df`` with the general-stats columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'PrgC': 'Progressive Carries',
        'PrgP': 'Progressive Passes',
        'G+A-PK': 'G+A-PK per 90',
    }
    # Columns carrying a '.1' suffix are relabelled '<stat> per 90'.
    for stat in ('Gls', 'Ast', 'G+A', 'G-PK', 'xG', 'xAG', 'npxG', 'npxG+xAG'):
        column_labels[f'{stat}.1'] = f'{stat} per 90'
    return df.rename(columns=column_labels)

def update_names_sh(df):
    """Return a copy of ``df`` with the shooting-table columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'MP': '90s',
        'SoT%': 'SoT Pct',
        'Sh/90': 'Sh per 90',
        'SoT/90': 'SoT per 90',
        'G/Sh': 'Goals per Shot',
        'G/SoT': 'Goals per SoT',
        'Dist': 'Average Shot Distance',
        'npxG/Sh': 'npxG per Shot',
        'np:G-xG': 'npG - npxG',
    }
    return df.rename(columns=column_labels)

def update_names_pass(df):
    """Return a copy of ``df`` with the passing-table columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'Cmp': 'Passes Completed',
        'Att': 'Passes Attempted',
        'Cmp%': 'Pass Completion Pct',
        'TotDist': 'Total Passing Dist',
        'PrgDist': 'Progressive Passing Dist',
        'KP': 'Key Passes',
        '1/3': 'Passes Into Final Third',
        'PPA': 'Passes Into Pen Area',
        'CrsPA': 'Crosses Into Pen Area',
        'PrgP': 'Progressive Passes',
    }
    # The '.1', '.2' and '.3' suffixed columns are the short / medium / long
    # variants of the three completion columns.
    for suffix, band in (('.1', 'Short'), ('.2', 'Medium'), ('.3', 'Long')):
        column_labels[f'Cmp{suffix}'] = f'Passes Completed ({band})'
        column_labels[f'Att{suffix}'] = f'Passes Attempted ({band})'
        column_labels[f'Cmp%{suffix}'] = f'Pass Completion Pct ({band})'
    return df.rename(columns=column_labels)

def update_names_passtype(df):
    """Return a copy of ``df`` with the pass-types columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'Att': 'Passes Attempted',
        'Live': 'Live-ball Passes',
        'Dead': 'Dead-ball Passes',
        'FK': 'Passes from FK',
        'TB': 'Through Balls',
        'Sw': 'Switches',
        'Crs': 'Crosses',
        'TI': 'Throw-Ins',
        'CK': 'Corner Kicks',
        'In': 'Inswinging CK',
        'Out': 'Outswinging CK',
        'Str': 'Straight CK',
        'Cmp': 'Passes Completed',
        'Off': 'Passes Offside',
        'Blocks': 'Passes Blocked',
    }
    return df.rename(columns=column_labels)

def update_names_gca(df):
    """Return a copy of ``df`` with the shot/goal-creation columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'SCA': 'Shot Creating Actions',
        'SCA90': 'Shot Creating Actions per 90',
        'GCA': 'Goal Creating Actions',
        'GCA90': 'Goal Creating Actions per 90',
    }
    action_labels = {
        'PassLive': 'PassLive',
        'PassDead': 'PassDead',
        'TO': 'Take-ons',
        'Sh': 'Shots',
        'Fld': 'Fouled',
        'Def': 'Defensive Actions',
    }
    # The unsuffixed column gets the 'SCA' prefix, the '.1' one gets 'GCA'.
    for short, label in action_labels.items():
        column_labels[short] = f'SCA {label}'
        column_labels[f'{short}.1'] = f'GCA {label}'
    return df.rename(columns=column_labels)

def update_names_def(df):
    """Return a copy of ``df`` with the defensive-actions columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'Tkl': 'Tackles',
        'TklW': 'Tackles Won',
        'Def 3rd': 'Tackles Def 3rd',
        'Mid 3rd': 'Tackles Mid 3rd',
        'Att 3rd': 'Tackles Att 3rd',
        'Tkl.1': 'Dribblers Tackled',
        'Att': 'Dribblers Challenged',
        'Tkl%': 'Dribblers Tackled Pct',
        'Lost': 'Dribblers Challenged Lost',
        'Blocks': 'Blocks',  # identity entry, kept from the original mapping
        'Sh': 'Shots Blocked',
        'Pass': 'Passes Blocked',
        'Int': 'Interceptions',
        'Tkl+Int': 'Tackles and Interceptions',
        'Clr': 'Clearances',
        'Err': 'Errors Leading to Shots',
    }
    return df.rename(columns=column_labels)

def update_names_pos(df):
    """Return a copy of ``df`` with the possession-table columns renamed.

    Columns not listed in the mapping are left unchanged.  Some output
    labels are misspelled; they are reproduced exactly because they become
    stored column names.
    """
    column_labels = {
        'Live': 'Touches (Live-Ball)',
        'Att': 'Take-Ons Attempted',
        'Succ': 'Successful Take-Ons',
        'Succ%': 'Successful Take-On %',
        'Tkld': 'Times Tackled During Take-Ons',
        'Tkld%': 'Tackled During Take-On %',
        'TotDist': 'Total Carrying Distance',
        'PrgDist': 'Progressive Carrying Distance',
        'PrgC': 'Progressive Carries',
        '1/3': 'Carries Into Final 3rd',
        'CPA': 'Carries Into Penalty Area',
        'Mis': 'Miscontrols',
        'Dis': 'Disposessed',
        'Rec': 'Passes Recieved',
        'PrgR': 'Progressive Passes Receieved',
    }
    # Pitch-zone columns become 'Touches (<zone>)'.
    for zone in ('Def Pen', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'Att Pen'):
        column_labels[zone] = f'Touches ({zone})'
    return df.rename(columns=column_labels)

def update_names_time(df):
    """Return a copy of ``df`` with the playing-time columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'Min': 'Mins',
        'Mn/MP': 'Minutes per Matches Played',
        'Min%': 'Minutes Played Pct',
        '90s': '90s',        # identity entry, kept from the original mapping
        'Starts': 'Starts',  # identity entry, kept from the original mapping
        'Mn/Start': 'Minutes Per Start',
        'Compl': 'Complete Matches Played',
        'Subs': 'Sub Apps',
        'Mn/Sub': 'Minutes Per Sub',
        'unSub': 'Unused Subs',
        'PPM': 'Points Per Match',
        'onG': 'Goals Scored (On Pitch)',
        'onGA': 'Goals Allowed (On Pitch)',
        '+/-': 'Scored Minus Allowed (On Pitch)',
        '+/-90': 'Scored Minus Allowed (On Pitch) per 90',
        'On-Off': 'Scored Minus Allowed Net (On Pitch) per 90',
        'onxG': 'xG (On Pitch)',
        'onxGA': 'xGA (On Pitch)',
        'xG+/-': 'xG Minus xGA (On Pitch)',
        'xG+/-90': 'xG Minus xGA (On Pitch) per 90',
        'On-Off.1': 'xG Minus xGA Net (On Pitch) per 90',
    }
    return df.rename(columns=column_labels)

def update_names_misc(df):
    """Return a copy of ``df`` with the miscellaneous-stats columns renamed.

    Columns not listed in the mapping are left unchanged.
    """
    column_labels = {
        'Fls': 'Fouls',
        'Fld': 'Fouled',
        'Off': 'Offsides',
        'Crs': 'Crosses',
        'Int': 'Interceptions',
        'TklW': 'Tackles Won',
        'PKwon': 'PK Won',
        'PKcon': 'PK Conceded',
        'OG': 'Own Goals',
        'Recov': 'Balls Recovered',
        'Won': 'Aerial Duels Won',
        'Lost': 'Aerial Duels Lost',
        'Won%': 'Aerial Duels Win Pct',
    }
    return df.rename(columns=column_labels)

def replace_special_characters(name):
    """Transliterate a player name to plain ASCII.

    The function used to be defined twice, back to back, with identical
    bodies; only one definition is kept.

    Args:
        name: Value to convert.

    Returns:
        ``unidecode(name)`` when ``name`` is a string; any other value
        (e.g. a missing-value marker) is returned unchanged.
    """
    if isinstance(name, str):
        return unidecode(name)
    return name

def convert_to_years(age_str):
    """Return the whole-year part of a hyphenated age string.

    Args:
        age_str: A string whose part before the first hyphen is an integer,
            e.g. ``'25-123'``.  Anything else is tolerated.

    Returns:
        The integer before the first hyphen, or ``age_str`` unchanged when
        it cannot be parsed (non-string input, or a non-numeric prefix).
    """
    try:
        # Split the string on hyphen and return the first part as integer
        return int(age_str.split('-')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (numbers, None, NaN);
        # TypeError: .split rejected the separator (e.g. bytes input);
        # ValueError: the prefix is not an integer literal.
        # A bare ``except`` would also have swallowed KeyboardInterrupt.
        return age_str

def sort_df(df, season):
    """De-duplicate a player table, make its stat columns numeric and tag it.

    Steps: keep the first row per ``Player``; keep the first of any
    duplicated column labels; reduce ``Age`` with ``convert_to_years``; cast
    ``Age`` and every column to its right to float; transliterate ``Player``
    with ``replace_special_characters``; add a ``season_years`` column.

    Unlike the earlier version, the caller's frame is not modified: the
    de-duplication is no longer in place and the work happens on a copy, so
    no assignment writes into a slice of the input.

    Args:
        df: Player table with at least ``Player`` and ``Age`` columns, where
            every column from ``Age`` onwards is convertible to float.
        season: Value stored in the new ``season_years`` column.

    Returns:
        A new, cleaned DataFrame.
    """
    df = df.drop_duplicates(subset='Player', keep='first')
    df = df.loc[:, ~df.columns.duplicated()].copy()
    df['Age'] = df['Age'].apply(convert_to_years)

    # Assign whole columns rather than ``df.loc[:, 'Age':] = ...``: on
    # pandas >= 2 the .loc form writes into the existing columns and keeps
    # their object dtype, so the float cast would not change the dtypes.
    numeric_cols = df.loc[:, 'Age':].columns
    df[numeric_cols] = df[numeric_cols].astype(float)

    df['Player'] = df['Player'].apply(replace_special_characters)
    df['season_years'] = season
    return df

In [44]:
################################################################
# Overall player data
################################################################
# 'fbref_data_players_latest.db'  - latest season
# 'fbref_data_players_archive.db' - all years before

# Databases live under <current working directory>/data/databases.
current_dir = os.getcwd()
new_directory = os.path.join(current_dir, "data", "databases")

seasons = ['2023-2024']  # earlier runs: ['2020-2021', '2021-2022', '2022-2023']
db_name = os.path.join(new_directory, 'fbref_data_players_latest.db')
engine = create_engine(f'sqlite:///{db_name}')

# Destination table names, and the URL path segment each one is read from
# (the two lists are parallel).
table_names = ['general', 'keepers', 'keepers_adv', 'shooting', 'passing',
               'passing_types', 'gca', 'defense', 'possession', 'playingtime', 'misc']
url_sections = ['stats', 'keepers', 'keepersadv', 'shooting', 'passing',
                'passing_types', 'gca', 'defense', 'possession', 'playingtime', 'misc']

# Column-renaming helper per table; the keeper tables are stored as fetched.
column_renamers = {
    'general': update_names_gen,
    'shooting': update_names_sh,
    'passing': update_names_pass,
    'passing_types': update_names_passtype,
    'gca': update_names_gca,
    'defense': update_names_def,
    'possession': update_names_pos,
    'playingtime': update_names_time,
    'misc': update_names_misc,
}

for season in seasons:
    print(season)
    urls = [
        f'https://fbref.com/en/comps/9/{season}/{section}/{season}-Premier-League-Stats'
        for section in url_sections
    ]
    # The 2023-2024 season overwrites each table; any other season is appended.
    write_mode = 'replace' if season == '2023-2024' else 'append'

    # Fetch, process and store the data (table index 2 = the player table).
    for url, table_name in zip(urls, table_names):
        df = fetch_and_process_fbref_data(url, 2)
        rename_columns = column_renamers.get(table_name)
        if rename_columns is not None:
            df = rename_columns(df)
        df = sort_df(df, season)
        df.to_sql(table_name, con=engine, if_exists=write_mode)

print('done')

2023-2024
done


In [45]:
################################################################
# Overall team data - for (0) and against (1)
################################################################
season = '2023-2024'  # also run for: 2020-2021 2021-2022 2022-2023

# Destination table names, and the URL path segment each one is read from
# (the two lists are parallel).
table_names = ['general', 'keepers', 'keepers_adv', 'shooting', 'passing',
               'passing_types', 'gca', 'defense', 'possession', 'playingtime', 'misc']
url_sections = ['stats', 'keepers', 'keepersadv', 'shooting', 'passing',
                'passing_types', 'gca', 'defense', 'possession', 'playingtime', 'misc']
urls = [
    f'https://fbref.com/en/comps/9/{season}/{section}/{season}-Premier-League-Stats'
    for section in url_sections
]

# Column-renaming helper per table; the keeper tables are stored as fetched.
column_renamers = {
    'general': update_names_gen,
    'shooting': update_names_sh,
    'passing': update_names_pass,
    'passing_types': update_names_passtype,
    'gca': update_names_gca,
    'defense': update_names_def,
    'possession': update_names_pos,
    'playingtime': update_names_time,
    'misc': update_names_misc,
}

# Databases live under <current working directory>/data/databases.
current_dir = os.getcwd()
new_directory = os.path.join(current_dir, "data", "databases")

# Fetch, process and store the data: page table 0 is labelled 'for',
# page table 1 'against'; each goes to its own database file.
for team_data_type, status in enumerate(('for', 'against')):
    print(team_data_type)

    # Create a new SQLite database (or connect to an existing one)
    db_name = os.path.join(new_directory, f'fbref_{status}_team_data_overall_{season}.db')
    engine = create_engine(f'sqlite:///{db_name}')

    for url, table_name in zip(urls, table_names):
        print(table_name)
        df = fetch_and_process_fbref_data(url, team_data_type)
        # Remove the "vs " marker (any letter case) from the squad names.
        df['Squad'] = df['Squad'].str.replace("vs ", "", case=False)
        rename_columns = column_renamers.get(table_name)
        if rename_columns is not None:
            df = rename_columns(df)
        df.to_sql(f'teams_{status}_{table_name}', con=engine, if_exists='replace')

print('done')

0
general
keepers
keepers_adv
shooting
passing
passing_types
gca
defense
possession
playingtime
misc
1
general
keepers
keepers_adv
shooting
passing
passing_types
gca
defense
possession
playingtime
misc
done


In [46]:
########################################################################################
# Team individuals - to calculate xg and xa on a game by game basis
########################################################################################

years = ['2023-2024']  # earlier runs: '2020-2021', '2021-2022', '2022-2023'

# FBref squad codes and the matching URL slugs (parallel lists, same order).
team_code = [
    '18bb7c10', '8602292d', '4ba7cbea', 'cd051869', 'd07537b9',
    'cff3d9bb', '47c64c55', 'd3fd31cc', 'fd962109',
    '822bd0ba', 'b8fd03ef', '19538871', 'b2b47a98',
    'e4a775cb', '361ca564', '7c21e445', '8cec06e1',
    '943e8050', 'e297cd13', '1df6b87e',
]
team_name = [
    'Arsenal', 'Aston-Villa', 'Bournemouth', 'Brentford', 'Brighton-and-Hove-Albion',
    'Chelsea', 'Crystal-Palace', 'Everton', 'Fulham',
    'Liverpool', 'Manchester-City', 'Manchester-United', 'Newcastle-United',
    'Nottingham-Forest', 'Tottenham-Hotspur', 'West-Ham-United', 'Wolverhampton-Wanderers',
    'Burnley', 'Luton-Town', 'Sheffield-United',
]

# Databases live under <current working directory>/data/databases.
current_dir = os.getcwd()
new_directory = os.path.join(current_dir, "data", "databases")

# Create a new SQLite database (or connect to an existing one)
db_name = os.path.join(new_directory, 'fbref_data_team_individual.db')
engine = create_engine(f'sqlite:///{db_name}')

for year in years:
    for code, name in zip(team_code, team_name):
        url = (
            f"https://fbref.com/en/squads/{code}/{year}/matchlogs/c9/schedule/"
            f"{name}-Scores-and-Fixtures-Premier-League"
        )
        print(url)
        df = sort_team_df(url)
        df['team'] = name
        df['year'] = year
        # Rows are appended, so re-running this cell adds the same matches again.
        df.to_sql('team_individual_tbl', con=engine, if_exists='append')

print('done')

https://fbref.com/en/squads/18bb7c10/2023-2024/matchlogs/c9/schedule/Arsenal-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/8602292d/2023-2024/matchlogs/c9/schedule/Aston-Villa-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/4ba7cbea/2023-2024/matchlogs/c9/schedule/Bournemouth-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/cd051869/2023-2024/matchlogs/c9/schedule/Brentford-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/d07537b9/2023-2024/matchlogs/c9/schedule/Brighton-and-Hove-Albion-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/cff3d9bb/2023-2024/matchlogs/c9/schedule/Chelsea-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/47c64c55/2023-2024/matchlogs/c9/schedule/Crystal-Palace-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/d3fd31cc/2023-2024/matchlogs/c9/schedule/Everton-Scores-and-Fixtures-Premier-League
https://fbref.com/en/squads/fd962109/2023-2024/matchlogs/c9/schedule/F

In [47]:
# Databases live under <current working directory>/data/databases.
current_dir = os.getcwd()
new_directory = os.path.join(current_dir, "data", "databases")

# Path of the per-team match-log database
db_name_1 = os.path.join(new_directory, 'fbref_data_team_individual.db')

# Load the whole table; the connection is closed even if the query fails
# (e.g. the table does not exist yet).
conn = sqlite3.connect(db_name_1)
try:
    df_individual = pd.read_sql_query('SELECT * FROM team_individual_tbl', conn)
finally:
    conn.close()

df_individual.tail(20)

Unnamed: 0,index,date,round,venue,result,gf,ga,opponent,xg,xga,possession,xg_roll,xga_roll,team,year
2911,8,2023-10-21 00:00:00.000000,Matchweek 9,Away,D,2.0,2.0,Nott'ham Forest,1.7,2.9,43,,,Luton-Town,2023-2024
2912,9,2023-10-29 00:00:00.000000,Matchweek 10,Away,L,1.0,3.0,Aston Villa,0.5,2.3,30,1.17,1.99,Luton-Town,2023-2024
2913,10,2023-11-05 00:00:00.000000,Matchweek 11,Home,D,1.0,1.0,Liverpool,0.8,2.6,27,1.1,1.85,Luton-Town,2023-2024
2914,11,2023-11-11 00:00:00.000000,Matchweek 12,Away,L,0.0,1.0,Manchester Utd,0.4,2.2,36,1.1,1.85,Luton-Town,2023-2024
2915,12,2023-11-25 00:00:00.000000,Matchweek 13,Home,W,2.0,1.0,Crystal Palace,0.9,1.6,39,1.05,1.91,Luton-Town,2023-2024
2916,13,2023-12-02 00:00:00.000000,Matchweek 14,Away,L,1.0,3.0,Brentford,0.2,3.1,33,0.96,2.12,Luton-Town,2023-2024
2917,0,2023-08-12 00:00:00.000000,Matchweek 1,Home,L,0.0,1.0,Crystal Palace,0.5,1.9,32,,,Sheffield-United,2023-2024
2918,1,2023-08-18 00:00:00.000000,Matchweek 2,Away,L,1.0,2.0,Nott'ham Forest,0.5,1.4,50,,,Sheffield-United,2023-2024
2919,2,2023-08-27 00:00:00.000000,Matchweek 3,Home,L,1.0,2.0,Manchester City,0.7,3.5,21,,,Sheffield-United,2023-2024
2920,3,2023-09-02 00:00:00.000000,Matchweek 4,Home,D,2.0,2.0,Everton,1.2,2.5,46,,,Sheffield-United,2023-2024


In [49]:
########################################################################################
# Test and clean above database
########################################################################################

# Databases live under <current working directory>/data/databases.
current_dir = os.getcwd()
new_directory = os.path.join(current_dir, "data", "databases")

# Path of the per-team match-log database
db_name_1 = os.path.join(new_directory, 'fbref_data_team_individual.db')

# Load the whole table; the connection is closed even if the query fails.
conn = sqlite3.connect(db_name_1)
try:
    df_individual = pd.read_sql_query('SELECT * FROM team_individual_tbl', conn)
finally:
    conn.close()

# Columns that together identify one match row for one team.
dedupe_keys = ['date', 'round', 'venue', 'result', 'gf', 'ga', 'opponent', 'team', 'year']

df_individual = df_individual.drop_duplicates(subset=dedupe_keys)
df_individual = df_individual.reset_index(drop=True)

# Map the hyphenated URL slugs stored in 'team' to the short names used in
# the 'opponent' column; slugs without an entry are left as they are.
team_mapping = {
    'Wolverhampton-Wanderers': 'Wolves',
    'Aston-Villa': 'Aston Villa',
    'Brighton-and-Hove-Albion': 'Brighton',
    'Crystal-Palace': 'Crystal Palace',
    'Manchester-City': 'Manchester City',
    'Manchester-United': 'Manchester Utd',
    'Newcastle-United': 'Newcastle Utd',
    'Nottingham-Forest': "Nott'ham Forest",
    'Tottenham-Hotspur': 'Tottenham',
    'West-Ham-United': 'West Ham',
    'Luton-Town': 'Luton Town',
    'Sheffield-United': 'Sheffield Utd',
}
df_individual['team'] = df_individual['team'].replace(team_mapping)

# De-duplicate again: rows stored under a slug and under the mapped name
# only become identical after the mapping.
df_individual = df_individual.drop_duplicates(subset=dedupe_keys)
df_individual = df_individual.reset_index(drop=True)

df_individual['date'] = pd.to_datetime(df_individual['date'])
df_individual = df_individual.sort_values(by='date')

# Overwrite the table with the cleaned rows; close the connection even if
# the write fails.
conn = sqlite3.connect(db_name_1)
try:
    df_individual.to_sql('team_individual_tbl', conn, if_exists='replace', index=False)
finally:
    conn.close()

df_individual[df_individual['team']=="Wolves"]

Unnamed: 0,index,date,round,venue,result,gf,ga,opponent,xg,xga,possession,xg_roll,xga_roll,team,year
5,0,2020-09-12,Matchweek 1,Home,L,0.0,2.0,Newcastle Utd,1.0,1.6,58,,,Wolves,2020-2021
19,1,2020-09-19,Matchweek 2,Away,L,1.0,2.0,Arsenal,2.0,1.1,38,,,Wolves,2020-2021
46,2,2020-09-27,Matchweek 3,Home,W,4.0,0.0,Wolves,2.4,0.5,37,,,Wolves,2020-2021
66,3,2020-10-04,Matchweek 4,Away,W,3.0,0.0,Leicester City,2.0,0.6,31,,,Wolves,2020-2021
87,4,2020-10-18,Matchweek 5,Away,D,3.0,3.0,Tottenham,1.9,1.7,50,,,Wolves,2020-2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578,9,2023-10-28,Matchweek 10,Home,D,2.0,2.0,Newcastle Utd,0.9,2.2,43,1.32,1.77,Wolves,2023-2024
2601,10,2023-11-04,Matchweek 11,Away,L,1.0,2.0,Sheffield Utd,1.2,1.0,61,1.22,1.65,Wolves,2023-2024
2617,11,2023-11-11,Matchweek 12,Home,W,2.0,1.0,Tottenham,2.1,0.7,42,1.22,1.5,Wolves,2023-2024
2650,12,2023-11-27,Matchweek 13,Away,L,2.0,3.0,Fulham,1.4,2.9,40,1.25,1.66,Wolves,2023-2024
