In [26]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import random

In [56]:
def nonblank_lines(filename):
    """Return the whitespace-stripped, non-empty lines of a text file.

    Args:
        filename: Path of the file to read (opened in text mode with the
            default encoding).

    Returns:
        A list of the file's lines in order, each stripped of leading and
        trailing whitespace; lines that are empty after stripping are omitted.
    """
    kept = []
    with open(filename) as handle:
        for raw in handle:
            text = raw.strip()
            if text:
                kept.append(text)
    return kept
    
def load_proxies_from_file(filename, shuffle=True):
    """Pick one proxy from a file and build a ``requests``-style proxies mapping.

    Every non-blank line of the file must be either ``host:port`` or
    ``host:port:username:password``.

    Args:
        filename: Path to the proxy list, one proxy per line.
        shuffle: When true (the default) the proxy is chosen at random; when
            false the first listed proxy is used, so runs are repeatable.

    Returns:
        A ``(new_proxies, proxy)`` tuple. ``new_proxies`` is a dict with
        ``"http"`` and ``"https"`` keys, or ``None`` when the file lists no
        proxies. ``proxy`` is the raw line that was chosen, or
        ``"Local Host"`` when the file lists no proxies.

    Raises:
        ValueError: If the chosen line is in neither supported format.
    """
    proxies = nonblank_lines(filename)
    if not proxies:
        return None, "Local Host"

    proxy = random.choice(proxies) if shuffle else proxies[0]

    # maxsplit=3 keeps any ':' characters inside the password intact.
    fields = proxy.split(':', 3)
    if len(fields) == 4:
        host, port, username, password = fields
        credentials = username + ":" + password + "@"
    elif len(fields) == 2:
        # Unauthenticated proxy: no user-info part in the URL.
        host, port = fields
        credentials = ""
    else:
        # The offending line is deliberately not echoed: it may hold credentials.
        raise ValueError(
            "Proxy entries must be 'host:port' or 'host:port:username:password'"
        )

    # Both schemes are sent to the same proxy endpoint, addressed over http://.
    proxy_url = "http://" + credentials + host + ":" + port
    new_proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    return new_proxies, proxy

# Select the proxy configuration used by the requests in the cells below.
# `new_proxies` is the mapping passed as `proxies=`; `proxy` is the raw entry
# that was picked (or "Local Host" when the file lists no proxies).
proxy_file = "proxies.txt"
new_proxies, proxy = load_proxies_from_file(filename=proxy_file, shuffle=True)

In [28]:
# Fetch the squad page and derive one 2023-2024 match-log URL per listed player.
session = requests.Session()
comp = "c9"  # competition segment placed in every match-log URL
url = "https://fbref.com/en/squads/822bd0ba/Liverpool-Stats"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}
# A timeout keeps a stalled proxy or server from hanging the notebook forever.
response = session.get(url, headers=headers, proxies=new_proxies, timeout=30)
# Stop on 4xx/5xx here rather than parsing an error page below.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
squad_list_table = soup.find('table', {'class': 'stats_table sortable min_width'})
if squad_list_table is None:
    raise RuntimeError("Squad stats table not found on " + url)
player_links = squad_list_table.find_all('a', href=True)
# Keep only profile links of the form /en/players/<id>/<Name-Slug>
# (exactly four slashes); deeper player links are excluded.
squad_player_links = [
    link['href']
    for link in player_links
    if link['href'].count('/') == 4 and link['href'].startswith("/en/players/")
]
# Use the complete name slug. Dropping its last '-'-separated piece turned
# "Dominik-Szoboszlai" into "Dominik" and a single-word slug into "".
formatted_urls = [
    f"https://fbref.com/en/players/{player_id}/matchlogs/2023-2024/{comp}/{name_slug}-Match-Logs"
    for player_id, name_slug in (link.split('/')[3:5] for link in squad_player_links)
]

In [35]:
# Read the match-log column names from one outfield player's page; they become
# the column labels of the outfield DataFrame.
url_player = 'https://fbref.com/en/players/934e1968/matchlogs/2023-2024/c9/Dominik-Szoboszlai-Match-Logs' #Any Random outfield player to set the headers
headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
}
# A timeout keeps a stalled proxy or server from hanging the notebook forever.
response_player = session.get(url_player, headers=headers, proxies=new_proxies, timeout=30)
# Stop on 4xx/5xx here rather than parsing an error page below.
response_player.raise_for_status()
soup_player = BeautifulSoup(response_player.text, "html.parser")
# `string=` is the current spelling of the deprecated `text=` filter.
date_header_player = soup_player.find("th", string="Date")
if date_header_player is None:
    raise RuntimeError("No 'Date' column header found on " + url_player)
header_row_player = date_header_player.find_parent("tr")
# "Player Name" is prepended because each scraped row gets the name put first.
headers_player = ["Player Name"] + [
    th.get_text(strip=True) for th in header_row_player.find_all("th")
]

In [38]:
# Read the match-log column names from one goalkeeper's page; they become the
# column labels of the goalkeeper DataFrame.
url_keeper = 'https://fbref.com/en/players/7a2e46a8/matchlogs/2023-2024/c9/Alisson-Match-Logs' #Any Random Goalkeeper to set the headers
headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
}
# A timeout keeps a stalled proxy or server from hanging the notebook forever.
response_keeper = session.get(url_keeper, headers=headers, proxies=new_proxies, timeout=30)
# Stop on 4xx/5xx here rather than parsing an error page below.
response_keeper.raise_for_status()
soup_keeper = BeautifulSoup(response_keeper.text, "html.parser")
# `string=` is the current spelling of the deprecated `text=` filter.
date_header_keeper = soup_keeper.find("th", string="Date")
if date_header_keeper is None:
    raise RuntimeError("No 'Date' column header found on " + url_keeper)
header_row_keeper = date_header_keeper.find_parent("tr")
# "Player Name" is prepended because each scraped row gets the name put first.
headers_keeper = ["Player Name"] + [
    th.get_text(strip=True) for th in header_row_keeper.find_all("th")
]

['Player Name', 'Date', 'Day', 'Round', 'Venue', 'Result', 'Squad', 'Opponent', 'Start', 'Pos', 'Min', 'SoTA', 'GA', 'Saves', 'Save%', 'CS', 'PSxG', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Cmp', 'Att', 'Cmp%', 'Att', 'Thr', 'Launch%', 'AvgLen', 'Att', 'Launch%', 'AvgLen', 'Opp', 'Stp', 'Stp%', '#OPA', 'AvgDist', 'Match Report']


In [58]:
# Scrape every match-log page in `formatted_urls`, split the rows into
# goalkeeper and outfield tables, and write each table to a ';'-separated CSV.
player_data = []
keeper_data = []
# Request headers and the title pattern do not change per player: build once.
headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
}
# Removing this suffix from the page title leaves the player's name.
pattern = r"2023-2024 (Premier League|Serie A|La Liga|Ligue 1|Bundesliga) Match Logs( \(Goalkeeping\))? \| FBref\.com"
# The loop variable is not called `url`, so the squad-page `url` is not clobbered.
for player_url in formatted_urls:
    # A timeout keeps a stalled proxy or server from hanging the whole scrape.
    response_data = session.get(player_url, headers=headers, proxies=new_proxies, timeout=30)
    if not response_data.ok:
        # An error response carries no match log; report it so a missing
        # player is visible instead of silently absent from the CSV.
        print(f"Skipping {player_url}: HTTP {response_data.status_code}")
        continue
    soup_data = BeautifulSoup(response_data.text, "html.parser")
    # Guard against a page with no <title> (or an empty one).
    title_tag = soup_data.title
    title_data = title_tag.string if title_tag is not None and title_tag.string else ""
    player_name = re.sub(pattern, "", title_data).strip()
    # `string=` is the current spelling of the deprecated `text=` filter.
    date_header = soup_data.find("th", string="Date")
    if date_header is None:  # no match-log table, e.g. the player never played
        continue
    header_rows = date_header.find_parent("tr")
    data_rows = header_rows.find_all_next("tr")
    is_keeper = "Goalkeeping" in title_data
    for row in data_rows[0:-1]:  # the final row is the totals line, so skip it
        # Only rows without a class attribute are kept (per the original note,
        # classed rows are spacers and matches not played).
        if row.get('class') is None:
            cells = row.find_all(['th', 'td'])
            row_data = [player_name] + [cell.get_text(strip=True) for cell in cells]
            if is_keeper:
                keeper_data.append(row_data)
            else:
                player_data.append(row_data)
df_player = pd.DataFrame(player_data, columns=headers_player)
df_keeper = pd.DataFrame(keeper_data, columns=headers_keeper)
df_player.to_csv("Liverpoolplayer.csv", sep=';', encoding='utf-8', index=False)
df_keeper.to_csv("Liverpoolkeeper.csv", sep=';', encoding='utf-8', index=False)

Number of headers: 37
Number of headers: 37
