In [1]:
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def nonblank_lines(filename):
    """Return the non-empty lines of a text file with whitespace stripped.

    Args:
        filename: Path of the file to read (opened in default text mode).

    Returns:
        list[str]: Every line with leading/trailing whitespace removed, in
        file order, leaving out lines that are empty after stripping.
    """
    kept = []
    with open(filename) as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                kept.append(text)
    return kept
    
def load_proxies_from_file(filename, shuffle=True):
    """Pick one random proxy from a file and build a ``requests`` proxy mapping.

    Each non-blank line of ``filename`` must look like either
    ``host:port:username:password`` or ``host:port``.

    Args:
        filename: Path to the proxy list, one proxy per line.
        shuffle: Accepted for backward compatibility only; it has no effect
            because a proxy is always drawn with ``random.choice``.

    Returns:
        tuple: ``(new_proxies, proxy)``. ``new_proxies`` is a dict with
        ``"http"`` and ``"https"`` keys meant for the ``proxies=`` argument of
        a ``requests`` call, and ``proxy`` is the raw line that was chosen.
        When the file contains no non-blank lines the result is
        ``(None, "Local Host")``.

    Raises:
        ValueError: If the chosen line has neither two nor four fields.
    """
    proxies = nonblank_lines(filename)
    if not proxies:
        return None, "Local Host"

    proxy = random.choice(proxies)
    # Cap the split at three separators so a password containing ':' stays intact.
    fields = proxy.split(':', 3)
    if len(fields) == 4:
        host, port, username, password = fields
        endpoint = f"http://{username}:{password}@{host}:{port}"
    elif len(fields) == 2:
        host, port = fields
        endpoint = f"http://{host}:{port}"
    else:
        # The line itself is deliberately not echoed: it may hold credentials.
        raise ValueError(
            f"Malformed proxy entry in {filename!r}: expected "
            "'host:port' or 'host:port:username:password'"
        )

    # Both schemes go through the same http:// proxy URL.
    new_proxies = {"http": endpoint, "https": endpoint}
    return new_proxies, proxy

# Location of the proxy list; the scraping loop re-reads it for every attempt.
proxy_file = 'proxies.txt'
# Draw the proxy used for the first request (shuffle already defaults to True).
new_proxies, proxy = load_proxies_from_file(proxy_file)

In [3]:
# Fetch the competition schedule page and collect the match-report URLs.
session = requests.Session()
url = "https://fbref.com/en/comps/8/schedule/Champions-League-Scores-and-Fixtures"
headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
}
# Without a timeout an unresponsive proxy would hang this cell indefinitely.
response = session.get(url, headers=headers, proxies=new_proxies, timeout=30)
# Stop on 4xx/5xx instead of parsing an error page as if it were the schedule.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
games_table = soup.find('table', {'id': 'sched_2023-2024_8_2'})
if games_table is None:
    # Fail here with a clear message; otherwise match_report_urls would be
    # left undefined and a later cell would die with a NameError.
    raise RuntimeError(
        "Schedule table 'sched_2023-2024_8_2' was not found in the response from " + url
    )

match_links = games_table.find_all('a', href=True)
# The set removes duplicate links; sorted() makes the order stable across
# runs. Only hrefs of the form /en/matches/<id>/<slug> (five '/'-separated
# parts) are kept.
match_report_urls = sorted({
    "https://fbref.com" + link['href']
    for link in match_links
    if link['href'].startswith("/en/matches/") and len(link['href'].split('/')) == 5
})

In [4]:
def extract_team_names(title):
    """Pull the "<team> vs. <team>" text out of a match-report page title.

    Args:
        title: Title string to search.

    Returns:
        A one-element tuple holding the text that contains " vs. " and
        directly precedes " Match Report", or None when the title does not
        have that form.
    """
    found = re.search(r"(.+? vs\. .+?) Match Report", title)
    return found.groups() if found else None

def scrape_date(table, page_soup=None):
    """Convert one player-stats ``<table>`` element into header and row lists.

    Args:
        table: BeautifulSoup tag for the table. It must have a ``<caption>``,
            a ``<thead>`` with at least two ``<tr>`` rows (the second one is
            used for the column names) and a ``<tbody>``.
        page_soup: Parsed page the table came from; its ``<title>`` supplies
            the "Game" value. When omitted, the notebook-level ``soup_data``
            is used so existing ``scrape_date(table)`` calls keep working.

    Returns:
        dict: ``{'headers': [...], 'data': [[...], ...]}``. ``headers`` starts
        with ``"Game"`` and ``"Team"`` followed by the table's column names.
        Every row in ``data`` starts with the "<team> vs. <team>" text from
        the page title and the caption with " Player Stats Table" removed,
        followed by the stripped text of each cell.

    Raises:
        IndexError: If the table header has fewer than two rows.
    """
    if page_soup is None:
        # Backward compatibility: fall back to the global set by the scraping loop.
        page_soup = soup_data

    title_tag = page_soup.title
    title_data = (title_tag.string if title_tag is not None else None) or ""
    team_name = extract_team_names(title_data)
    # A missing or unexpected title falls back to the raw title text instead
    # of raising TypeError on ``None[0]``.
    team_names = team_name[0] if team_name else title_data.strip()

    caption = table.find('caption').get_text()
    player_team = caption.replace(' Player Stats Table', '')

    # The second header row carries the column names.
    header_rows = table.thead.find_all('tr')
    headers = ["Game", "Team"] + [
        th.get_text().strip() for th in header_rows[1].find_all('th')
    ]

    # One output row per <tr> in the body, prefixed with the game and team labels.
    table_data = [
        [team_names, player_team]
        + [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
        for row in table.tbody.find_all('tr')
    ]

    return {'headers': headers, 'data': table_data}

In [5]:
# Scrape both teams' player-stats tables from every match report, then write
# everything to one CSV file.
MAX_ATTEMPTS = 5              # per-URL cap so a permanently broken page cannot loop forever
RETRY_DELAY_SECONDS = 2       # base pause before a retry; multiplied by the attempt number
REQUEST_TIMEOUT_SECONDS = 30  # keeps an unresponsive proxy from hanging the loop

headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
}

all_matches_data = []
failed_urls = []
for url in match_report_urls:
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Draw a proxy per attempt so a retry may go out through a different one.
            new_proxies, proxy = load_proxies_from_file(proxy_file, shuffle=True)
            print(url)
            response_data = session.get(
                url,
                headers=headers,
                proxies=new_proxies,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            response_data.raise_for_status()
            # scrape_date() reads the page title from the notebook-level name
            # ``soup_data``, so this assignment must stay at module scope.
            soup_data = BeautifulSoup(response_data.text, "html.parser")
            stats_tables = soup_data.find_all('table', {'class': 'stats_table sortable'})
            team_1_table = stats_tables[0]
            # NOTE(review): index 6 is assumed to be the second team's first
            # table - confirm against the current page layout.
            team_2_table = stats_tables[6]
            team_1_data = scrape_date(team_1_table)
            team_2_data = scrape_date(team_2_table)
            df_team_1 = pd.DataFrame(team_1_data['data'], columns=team_1_data['headers'])
            df_team_2 = pd.DataFrame(team_2_data['data'], columns=team_2_data['headers'])
            df_combined = pd.concat([df_team_1, df_team_2], ignore_index=True)
            all_matches_data.append(df_combined)
            break
        except (IndexError, requests.RequestException) as exc:
            # IndexError: the expected tables are missing from the page.
            # RequestException: proxy/network/HTTP failure.
            if attempt == MAX_ATTEMPTS:
                failed_urls.append(url)
                print(f"{type(exc).__name__} for URL {url}. Giving up after {MAX_ATTEMPTS} attempts.")
            else:
                print(f"{type(exc).__name__} for URL {url}. Retrying...")
                time.sleep(RETRY_DELAY_SECONDS * attempt)

if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(match_report_urls)} match reports; see failed_urls.")
if not all_matches_data:
    # pd.concat([]) would raise an unhelpful "No objects to concatenate".
    raise RuntimeError("No match data was scraped; nothing to write to championsleague.csv")
big_dataframe = pd.concat(all_matches_data, ignore_index=True)
big_dataframe.to_csv("championsleague.csv", sep=';', encoding='utf-8', index=False)

https://fbref.com/en/matches/d66f6ad1/Young-Boys-Manchester-City-October-25-2023-Champions-League
https://fbref.com/en/matches/4a4b95b7/FC-Copenhagen-Manchester-United-November-8-2023-Champions-League
https://fbref.com/en/matches/0c9b1473/Paris-Saint-Germain-Dortmund-September-19-2023-Champions-League
https://fbref.com/en/matches/c8120a02/Lazio-Feyenoord-November-7-2023-Champions-League
https://fbref.com/en/matches/92fc6a26/Feyenoord-Celtic-September-19-2023-Champions-League
https://fbref.com/en/matches/17d16c49/PSV-Eindhoven-Lens-November-8-2023-Champions-League
https://fbref.com/en/matches/d916f357/Young-Boys-RB-Leipzig-September-19-2023-Champions-League
https://fbref.com/en/matches/9544bc7f/Milan-Paris-Saint-Germain-November-7-2023-Champions-League
https://fbref.com/en/matches/2d34d54a/Benfica-Real-Sociedad-October-24-2023-Champions-League
https://fbref.com/en/matches/8c29e4f7/Bayern-Munich-Manchester-United-September-20-2023-Champions-League
https://fbref.com/en/matches/b30ab716/Un

In [6]:
# Show (rows, columns) of the combined frame as a quick sanity check.
big_dataframe.shape
# Alternative check: len(match_report_urls) gives the number of match-report URLs collected.

(1968, 33)