In [10]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import time
import json
import csv

In [11]:
def get_table_html(session, season, page, parser="html.parser", timeout=30):
    """Download one page of the Transfermarkt season transfer table and parse it.

    Parameters
    ----------
    session : object with a ``get(url, timeout=...)`` method
        Typically a ``requests.Session``; it is used as-is for the request.
    season : int or str
        Inserted into the URL as the ``saison_id`` segment.
    page : int or str
        Inserted into the URL as the ``page`` segment.
    parser : str, optional
        Parser name passed to BeautifulSoup. Naming it explicitly keeps the
        parse tree independent of which optional parsers happen to be
        installed and avoids bs4's "no parser was explicitly specified"
        warning. Defaults to the standard-library ``"html.parser"``.
    timeout : float or tuple, optional
        Seconds to wait for the server before the request is aborted, so a
        stalled connection cannot block the scrape forever.

    Returns
    -------
    BeautifulSoup
        Parsed document built from the response body. The HTTP status code
        is not inspected, so an error page is parsed like any other page.
    """
    url_season = f'https://www.transfermarkt.com/transfers/saisontransfers/statistik/top/ajax/yw0/saison_id/{season}/transferfenster/alle/land_id//ausrichtung//spielerposition_id//altersklasse//leihe//plus/13/galerie/0/page/{page}'

    response = session.get(url_season, timeout=timeout)

    return BeautifulSoup(response.text, parser)

In [12]:
def marktwert_numerical(marktwert):
    """Convert a money string into a number expressed in millions.

    The unit suffix is honoured, so thousands are no longer mistaken for
    millions:

    * ``"€25.00m"`` -> ``25.0``
    * ``"€500k"``   -> ``0.5``  (previously ``5.0``)
    * ``"€1.20bn"`` -> roughly ``1200.0``

    NOTE(review): the suffixes ``m``, ``k``, ``Th.`` and ``bn`` are assumed
    from transfermarkt.com's English formatting; verify against live pages.

    Parameters
    ----------
    marktwert : str
        Raw cell text, e.g. ``"€25.00m"``.

    Returns
    -------
    float or int
        Value in millions. ``0`` when the text contains no digits. Text with
        digits but no recognised suffix keeps the legacy interpretation
        "all digits / 100".
    """
    # A number (dot or comma as decimal separator) directly followed by a unit.
    amount = re.search(r'(\d+(?:[.,]\d+)?)\s*(bn|m|k|th)\b', marktwert.lower())
    if amount:
        number = float(amount.group(1).replace(',', '.'))
        unit = amount.group(2)
        if unit == 'bn':
            return number * 1000
        if unit in ('k', 'th'):
            return number / 1000
        return number

    # No unit found: fall back to the original digits-only behaviour so that
    # inputs the old implementation accepted still yield the same value.
    numerical_part = ''.join(filter(str.isdigit, marktwert))
    if numerical_part:
        return float(numerical_part) / 100
    return 0

In [26]:
def _fee_in_millions(text):
    """Return the amount in ``text`` (lower-cased) in millions, or None if no amount with a unit suffix is present."""
    amount = re.search(r'(\d+(?:[.,]\d+)?)\s*(bn|m|k|th)\b', text)
    if not amount:
        return None
    number = float(amount.group(1).replace(',', '.'))
    unit = amount.group(2)
    if unit == 'bn':
        return number * 1000
    if unit in ('k', 'th'):
        return number / 1000
    return number


def ablöse_wert(wert, details):
    """Parse a transfer-fee cell and record the transfer type in ``details``.

    Side effect: ``details["Transfer Typ"]`` is set to one of
    ``"Leihe"``, ``"Dauerhaft"``, ``"Ablösefrei"`` or ``"Unbekannt"``.

    Changes compared with the previous implementation:

    * The unit suffix is respected, so ``"€500k"`` yields ``0.5`` million
      instead of ``5.0``.
    * Any text containing "loan" is classified as ``"Leihe"``, including paid
      loans such as ``"Loan fee:€1.50m"`` (previously ``"Dauerhaft"``).
    * For loan texts, digits that are not a currency amount (for example a
      date inside the note) are no longer turned into a fee.

    NOTE(review): suffixes ``m``/``k``/``Th.``/``bn`` are assumed from
    transfermarkt.com's English formatting; verify against live pages.

    Parameters
    ----------
    wert : str
        Raw, already stripped text of the fee cell.
    details : dict
        Record of the current transfer; mutated in place (see above).

    Returns
    -------
    float or None
        Fee in millions, or ``None`` when the text carries no fee.
    """
    text = wert.lower()
    is_loan = 'loan' in text
    fee = _fee_in_millions(text)

    if fee is None and not is_loan:
        # Legacy fallback for digit-only input without a unit suffix.
        numerical_part = ''.join(filter(str.isdigit, wert))
        if numerical_part:
            fee = float(numerical_part) / 100

    if is_loan:
        details["Transfer Typ"] = "Leihe"
    elif fee is not None:
        details["Transfer Typ"] = "Dauerhaft"
    elif text == 'free transfer':
        details["Transfer Typ"] = "Ablösefrei"
    else:
        details["Transfer Typ"] = "Unbekannt"

    return fee

In [14]:
def scrape_season_page(session, season, page):
    """Scrape all transfers listed on one page of a season's transfer table.

    The page is fetched with ``get_table_html``; every element carrying the
    CSS class ``odd`` or ``even`` is treated as one transfer row. Rows are
    returned in document order (the previous version returned all ``odd``
    rows followed by all ``even`` rows).

    Parameters
    ----------
    session : requests.Session-like
        Passed through to ``get_table_html``.
    season : int or str
        Season identifier; its last two characters are stored as ``"Saison"``.
    page : int
        Page number of the table.

    Returns
    -------
    list of dict
        One dict per transfer with the keys ``Saison``, ``Spieler``,
        ``Position``, ``Alter``, ``Marktwert``, ``Nationalität``,
        ``Abgebender Verein``, ``Abgebende Liga``, ``Aufnehmender Verein``,
        ``Aufnehmende Liga``, ``Transfer Typ`` and ``Ablöse``.

    Raises
    ------
    IndexError, KeyError
        If a row does not have the expected cell layout outside the
        explicitly handled league fallbacks.
    """
    season_soup = get_table_html(session, season, page)
    # A single query with both classes keeps the rows in page order.
    rows = season_soup.find_all(class_=["odd", "even"])
    saison = str(season)[-2:]
    data = []

    for row in rows:
        # find_all is recursive, so this flat list also contains the cells of
        # the nested tables; the fixed indices below rely on that layout.
        cells = row.find_all("td")
        player_rows = cells[1].find_all("tr")
        from_rows = cells[8].find_all("tr")
        to_rows = cells[12].find_all("tr")

        to_club = to_rows[0].find_all("td")[1].text.strip()

        # Fresh dict per row so no value can leak from a previous transfer.
        transfer_details = {
            "Saison": saison,
            "Spieler": player_rows[0].text.strip(),
            "Position": player_rows[1].text.strip(),
            "Alter": cells[5].text.strip(),
            "Marktwert": marktwert_numerical(cells[6].text.strip()),
            "Nationalität": cells[7].find_all("img")[0]["title"].strip(),
            "Abgebender Verein": from_rows[0].find_all("td")[1].text.strip(),
        }

        # In case there is no selling club (no league row / no league link).
        try:
            transfer_details["Abgebende Liga"] = from_rows[1].find_all("a")[0].text.strip()
        except IndexError:
            transfer_details["Abgebende Liga"] = "Without Club"

        transfer_details["Aufnehmender Verein"] = to_club

        try:
            # Fails when the league is not rendered as a Transfermarkt link.
            receiving_league = to_rows[1].find_all("a")[0].text.strip()
        except IndexError:
            # Then use the flag's title instead (as Transfermarkt itself does).
            try:
                # Possible case: the player was banned, so there is no flag either.
                receiving_league = to_rows[1].find_all("img")[0]["title"].strip()
            except (IndexError, KeyError):
                # Fall back to the text of the club cell (the "ban" text).
                receiving_league = to_club
        transfer_details["Aufnehmende Liga"] = receiving_league

        # ablöse_wert also writes "Transfer Typ" into transfer_details.
        transfer_details["Ablöse"] = ablöse_wert(cells[16].text.strip(), transfer_details)

        data.append(transfer_details)

    return data

In [31]:
def scrape_transfers(scrape_seasons,
                     filename='C:\\Users\\soere\\Bachelorarbeit\\datasets\\transfers.csv',
                     max_pages=80,
                     delay=0):
    """Scrape the transfer tables of several seasons and write them to one CSV file.

    Parameters
    ----------
    scrape_seasons : iterable
        Season identifiers; each is passed to ``scrape_season_page``.
    filename : str, optional
        Path of the CSV file to (over)write. The default is the previously
        hard-coded location; pass a path to write elsewhere.
    max_pages : int, optional
        Number of table pages requested per season (pages ``1..max_pages``).
        Defaults to the previously hard-coded 80.
    delay : float, optional
        Seconds to sleep after each page request. ``0`` (default) keeps the
        original behaviour of requesting pages back to back.

    Returns
    -------
    None
        The result is the CSV file. It is written only after all pages have
        been scraped, so an exception during scraping leaves any existing
        file untouched.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    }

    season_dataset = []

    # The context manager closes the session's connections even if a page
    # request or the parsing raises.
    with requests.Session() as session:
        session.headers.update(headers)

        for season in scrape_seasons:
            print('scraping:', season, "page:")

            for page in range(1, max_pages + 1):
                # "\r" rewrites the same console line with the current page.
                print(page, end="\r", flush=True)

                season_dataset.extend(scrape_season_page(session, season, page))

                if delay:
                    time.sleep(delay)
            print("... done ...")

    fieldnames = ['Saison','Spieler','Position','Alter','Marktwert','Nationalität','Abgebender Verein','Abgebende Liga','Aufnehmender Verein','Aufnehmende Liga','Ablöse','Transfer Typ']

    with open(filename, 'w', newline='', encoding='utf8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(season_dataset)

    return None

In [32]:
# Run the scraper for the seasons 2018 through 2024 (inclusive).
scrape_transfers(list(range(2018, 2025)))

scraping: 2018 page:
... done ...
scraping: 2019 page:
... done ...
scraping: 2020 page:
... done ...
scraping: 2021 page:
... done ...
scraping: 2022 page:
... done ...
scraping: 2023 page:
... done ...
scraping: 2024 page:
... done ...
