# Scraping

## Scraping the FC Barcelona 2024 squad from Transfermarkt

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# HTTP headers sent with every request: a desktop-browser User-Agent string
# and a browser-style Accept header.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,*/*;q=0.8'
    ),
}

In [3]:
# Squad ("kader") page on transfermarkt.fr for the club with id 131.
# NOTE(review): saison_id=2023 presumably selects the 2023/24 season - confirm
# this is the season meant by "Barcelona 2024".
url_barca = 'https://www.transfermarkt.fr/fc-barcelone/kader/verein/131/plus/0/galerie/0?saison_id=2023'
# Send a request to the URL. requests has no default timeout, so one is set
# explicitly to keep an unresponsive server from blocking the kernel forever.
response_barca = requests.get(url_barca, headers=headers, timeout=30)

In [8]:
def _parse_age(age_text):
    """Return the age found in the text of a player's age cell.

    Accepts a bare number ("32") as well as text that carries the age in
    parentheses (for example a birth date followed by "(32)").

    Returns:
        int: the age, or the string "Age not found" when the cell is empty.

    Raises:
        ValueError: if the text is neither empty, numeric, nor contains a
            number in parentheses.
    """
    age_text = age_text.strip()
    in_parentheses = re.search(r'\((\d+)\)', age_text)
    if in_parentheses:
        return int(in_parentheses.group(1))
    return int(age_text) if age_text else "Age not found"


def _parse_market_value(raw_value):
    """Convert a market-value label into a float amount.

    "mio" labels are multiplied by 1,000,000 and "K" labels by 1,000; any
    other non-empty label is parsed as a plain number.

    Returns:
        float | None: the amount, or None when the label is empty or "-".

    Raises:
        ValueError: if the numeric part of the label cannot be parsed.
    """
    value = raw_value.strip().replace(",", ".")  # decimal comma -> decimal point
    if value in ("", "-"):
        return None  # no market value listed
    if "mio" in value:
        return float(value.split()[0]) * 1_000_000  # "mio." = millions
    if "K" in value:
        return float(value.split()[0]) * 1_000  # "K" = thousands
    return float(value)  # plain number without a unit


# Scrape one record per player row from the squad page fetched earlier.
data_barca = []
if response_barca.status_code == 200:
    soup = BeautifulSoup(response_barca.content, 'html.parser')

    # Player rows carry the class 'odd' or 'even'. The two result lists are
    # concatenated, so all 'odd' rows come first, followed by all 'even' rows.
    joueurs = soup.find_all('tr', class_='odd') + soup.find_all('tr', class_='even')

    for joueur in joueurs:
        # Extract player name
        name = joueur.find('td', class_='hauptlink').find('a').text.strip()

        # Extract player ID: last path segment of the profile link. Fall back
        # to any link whose href contains '/profil/spieler/'.
        id_player_element = joueur.find('a', class_='spielprofil_tooltip')
        if not (id_player_element and 'href' in id_player_element.attrs):
            id_player_element = joueur.find('a', href=lambda x: x and '/profil/spieler/' in x)
        id_player = id_player_element['href'].split('/')[-1] if id_player_element else "ID non trouvé"

        # Extract player age from the second centred ('zentriert') cell
        age = _parse_age(joueur.find_all('td', class_='zentriert')[1].text)

        # Extract player market value as a number (None when not listed)
        value_cell = joueur.find('td', class_='rechts hauptlink')
        market_value = _parse_market_value(value_cell.text) if value_cell else None

        # Extract player position from a 'title' attribute or the nested table
        position = 'Position not found'
        title_position_td = joueur.find('td', class_='zentriert', title=True)
        if title_position_td:
            position = title_position_td['title']
        else:
            # Fallback to nested table in 'posrela' cell
            posrela_cell = joueur.find('td', class_='posrela')
            nested_table = posrela_cell.find('table', class_='inline-table') if posrela_cell else None
            position_cell = nested_table.find('td') if nested_table else None
            if position_cell:
                position = position_cell.text.strip()

        # Extract nationalities from the flag images. Both values are reset for
        # every player so that a row without flags can neither raise NameError
        # nor inherit the previous player's second nationality.
        nationality = 'Nationality not found'
        second_nationality = None
        for cell in joueur.find_all('td', class_='zentriert'):
            flags = cell.find_all('img', class_='flaggenrahmen')
            if flags:
                nationality = flags[0]['alt']
                second_nationality = flags[1]['alt'] if len(flags) > 1 else None

        # Append player data to list
        data_barca.append({
            'ID': id_player,
            'Name': name,
            'Age': age,
            'Market_value': market_value,
            'Position': position,
            'Nationality': nationality,
            'Sec_Nationality_Origine': second_nationality
        })
else:
    # Make a failed request visible instead of silently producing an empty table.
    print(f"Request failed with HTTP status {response_barca.status_code}; no players were scraped.")

# Convert the data to a DataFrame
df_barca = pd.DataFrame(data_barca)

In [9]:
# Display the scraped squad table to check the result of the scraping cell
df_barca

Unnamed: 0,ID,Name,Age,Market_value,Position,Nationality,Sec_Nationality_Origine
0,74857,Marc-André ter Stegen,32,28000000.0,Gardien,Allemagne,
1,709955,Ander Astralaga,20,1000000.0,Gardien,Espagne,
2,938145,Áron Yaakobishvili,18,,Gardien,Hongrie,Géorgie
3,196948,Andreas Christensen,28,40000000.0,Défense,Danemark,
4,466794,Eric García,23,20000000.0,Défense,Espagne,
5,158863,Iñigo Martínez,33,5000000.0,Défense,Espagne,
6,112515,Marcos Alonso,33,2000000.0,Défense,Espagne,
7,182712,João Cancelo,30,25000000.0,Défense,Portugal,
8,636695,Marc Casadó,20,2500000.0,Milieu,Espagne,
9,937956,Pau Prim,18,500000.0,Milieu,Espagne,


### Splitting player names

In [10]:
# Split 'Name' into 'Firstname' (text before the first space) and 'Lastname'
# (everything after it). A single-word name is used for both columns.
# The guard makes the cell safe to re-run: once 'Name' has been replaced,
# running it again is a no-op instead of a KeyError.
if "Name" in df_barca.columns:
    # partition(' ') yields three columns: before, separator, after
    name_parts = df_barca["Name"].str.partition(" ")
    has_lastname = name_parts[1] != ""  # empty separator => no space in the name
    df_barca = df_barca.assign(
        Firstname=name_parts[0],
        Lastname=name_parts[2].where(has_lastname, name_parts[0]),
    ).drop(columns="Name")

In [11]:
# Preview the first rows to check the Firstname / Lastname columns
df_barca.head()

Unnamed: 0,ID,Age,Market_value,Position,Nationality,Sec_Nationality_Origine,Firstname,Lastname
0,74857,32,28000000.0,Gardien,Allemagne,,Marc-André,ter Stegen
1,709955,20,1000000.0,Gardien,Espagne,,Ander,Astralaga
2,938145,18,,Gardien,Hongrie,Géorgie,Áron,Yaakobishvili
3,196948,28,40000000.0,Défense,Danemark,,Andreas,Christensen
4,466794,23,20000000.0,Défense,Espagne,,Eric,García


### Reorganising columns

In [13]:
# Target column order: identifier first, then the name parts, then the
# remaining attributes.
new_ordre = [
    "ID",
    "Firstname",
    "Lastname",
    "Age",
    "Market_value",
    "Position",
    "Nationality",
    "Sec_Nationality_Origine",
]

# Select every row with the columns in the order listed above.
df_barca = df_barca.loc[:, new_ordre]

In [14]:
# Preview the first rows to confirm the column order
df_barca.head()

Unnamed: 0,ID,Firstname,Lastname,Age,Market_value,Position,Nationality,Sec_Nationality_Origine
0,74857,Marc-André,ter Stegen,32,28000000.0,Gardien,Allemagne,
1,709955,Ander,Astralaga,20,1000000.0,Gardien,Espagne,
2,938145,Áron,Yaakobishvili,18,,Gardien,Hongrie,Géorgie
3,196948,Andreas,Christensen,28,40000000.0,Défense,Danemark,
4,466794,Eric,García,23,20000000.0,Défense,Espagne,


In [15]:
# Write the table to players_barca.csv; index=False leaves the DataFrame index out of the file
df_barca.to_csv("players_barca.csv", index=False)