# Scraping

### Scraping Barcelona matches 2023/2024 from fbref

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [3]:
def _cell_text(row, tag, stat):
    """Return the stripped text of the `tag` cell whose `data-stat` is `stat`, or '' if absent."""
    cell = row.find(tag, {'data-stat': stat})
    return cell.text.strip() if cell is not None else ''


def scrape_barcelona_matches():
    """Scrape Barcelona's 2023-2024 all-competitions match log from fbref.

    Downloads the schedule page, locates the table with id ``matchlogs_for``
    and turns each match row into one record.

    Returns:
        A ``pandas.DataFrame`` with one row per match and string-valued
        columns (``Match_date``, ``Time``, ``Competition``, ... ``Referee``),
        or ``None`` when the request fails, the server does not answer with
        HTTP 200, or the table is not present on the page. In those cases a
        message is printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
    }
    # fbref site url
    url = 'https://fbref.com/en/squads/206d90db/2023-2024/matchlogs/all_comps/schedule/Barcelona-Scores-and-Fixtures-All-Competitions'

    # Output column -> `data-stat` attribute of the <td> that holds the value.
    # 'Match_date' (a <th>) and 'Time' (needs extra trimming) are handled apart.
    td_columns = {
        'Competition': 'comp',
        'Round': 'round',
        'Day': 'dayofweek',
        'Venue': 'venue',
        'Result': 'result',
        'Goals_for': 'goals_for',
        'Goals_against': 'goals_against',
        'Opponent': 'opponent',
        'Xg': 'xg_for',
        'Xga': 'xg_against',
        'Possession': 'possession',
        'Attendance': 'attendance',
        'Captain': 'captain',
        'Formation': 'formation',
        'Opponent_formation': 'opp_formation',
        'Referee': 'referee',
    }

    try:
        # Pause before issuing the request (presumably to stay polite
        # towards fbref's rate limiting).
        time.sleep(3)
        # A timeout prevents the call from hanging forever on a stalled
        # connection.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"An error occurred: {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'matchlogs_for'})
    if table is None:
        print("Could not find the matches table on the page")
        return None

    # Fall back to the table itself if the markup carries no explicit <tbody>.
    body = table.find('tbody')
    if body is None:
        body = table

    matches_data = []
    for row in body.find_all('tr'):
        # Rows without any <td> (e.g. repeated header or spacer rows) are not
        # matches; skipping them avoids aborting the whole scrape.
        if row.find('td') is None:
            continue

        match_data = {
            'Match_date': _cell_text(row, 'th', 'date'),
            # Keep only the part before any parenthesised suffix.
            'Time': _cell_text(row, 'td', 'start_time').split('(')[0].strip(),
        }
        for column, stat in td_columns.items():
            match_data[column] = _cell_text(row, 'td', stat)
        matches_data.append(match_data)

    # Passing the column list keeps the schema stable even with zero rows.
    return pd.DataFrame(matches_data, columns=['Match_date', 'Time', *td_columns])


In [5]:
# Run the scraper; it returns None (after printing a message) when the page
# could not be fetched or parsed, so fail with a clear error instead of an
# opaque AttributeError on `.head()`.
df_match_barcelone = scrape_barcelona_matches()
if df_match_barcelone is None:
    raise RuntimeError("Scraping failed: no match data was returned")
df_match_barcelone.head()

Unnamed: 0,Match_date,Time,Competition,Round,Day,Venue,Result,Goals_for,Goals_against,Opponent,Xg,Xga,Possession,Attendance,Captain,Formation,Opponent_formation,Referee
0,2023-08-13,21:30,La Liga,Matchweek 1,Sun,Away,D,0,0,Getafe,1.4,0.6,74,13410,Marc-André ter Stegen,4-2-3-1,4-5-1,César Soto
1,2023-08-20,19:30,La Liga,Matchweek 2,Sun,Home,W,2,0,Cádiz,3.4,1.1,74,39603,Marc-André ter Stegen,3-4-3,4-4-2,Alejandro Muñíz
2,2023-08-27,17:30,La Liga,Matchweek 3,Sun,Away,W,4,3,Villarreal,3.1,1.9,60,21679,Sergi Roberto,4-3-3,4-4-2,Alejandro Hernández
3,2023-09-03,21:00,La Liga,Matchweek 4,Sun,Away,W,2,1,Osasuna,2.0,0.8,69,21966,Sergi Roberto,4-3-3,4-1-4-1,Miguel Ángel Ortiz Arias
4,2023-09-16,21:00,La Liga,Matchweek 5,Sat,Home,W,5,0,Betis,2.4,0.8,65,45055,Marc-André ter Stegen,4-3-3,4-2-3-1,José Sánchez


### Transforming date format from YYYY-mm-dd to dd/mm/YYYY

In [6]:
# Parse the scraped date strings, then render them back as dd/mm/YYYY text.
match_dates = pd.to_datetime(df_match_barcelone['Match_date'])
df_match_barcelone['Match_date'] = match_dates.dt.strftime("%d/%m/%Y")

In [7]:
# Preview the first five rows after the date reformatting.
df_match_barcelone.head(5)

Unnamed: 0,Match_date,Time,Competition,Round,Day,Venue,Result,Goals_for,Goals_against,Opponent,Xg,Xga,Possession,Attendance,Captain,Formation,Opponent_formation,Referee
0,13/08/2023,21:30,La Liga,Matchweek 1,Sun,Away,D,0,0,Getafe,1.4,0.6,74,13410,Marc-André ter Stegen,4-2-3-1,4-5-1,César Soto
1,20/08/2023,19:30,La Liga,Matchweek 2,Sun,Home,W,2,0,Cádiz,3.4,1.1,74,39603,Marc-André ter Stegen,3-4-3,4-4-2,Alejandro Muñíz
2,27/08/2023,17:30,La Liga,Matchweek 3,Sun,Away,W,4,3,Villarreal,3.1,1.9,60,21679,Sergi Roberto,4-3-3,4-4-2,Alejandro Hernández
3,03/09/2023,21:00,La Liga,Matchweek 4,Sun,Away,W,2,1,Osasuna,2.0,0.8,69,21966,Sergi Roberto,4-3-3,4-1-4-1,Miguel Ángel Ortiz Arias
4,16/09/2023,21:00,La Liga,Matchweek 5,Sat,Home,W,5,0,Betis,2.4,0.8,65,45055,Marc-André ter Stegen,4-3-3,4-2-3-1,José Sánchez


### Splitting the Captain column into Captain_firstname and Captain_lastname columns

In [9]:
# Split each captain name once, at the first space: the first token is the
# first name and everything after it is the last name. For a single-token
# name both columns receive that token (index -1 of a one-element list is the
# token itself). The previous lambda called `.str` on a plain Python list and
# raised AttributeError for such names.
captain_parts = df_match_barcelone['Captain'].str.split(' ', n=1)
df_match_barcelone['Captain_firstname'] = captain_parts.str[0]
df_match_barcelone['Captain_lastname'] = captain_parts.str[-1]
del df_match_barcelone["Captain"]

In [10]:
# Preview the first five rows after splitting the captain name.
df_match_barcelone.head(5)

Unnamed: 0,Match_date,Time,Competition,Round,Day,Venue,Result,Goals_for,Goals_against,Opponent,Xg,Xga,Possession,Attendance,Formation,Opponent_formation,Referee,Captain_firstname,Captain_lastname
0,13/08/2023,21:30,La Liga,Matchweek 1,Sun,Away,D,0,0,Getafe,1.4,0.6,74,13410,4-2-3-1,4-5-1,César Soto,Marc-André,ter Stegen
1,20/08/2023,19:30,La Liga,Matchweek 2,Sun,Home,W,2,0,Cádiz,3.4,1.1,74,39603,3-4-3,4-4-2,Alejandro Muñíz,Marc-André,ter Stegen
2,27/08/2023,17:30,La Liga,Matchweek 3,Sun,Away,W,4,3,Villarreal,3.1,1.9,60,21679,4-3-3,4-4-2,Alejandro Hernández,Sergi,Roberto
3,03/09/2023,21:00,La Liga,Matchweek 4,Sun,Away,W,2,1,Osasuna,2.0,0.8,69,21966,4-3-3,4-1-4-1,Miguel Ángel Ortiz Arias,Sergi,Roberto
4,16/09/2023,21:00,La Liga,Matchweek 5,Sat,Home,W,5,0,Betis,2.4,0.8,65,45055,4-3-3,4-2-3-1,José Sánchez,Marc-André,ter Stegen


### Organising columns



In [12]:
# Final column layout. Columns not listed here (Xg and Xga) are dropped by
# the selection below.
new_order = [
    # When and against whom
    "Match_date", "Time", "Opponent",
    # Competition context
    "Competition", "Round", "Day", "Venue",
    # Outcome
    "Result", "Goals_for", "Goals_against",
    # Captain
    "Captain_firstname", "Captain_lastname",
    # Match details
    "Possession", "Formation", "Opponent_formation", "Attendance", "Referee",
]

df_match_barcelone = df_match_barcelone.loc[:, new_order]
df_match_barcelone.head()

Unnamed: 0,Match_date,Time,Opponent,Competition,Round,Day,Venue,Result,Goals_for,Goals_against,Captain_firstname,Captain_lastname,Possession,Formation,Opponent_formation,Attendance,Referee
0,13/08/2023,21:30,Getafe,La Liga,Matchweek 1,Sun,Away,D,0,0,Marc-André,ter Stegen,74,4-2-3-1,4-5-1,13410,César Soto
1,20/08/2023,19:30,Cádiz,La Liga,Matchweek 2,Sun,Home,W,2,0,Marc-André,ter Stegen,74,3-4-3,4-4-2,39603,Alejandro Muñíz
2,27/08/2023,17:30,Villarreal,La Liga,Matchweek 3,Sun,Away,W,4,3,Sergi,Roberto,60,4-3-3,4-4-2,21679,Alejandro Hernández
3,03/09/2023,21:00,Osasuna,La Liga,Matchweek 4,Sun,Away,W,2,1,Sergi,Roberto,69,4-3-3,4-1-4-1,21966,Miguel Ángel Ortiz Arias
4,16/09/2023,21:00,Betis,La Liga,Matchweek 5,Sat,Home,W,5,0,Marc-André,ter Stegen,65,4-3-3,4-2-3-1,45055,José Sánchez


In [13]:
# Persist the cleaned match table to disk, leaving out the positional index.
df_match_barcelone.to_csv(path_or_buf="match_barcelone.csv", index=False)