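# Obtener datos de FBref

Este notebook descarga de [FBref](https://fbref.com) los registros de partidos (temporadas 2020–2024) de varios equipos de la Liga 1 y los guarda como CSV en la carpeta `data/`.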

In [104]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Borrador

In [36]:
# Draft target: Melgar's 2020 squad page, written with the same
# /en/squads/<id>/<season>/<Team>-Stats pattern used by the team cells.
url = 'https://fbref.com/en/squads/87ffd947/2020/Melgar-Stats'

In [37]:
# Start from the default requests headers and send a browser-like User-Agent
# instead of the library's own.
headers = requests.utils.default_headers()
headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0',
})
# The timeout keeps this cell from hanging forever if the server never answers.
r = requests.get(url, headers=headers, timeout=30)

In [38]:
# HTTP status code of the draft request.
r.status_code

200

In [39]:
# Parse the downloaded HTML with the 'html.parser' backend.
soup = BeautifulSoup(r.text, 'html.parser')

In [40]:
# Print the text of the page's <title> element as a quick sanity check.
print(soup.title.text)

2020 Melgar Stats, Liga 1 | FBref.com


In [41]:
# Column names: the stripped text of every <th> inside the <thead> of the
# element whose id is "matchlogs_for".
matchlog_head = soup.find(id='matchlogs_for').find('thead')
cols_name = [th.text.strip() for th in matchlog_head.find_all('th')]

In [42]:
# Display the extracted header names.
cols_name

['Date',
 'Time',
 'Comp',
 'Round',
 'Day',
 'Venue',
 'Result',
 'GF',
 'GA',
 'Opponent',
 'Poss',
 'Attendance',
 'Captain',
 'Formation',
 'Opp Formation',
 'Referee',
 'Match Report',
 'Notes']

## Obtener data

In [105]:
def _fetch_matchlog(url, headers, timeout):
    """Download one page and extract its ``matchlogs_for`` table.

    Returns ``(name, columns, rows)`` where ``name`` is the text captured from
    the page title by ``\\d+ (.+?) Stats`` (``None`` if there is no title or no
    match), ``columns`` is the list of ``<thead>`` header texts and ``rows`` is
    a list of cell-text lists, one per ``<tbody>`` row whose first cell is not
    empty. Returns ``None`` (after printing the reason) when the request
    fails, the response is not OK, or the table cannot be found.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'{url}: {exc}')
        return None
    if not response.ok:
        print(response.status_code)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find(id='matchlogs_for')
    head = table.find('thead') if table is not None else None
    body = table.find('tbody') if table is not None else None
    if head is None or body is None:
        print(f'{url}: table "matchlogs_for" not found')
        return None

    name = None
    if soup.title is not None:
        match = re.search(r'\d+ (.+?) Stats', soup.title.text)
        if match:
            name = match.group(1)

    columns = [th.text.strip() for th in head.find_all('th')]
    rows = []
    for tr in body.find_all('tr'):
        first_cell = tr.find('th')  # the first column is a <th>, the rest are <td>
        if first_cell is None:
            continue
        cells = [first_cell.text.strip()] + [td.text.strip() for td in tr.find_all('td')]
        if cells[0] != '':  # skip rows whose first cell is blank
            rows.append(cells)
    return name, columns, rows


def get_matches(urls, team, columns=None, out_dir='data', delay=5, timeout=30):
    """Scrape the ``matchlogs_for`` table of every page in ``urls`` into a CSV.

    Rows from all pages are accumulated into one table. After each page that
    is parsed successfully, the accumulated table is written to
    ``<out_dir>/<name> Stats.csv``; ``<name>`` comes from the page title and
    falls back to ``team`` when the title cannot be parsed. Pages that resolve
    to the same name therefore overwrite the same file with the growing table.

    Parameters
    ----------
    urls : iterable of str
        Page addresses to download, in order.
    team : str
        Fallback name for the output file.
    columns : list of str, optional
        Column names for the table. When omitted they are read from the
        ``<thead>`` of the first page that is parsed successfully, so the
        function no longer depends on a global defined in another cell.
    out_dir : str
        Directory for the CSV files; created if it does not exist.
    delay : float
        Seconds to sleep after every request.
    timeout : float
        Timeout, in seconds, passed to ``requests.get``.

    Returns
    -------
    None
        Results are only written to disk. Failed requests, non-OK status
        codes and pages without the table are reported with ``print`` and
        skipped.
    """
    urls = list(urls)
    if not urls:
        return None

    # The progress bar is cosmetic: fall back to plain iteration without tqdm.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0',
    })

    table_columns = list(columns) if columns is not None else None
    rows = []  # collected as plain lists; the DataFrame is built only when saving

    for url in progress(urls):
        page = _fetch_matchlog(url, headers, timeout)
        if page is not None:
            name, page_columns, page_rows = page
            if table_columns is None:
                table_columns = page_columns
            # A row with a different number of cells cannot be aligned with
            # the header, so it is reported and left out instead of raising.
            kept = [cells for cells in page_rows if len(cells) == len(table_columns)]
            if len(kept) != len(page_rows):
                print(f'{url}: skipped {len(page_rows) - len(kept)} row(s) with an unexpected number of cells')
            rows.extend(kept)

            os.makedirs(out_dir, exist_ok=True)
            file_name = os.path.join(out_dir, f'{name or team} Stats.csv')
            pd.DataFrame(rows, columns=table_columns).to_csv(file_name)
        # Sleep after every request (including the last) so that back-to-back
        # calls for different teams are also spaced out.
        time.sleep(delay)
    return None

### 01 Universitario

In [107]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/e4108102/{season}/Universitario-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/e4108102/2020/Universitario-Stats', 'https://fbref.com/en/squads/e4108102/2021/Universitario-Stats', 'https://fbref.com/en/squads/e4108102/2022/Universitario-Stats', 'https://fbref.com/en/squads/e4108102/2023/Universitario-Stats', 'https://fbref.com/en/squads/e4108102/2024/Universitario-Stats']


In [108]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Universitario')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.52s/it]


### 02 Melgar

In [109]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/87ffd947/{season}/Melgar-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/87ffd947/2020/Melgar-Stats', 'https://fbref.com/en/squads/87ffd947/2021/Melgar-Stats', 'https://fbref.com/en/squads/87ffd947/2022/Melgar-Stats', 'https://fbref.com/en/squads/87ffd947/2023/Melgar-Stats', 'https://fbref.com/en/squads/87ffd947/2024/Melgar-Stats']


In [110]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Melgar')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.47s/it]


### 03 Deportivo Garcilaso

In [111]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/4271353d/{season}/Deportivo-Garcilaso-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/4271353d/2020/Deportivo-Garcilaso-Stats', 'https://fbref.com/en/squads/4271353d/2021/Deportivo-Garcilaso-Stats', 'https://fbref.com/en/squads/4271353d/2022/Deportivo-Garcilaso-Stats', 'https://fbref.com/en/squads/4271353d/2023/Deportivo-Garcilaso-Stats', 'https://fbref.com/en/squads/4271353d/2024/Deportivo-Garcilaso-Stats']


In [112]:
# Download every page listed in `urls` and save the scraped match log as CSV.
# The label is spelled as in the URL slug (Deportivo-Garcilaso).
get_matches(urls, 'Garcilaso')

  0%|                                                        | 0/5 [00:00<?, ?it/s]

500


 20%|█████████▌                                      | 1/5 [00:05<00:22,  5.54s/it]

500


 40%|███████████████████▏                            | 2/5 [00:11<00:16,  5.50s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.49s/it]


### 04 Alianza

In [113]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/cdbccdc1/{season}/Alianza-Lima-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/cdbccdc1/2020/Alianza-Lima-Stats', 'https://fbref.com/en/squads/cdbccdc1/2021/Alianza-Lima-Stats', 'https://fbref.com/en/squads/cdbccdc1/2022/Alianza-Lima-Stats', 'https://fbref.com/en/squads/cdbccdc1/2023/Alianza-Lima-Stats', 'https://fbref.com/en/squads/cdbccdc1/2024/Alianza-Lima-Stats']


In [114]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Alianza')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.52s/it]


### 05 ADT Tarma

In [115]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/78987e91/{season}/ADT-Tarma-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/78987e91/2020/ADT-Tarma-Stats', 'https://fbref.com/en/squads/78987e91/2021/ADT-Tarma-Stats', 'https://fbref.com/en/squads/78987e91/2022/ADT-Tarma-Stats', 'https://fbref.com/en/squads/78987e91/2023/ADT-Tarma-Stats', 'https://fbref.com/en/squads/78987e91/2024/ADT-Tarma-Stats']


In [116]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'ADT Tarma')

  0%|                                                        | 0/5 [00:00<?, ?it/s]

500


 20%|█████████▌                                      | 1/5 [00:05<00:21,  5.48s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.49s/it]


### 06 Sporting Cristal

In [117]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/8917b8a9/{season}/Sporting-Cristal-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/8917b8a9/2020/Sporting-Cristal-Stats', 'https://fbref.com/en/squads/8917b8a9/2021/Sporting-Cristal-Stats', 'https://fbref.com/en/squads/8917b8a9/2022/Sporting-Cristal-Stats', 'https://fbref.com/en/squads/8917b8a9/2023/Sporting-Cristal-Stats', 'https://fbref.com/en/squads/8917b8a9/2024/Sporting-Cristal-Stats']


In [118]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Sporting Cristal')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.46s/it]


### 07 Sport Huancayo

In [119]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/d7ba2e36/{season}/Sport-Huancayo-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/d7ba2e36/2020/Sport-Huancayo-Stats', 'https://fbref.com/en/squads/d7ba2e36/2021/Sport-Huancayo-Stats', 'https://fbref.com/en/squads/d7ba2e36/2022/Sport-Huancayo-Stats', 'https://fbref.com/en/squads/d7ba2e36/2023/Sport-Huancayo-Stats', 'https://fbref.com/en/squads/d7ba2e36/2024/Sport-Huancayo-Stats']


In [120]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Sport Huancayo')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.50s/it]


### 08 Alianza Atlético

In [121]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/e71b53ba/{season}/Alianza-Atletico-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/e71b53ba/2020/Alianza-Atletico-Stats', 'https://fbref.com/en/squads/e71b53ba/2021/Alianza-Atletico-Stats', 'https://fbref.com/en/squads/e71b53ba/2022/Alianza-Atletico-Stats', 'https://fbref.com/en/squads/e71b53ba/2023/Alianza-Atletico-Stats', 'https://fbref.com/en/squads/e71b53ba/2024/Alianza-Atletico-Stats']


In [122]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Alianza Atlético')

  0%|                                                        | 0/5 [00:00<?, ?it/s]

500


100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.49s/it]


### 09 Atlético Grau

In [123]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/d1077778/{season}/Atletico-Grau-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/d1077778/2020/Atletico-Grau-Stats', 'https://fbref.com/en/squads/d1077778/2021/Atletico-Grau-Stats', 'https://fbref.com/en/squads/d1077778/2022/Atletico-Grau-Stats', 'https://fbref.com/en/squads/d1077778/2023/Atletico-Grau-Stats', 'https://fbref.com/en/squads/d1077778/2024/Atletico-Grau-Stats']


In [124]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Atlético Grau')

 20%|█████████▌                                      | 1/5 [00:06<00:25,  6.47s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.66s/it]


### 10 Cusco

In [125]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/d4f8af71/{season}/Cusco-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/d4f8af71/2020/Cusco-Stats', 'https://fbref.com/en/squads/d4f8af71/2021/Cusco-Stats', 'https://fbref.com/en/squads/d4f8af71/2022/Cusco-Stats', 'https://fbref.com/en/squads/d4f8af71/2023/Cusco-Stats', 'https://fbref.com/en/squads/d4f8af71/2024/Cusco-Stats']


In [126]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Cusco')

 40%|███████████████████▏                            | 2/5 [00:10<00:16,  5.48s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.47s/it]


### 11 Sport Boys

In [127]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/0c2512a2/{season}/Sport-Boys-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/0c2512a2/2020/Sport-Boys-Stats', 'https://fbref.com/en/squads/0c2512a2/2021/Sport-Boys-Stats', 'https://fbref.com/en/squads/0c2512a2/2022/Sport-Boys-Stats', 'https://fbref.com/en/squads/0c2512a2/2023/Sport-Boys-Stats', 'https://fbref.com/en/squads/0c2512a2/2024/Sport-Boys-Stats']


In [128]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Sport Boys')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.47s/it]


### 12 CDC Santa Rosa

In [129]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/56762359/{season}/CDC-Santa-Rosa-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/56762359/2020/CDC-Santa-Rosa-Stats', 'https://fbref.com/en/squads/56762359/2021/CDC-Santa-Rosa-Stats', 'https://fbref.com/en/squads/56762359/2022/CDC-Santa-Rosa-Stats', 'https://fbref.com/en/squads/56762359/2023/CDC-Santa-Rosa-Stats', 'https://fbref.com/en/squads/56762359/2024/CDC-Santa-Rosa-Stats']


In [130]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'CDC Santa Rosa')

  0%|                                                        | 0/5 [00:00<?, ?it/s]

500


 20%|█████████▌                                      | 1/5 [00:07<00:31,  7.92s/it]

500


 40%|███████████████████▏                            | 2/5 [00:13<00:19,  6.51s/it]

500


 60%|████████████████████████████▊                   | 3/5 [00:18<00:12,  6.05s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.97s/it]


### 13 Binacional

In [131]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/e5c4db74/{season}/Binacional-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/e5c4db74/2020/Binacional-Stats', 'https://fbref.com/en/squads/e5c4db74/2021/Binacional-Stats', 'https://fbref.com/en/squads/e5c4db74/2022/Binacional-Stats', 'https://fbref.com/en/squads/e5c4db74/2023/Binacional-Stats', 'https://fbref.com/en/squads/e5c4db74/2024/Binacional-Stats']


In [132]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Binacional')

 80%|██████████████████████████████████████▍         | 4/5 [00:24<00:05,  5.86s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.95s/it]


### 14 Cienciano

In [133]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/adf57493/{season}/Cienciano-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/adf57493/2020/Cienciano-Stats', 'https://fbref.com/en/squads/adf57493/2021/Cienciano-Stats', 'https://fbref.com/en/squads/adf57493/2022/Cienciano-Stats', 'https://fbref.com/en/squads/adf57493/2023/Cienciano-Stats', 'https://fbref.com/en/squads/adf57493/2024/Cienciano-Stats']


In [134]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Cienciano')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.45s/it]


### 15 Ayacucho

In [135]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/e6b8138d/{season}/Ayacucho-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/e6b8138d/2020/Ayacucho-Stats', 'https://fbref.com/en/squads/e6b8138d/2021/Ayacucho-Stats', 'https://fbref.com/en/squads/e6b8138d/2022/Ayacucho-Stats', 'https://fbref.com/en/squads/e6b8138d/2023/Ayacucho-Stats', 'https://fbref.com/en/squads/e6b8138d/2024/Ayacucho-Stats']


In [136]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Ayacucho')

 60%|████████████████████████████▊                   | 3/5 [00:18<00:12,  6.10s/it]

500


 80%|██████████████████████████████████████▍         | 4/5 [00:24<00:05,  5.86s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.95s/it]


### 16 Comerciantes Unidos

In [137]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/59cf5fee/{season}/Comerciantes-Unidos-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/59cf5fee/2020/Comerciantes-Unidos-Stats', 'https://fbref.com/en/squads/59cf5fee/2021/Comerciantes-Unidos-Stats', 'https://fbref.com/en/squads/59cf5fee/2022/Comerciantes-Unidos-Stats', 'https://fbref.com/en/squads/59cf5fee/2023/Comerciantes-Unidos-Stats', 'https://fbref.com/en/squads/59cf5fee/2024/Comerciantes-Unidos-Stats']


In [138]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Comerciantes Unidos')

  0%|                                                        | 0/5 [00:00<?, ?it/s]

500


 20%|█████████▌                                      | 1/5 [00:05<00:22,  5.51s/it]

500


 40%|███████████████████▏                            | 2/5 [00:11<00:16,  5.52s/it]

500


 60%|████████████████████████████▊                   | 3/5 [00:16<00:11,  5.50s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.50s/it]


### 17 Juan Pablo II

_TODO: sección pendiente; este notebook aún no contiene las celdas de descarga para este equipo._

### 18 Alianza Universidad

In [141]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/b9ca1839/{season}/Alianza-Universidad-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/b9ca1839/2020/Alianza-Universidad-Stats', 'https://fbref.com/en/squads/b9ca1839/2021/Alianza-Universidad-Stats', 'https://fbref.com/en/squads/b9ca1839/2022/Alianza-Universidad-Stats', 'https://fbref.com/en/squads/b9ca1839/2023/Alianza-Universidad-Stats', 'https://fbref.com/en/squads/b9ca1839/2024/Alianza-Universidad-Stats']


In [142]:
# Download every page listed in `urls` and save the scraped match log as CSV.
get_matches(urls, 'Alianza Universidad')

 40%|███████████████████▏                            | 2/5 [00:11<00:16,  5.50s/it]

500


 60%|████████████████████████████▊                   | 3/5 [00:16<00:10,  5.49s/it]

500


 80%|██████████████████████████████████████▍         | 4/5 [00:23<00:05,  5.89s/it]

500


100%|████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.71s/it]


### 19 UTC

In [143]:
# One squad page per season, 2020 through 2024.
urls = [
    f'https://fbref.com/en/squads/afccbca8/{season}/Universidad-Tecnica-de-Cajamarca-Stats'
    for season in range(2020, 2025)
]
print(urls)

['https://fbref.com/en/squads/afccbca8/2020/Universidad-Tecnica-de-Cajamarca-Stats', 'https://fbref.com/en/squads/afccbca8/2021/Universidad-Tecnica-de-Cajamarca-Stats', 'https://fbref.com/en/squads/afccbca8/2022/Universidad-Tecnica-de-Cajamarca-Stats', 'https://fbref.com/en/squads/afccbca8/2023/Universidad-Tecnica-de-Cajamarca-Stats', 'https://fbref.com/en/squads/afccbca8/2024/Universidad-Tecnica-de-Cajamarca-Stats']


In [144]:
get_matches(urls, 'UTC')

100%|████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.44s/it]
