# Setup

In [114]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
response = requests.get("https://liaison.reuters.com/tools/sports-team-codes")

In [5]:
response.status_code

200

In [None]:
response.text

In [12]:
soup = BeautifulSoup(response.text, 'html.parser')

# Explore data on HTML components

In [103]:
leagues_tags = soup.find_all(class_='table-collection')[1].find_all('a',class_="toggle-header")
leagues = [tag.get_text() for tag in leagues_tags]
leagues[:2]

['2022-23 African Champions League', '2022-23 Asian Champions League']

In [101]:
soup.find_all(class_='table-collection')[1].find_all('a')

[<a class="toggle-section-header show">Show All</a>,
 <a class="toggle-header collapsed" data-target="#202223AfricanChampionsLeague" data-toggle="collapse"><span class="arrow"></span>2022-23 African Champions League</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:TRI" target="_blank">TRI</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:ALY" target="_blank">ALY</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:AHI" target="_blank">AHI</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:AIC" target="_blank">AIC</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:ALM" target="_blank">ALM</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:APW" target="_blank">APW</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:AS7" target="_blank">AS7</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:OTO" target="_blank">OTO</a>,
 <a href="https://www.reutersconnect.com/all?search=slug:MDJ" target="_blank">MDJ</

# Fetch data

In [111]:
data_ls = []
current_league = None
for tag in soup.find_all(class_='table-collection')[1].find_all('a'):
    if tag.get('class') == None:
        code = tag.get_text()
        club = tag.find_next().get_text()
        data_ls.append( (current_league, code, club) )
    elif "toggle-header" in tag.get('class'):
        current_league = tag.get_text()

In [112]:
data_ls

[('2022-23 African Champions League', 'TRI', 'Al Ahli Tripoli (LBY)'),
 ('2022-23 African Champions League', 'ALY', 'Al Ahly (EGY)'),
 ('2022-23 African Champions League', 'AHI', 'Al Hilal (SDN)'),
 ('2022-23 African Champions League', 'AIC', 'Al Ittihad (LBY)'),
 ('2022-23 African Champions League', 'ALM', 'Al Merrikh (SDN)'),
 ('2022-23 African Champions League', 'APW', 'APR (RWA)'),
 ('2022-23 African Champions League', 'AS7', 'Arta Solar 7 (DJI)'),
 ('2022-23 African Champions League', 'OTO', 'AS Otoho (COG)'),
 ('2022-23 African Champions League', 'MDJ', 'AS Stade Mandji (GAB)'),
 ('2022-23 African Champions League', 'AVC', 'AS Vita Club (COD)'),
 ('2022-23 African Champions League', 'ASK', 'Asante Kotoko (GHA)'),
 ('2022-23 African Champions League', 'MIM', 'ASEC Mimosas (CIV)'),
 ('2022-23 African Champions League', 'KOZ', 'ASKO Kara (TGO)'),
 ('2022-23 African Champions League', 'ASN', 'ASN Nigelec (NER)'),
 ('2022-23 African Champions League', 'ABB', 'Black Bulls (MOZ)'),
 ('2

In [118]:
columns = ['league','code','club']
df_clubs = pd.DataFrame(data_ls,columns=columns)
df_clubs

Unnamed: 0,league,code,club
0,2022-23 African Champions League,TRI,Al Ahli Tripoli (LBY)
1,2022-23 African Champions League,ALY,Al Ahly (EGY)
2,2022-23 African Champions League,AHI,Al Hilal (SDN)
3,2022-23 African Champions League,AIC,Al Ittihad (LBY)
4,2022-23 African Champions League,ALM,Al Merrikh (SDN)
...,...,...,...
786,Turkish Super League (2022-23),KSK,Kasimpasa
787,Turkish Super League (2022-23),KON,Kayserispor
788,Turkish Super League (2022-23),SIV,Konyaspor
789,Turkish Super League (2022-23),TRA,Sivasspor


# Cleaning

In [125]:
list(df_clubs['league'].unique())

['2022-23 African Champions League',
 '2022-23 Asian Champions League',
 '2023 CONCACAF Champions league',
 '2023 Copa Libertadores',
 'Argentine Premier League (2023)',
 'Austrian Bundesliga (2022-23)',
 'Belgian First Division (2022-23)',
 'Brasileiro Serie A (2023)',
 'Champions League (2022-23) - GROUP STAGE ',
 'Chinese Super League (2023)',
 'Danish Superliga (2022-23)',
 'Dutch Eredivisie (2022-23)',
 'English Championship (2022-2023)',
 'English League One (2022-23)',
 'English League Two (2022-23)',
 'English National League (2022-23)',
 'English Premier League (2022-23)',
 'Europa Conference League (2022-23) - GROUP STAGE',
 'Europa League (2022-23) - GROUP STAGE',
 'French Ligue 1 (2022-23)',
 'German Bundesliga (2022-23)',
 'German Bundesliga 2 (2022-23)',
 'Greek Super League (2022-23)',
 'Italian Serie A (2022-23)',
 'J-League (2023)',
 'Major League Soccer (2023)',
 'Mexican Liga MX (2022-2023)',
 'Portuguese Primeira Liga (2022-23)',
 'Russian Premier League (2022-23)',

In [126]:
eu_leagues = [
    'Austrian Bundesliga (2022-23)',
    'Danish Superliga (2022-23)',
    'Dutch Eredivisie (2022-23)',
    'English Championship (2022-2023)',
    'English League One (2022-23)',
    'English League Two (2022-23)',
    'English National League (2022-23)',
    'English Premier League (2022-23)',
    'French Ligue 1 (2022-23)',
    'German Bundesliga (2022-23)',
    'German Bundesliga 2 (2022-23)',
    'Greek Super League (2022-23)',
    'Italian Serie A (2022-23)',
    'Portuguese Primeira Liga (2022-23)',
    'Russian Premier League (2022-23)',
    'Scottish Premiership (2022-23)',
    'Spanish LaLiga (2022-23)',
    'Swedish First Division (2023)',
    'Swiss Super League (2022-23)',
    'Turkish Super League (2022-23)',
    "Major League Soccer (2023)",
    'Argentine Premier League (2023)',
]

In [132]:
df_eu_clubs = df_clubs[df_clubs.league.apply(lambda x: x in eu_leagues)]
df_eu_clubs

Unnamed: 0,league,code,club
197,Austrian Bundesliga (2022-23),AKL,Austria Klagenfurt
198,Austrian Bundesliga (2022-23),ALU,Austria Lustenau
199,Austrian Bundesliga (2022-23),VIE,Austria Vienna
200,Austrian Bundesliga (2022-23),LIN,LASK Linz
201,Austrian Bundesliga (2022-23),RAV,Rapid Vienna
...,...,...,...
786,Turkish Super League (2022-23),KSK,Kasimpasa
787,Turkish Super League (2022-23),KON,Kayserispor
788,Turkish Super League (2022-23),SIV,Konyaspor
789,Turkish Super League (2022-23),TRA,Sivasspor


In [156]:
df_eu_clubs['league'] = df_eu_clubs['league'].apply(lambda x: x.split('(')[0].strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eu_clubs['league'] = df_eu_clubs['league'].apply(lambda x: x.split('(')[0].strip())


In [157]:
df_eu_clubs.league.unique()

array(['Austrian Bundesliga', 'Danish Superliga', 'Dutch Eredivisie',
       'English Championship', 'English League One', 'English League Two',
       'English National League', 'English Premier League',
       'French Ligue 1', 'German Bundesliga', 'German Bundesliga 2',
       'Greek Super League', 'Italian Serie A',
       'Portuguese Primeira Liga', 'Russian Premier League',
       'Scottish Premiership', 'Spanish LaLiga', 'Swedish First Division',
       'Swiss Super League', 'Turkish Super League'], dtype=object)

In [158]:
df_eu_clubs.to_csv('clubs.csv',index=False)