In [101]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def generate_teams_id_dataframe(season: str) -> list:
    """Scrape the team names for one Premier League season from Wikipedia.

    Despite the name, this returns a plain list of names, not a DataFrame.

    Args:
        season: Season label interpolated into the article URL, e.g. '2018-19'.

    Returns:
        The stripped text of the first <td> of every data row of the selected
        table, in page order, excluding any text that contains 'London'.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no table with the expected class.
    """
    url = f'https://en.wikipedia.org/wiki/{season}_Premier_League'
    # Bound the wait and fail loudly on 4xx/5xx rather than parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the team names.
    # The 2019-20 article is matched on the stricter class string; presumably
    # its first plain 'wikitable' is some other table -- TODO confirm on the page.
    table_class = 'wikitable sortable' if season == '2019-20' else 'wikitable'
    table = soup.find('table', {'class': table_class})
    if table is None:
        raise ValueError(f'No {table_class!r} table found at {url}')

    # Extract the team names from the table
    team_names = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cells = row.find_all('td')
        if not cells:
            continue  # rows without any <td> (e.g. all-<th> rows) carry no team
        team_name = cells[0].text.strip()
        # presumably drops rows whose first <td> is a location rather than a
        # club name -- verify; note it would also drop any club with 'London'
        # in its name.
        if 'London' not in team_name:
            team_names.append(team_name)

    return team_names

In [102]:
# Try the scraper on a single season and display the returned list of names.
# NOTE(review): the saved output also shows raw <table> HTML, which the current
# function body never prints -- presumably stale from an earlier debug print;
# re-run the notebook to refresh it.
generate_teams_id_dataframe('2018-19')

<table class="wikitable sortable">
<tbody><tr>
<th>Team
</th>
<th>Location
</th>
<th>Stadium
</th>
<th>Capacity<sup class="reference" id="cite_ref-23"><a href="#cite_note-23">[23]</a></sup>
</th></tr>
<tr>
<td><a href="/wiki/Arsenal_F.C." title="Arsenal F.C.">Arsenal</a>
</td>
<td><a href="/wiki/London" title="London">London</a> <span style="font-size:85%;">(<a href="/wiki/Holloway,_London" title="Holloway, London">Holloway</a>)</span>
</td>
<td><a href="/wiki/Emirates_Stadium" title="Emirates Stadium">Emirates Stadium</a>
</td>
<td style="text-align:center"><span data-sort-value="7004602600000000000♠">60,260</span>
</td></tr>
<tr>
<td><a href="/wiki/AFC_Bournemouth" title="AFC Bournemouth">Bournemouth</a>
</td>
<td><a href="/wiki/Bournemouth" title="Bournemouth">Bournemouth</a>
</td>
<td><a href="/wiki/Dean_Court" title="Dean Court">Dean Court</a>
</td>
<td style="text-align:center"><span data-sort-value="7004113290000000000♠">11,329</span>
</td></tr>
<tr>
<td><a href="/wiki/Brighton_

['Arsenal',
 'Bournemouth',
 'Brighton & Hove Albion',
 'Burnley',
 'Cardiff City',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Huddersfield Town',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Southampton',
 'Tottenham Hotspur',
 'Watford',
 'West Ham United',
 'Wolverhampton Wanderers']

In [103]:
# Season labels to scrape, oldest first. Order matters: add_promotion_indicator
# compares each season with the entry listed immediately before it.
seasons = [
    '2016-17',
    '2017-18',
    '2018-19',
    '2019-20',
    '2020-21',
    '2021-22',
    '2022-23',
]

In [104]:
def populate_team_dataframe(seasons: list) -> pd.DataFrame:
    """Build one table of teams covering every requested season.

    Calls generate_teams_id_dataframe once per season and numbers the returned
    names 1..n in the order they come back. Numbering restarts for each season,
    so the same club can receive different team_id values in different seasons.

    Args:
        seasons: Season labels, each passed unchanged to
            generate_teams_id_dataframe.

    Returns:
        DataFrame with columns 'teams', 'team_id' and 'season' and a fresh
        0-based index; it has the columns but no rows when nothing is scraped.
    """
    # Collect plain rows and build the frame once: concatenating onto a growing
    # frame inside the loop re-copies every earlier row on each iteration, and
    # seeding with an empty frame makes the result dtypes pandas-version dependent.
    rows = [
        (team, team_id, season)
        for season in seasons
        for team_id, team in enumerate(generate_teams_id_dataframe(season), start=1)
    ]
    return pd.DataFrame(rows, columns=['teams', 'team_id', 'season'])

In [105]:
# Scrape every season in `seasons` (one HTTP request per season) into one frame.
teams = populate_team_dataframe(seasons)

<table class="wikitable sortable">
<tbody><tr>
<th>Team
</th>
<th>Location and County
</th>
<th>Stadium
</th>
<th>Capacity<sup class="reference" id="cite_ref-11"><a href="#cite_note-11">[11]</a></sup>
</th></tr>
<tr>
<td><a href="/wiki/Arsenal_F.C." title="Arsenal F.C.">Arsenal</a>
</td>
<td><a href="/wiki/London" title="London">London</a> <span style="font-size:85%;">(<a href="/wiki/Holloway,_London" title="Holloway, London">Holloway</a>)</span>
</td>
<td><a href="/wiki/Emirates_Stadium" title="Emirates Stadium">Emirates Stadium</a>
</td>
<td>60,432
</td></tr>
<tr>
<td><a href="/wiki/AFC_Bournemouth" title="AFC Bournemouth">Bournemouth</a>
</td>
<td><a href="/wiki/Bournemouth" title="Bournemouth">Bournemouth</a>
</td>
<td><a href="/wiki/Dean_Court" title="Dean Court">Dean Court</a>
</td>
<td>11,464
</td></tr>
<tr>
<td><a href="/wiki/Burnley_F.C." title="Burnley F.C.">Burnley</a>
</td>
<td><a href="/wiki/Burnley" title="Burnley">Burnley</a>
</td>
<td><a href="/wiki/Turf_Moor" title="Tu

In [106]:
# Display the combined frame; pandas elides the middle rows in the rendered output.
teams

Unnamed: 0,teams,team_id,season
0,Arsenal,1,2016-17
1,Bournemouth,2,2016-17
2,Burnley,3,2016-17
3,Chelsea,4,2016-17
4,Crystal Palace,5,2016-17
...,...,...,...
135,Nottingham Forest,16,2022-23
136,Southampton,17,2022-23
137,Tottenham Hotspur,18,2022-23
138,West Ham United,19,2022-23


In [107]:
def add_promotion_indicator(teams: pd.DataFrame, seasons: list) -> pd.DataFrame:
    """Flag teams that were absent from the preceding season.

    A row gets 1 when its team has no row for the season listed immediately
    before its own in `seasons`, and 0 otherwise. Rows in the first listed
    season always get 0 because there is nothing to compare against.

    Args:
        teams: Frame with at least 'teams' and 'season' columns. It is modified
            in place: a 'promotion_indicator' column is added or overwritten.
        seasons: Season labels in chronological order.

    Returns:
        The same `teams` object, now carrying the 'promotion_indicator' column.

    Raises:
        ValueError: If a row's season does not appear in `seasons`.
    """
    # Map each season to the one listed just before it (first occurrence wins,
    # matching list.index); the first season has no predecessor.
    previous_season = {}
    for earlier, later in zip(seasons, seasons[1:]):
        previous_season.setdefault(later, earlier)
    if seasons:
        previous_season[seasons[0]] = None

    # One set lookup per row replaces a full-frame boolean filter per row.
    present = set(zip(teams['teams'], teams['season']))

    promotion_indicator = []
    for team, season in zip(teams['teams'], teams['season']):
        if season not in previous_season:
            raise ValueError(f'season {season!r} is not in seasons')
        prior = previous_season[season]
        promoted = prior is not None and (team, prior) not in present
        promotion_indicator.append(int(promoted))

    teams['promotion_indicator'] = promotion_indicator
    return teams


In [108]:
# Add the 0/1 promotion flag. The function writes the column onto the frame it
# is given and returns that same frame, so re-running this cell just overwrites it.
teams = add_promotion_indicator(teams, seasons)
teams

Unnamed: 0,teams,team_id,season,promotion_indicator
0,Arsenal,1,2016-17,0
1,Bournemouth,2,2016-17,0
2,Burnley,3,2016-17,0
3,Chelsea,4,2016-17,0
4,Crystal Palace,5,2016-17,0
...,...,...,...,...
135,Nottingham Forest,16,2022-23,1
136,Southampton,17,2022-23,0
137,Tottenham Hotspur,18,2022-23,0
138,West Ham United,19,2022-23,0


In [110]:
# Persist the result as CSV; with to_csv defaults the index is written as an
# unnamed first column.
# NOTE(review): absolute, user-specific path -- this cell will fail on any other
# machine; consider a path relative to the repository root.
teams.to_csv('/Users/storm/Documents/GitHub/Fantasy-Premier-League/webscraper_output/teams_webscraper.csv')