In [77]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import requests
import re

# Web scrapping


In this project: we want to answer the following questions: 
a) Are European fans loyal to their teams?
b) Basketball is really a team sport in Europe or they really need star players?

To answer that, we will do web scrapping so we can get different data. We will need Stadium and attendances data, as well as average stats of each player per season.

We will focus on the last edition of the Euroleague so we can get complete information to analyze. In addition, it is the first season since COVID where no restriccions existed.

## Stadium Capacity

We will use Wikipedia information to get the data. In the following cell we can see how did we extract and treated data to achieve this.

In [78]:
url = "https://en.wikipedia.org/wiki/2022%E2%80%9323_EuroLeague"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')


attendance = soup.find('span', {'id': 'Venues_and_locations'})


table = attendance.find_next('table', {'class': 'wikitable'})


data = {'Team': [], 'Arena': [], 'Capacity': []}


for row in table.find_all('tr')[1:]:
    columns = row.find_all(['td', 'th'])

  
    while len(columns) < 4:
        columns.append(None)

    data['Team'].append(columns[0].get_text(strip=True) if columns[0] else None)
    data['Arena'].append(columns[2].get_text(strip=True) if columns[2] else None)
    data['Capacity'].append(columns[3].get_text(strip=True) if columns[3] else None)


df = pd.DataFrame(data)

df['Capacity'] = df['Capacity'].str.replace(',', '').str.strip()  
df['Capacity'] = df['Capacity'].str[:-3]
df['Capacity'] = df['Capacity'].str.replace('[', '').str.strip() 
df.loc[17] =  ["Virtus Segafredo Bologna","PalaDozza","5570"]
df['Capacity'] = df['Capacity'].astype(int)
df.drop(index=17, axis=0, inplace=True)

df

Unnamed: 0,Team,Arena,Capacity
0,ALBA Berlin,Mercedes-Benz Arena,14500
1,Anadolu Efes,Sinan Erdem Dome,16000
2,Barcelona,Palau Blaugrana,7585
3,Bayern Munich,Audi Dome,6500
4,Cazoo Baskonia,Buesa Arena,15504
5,Crvena Zvezda Meridianbet,Aleksandar Nikolić Hall,8000
6,EA7 Emporio Armani Milan,Mediolanum Forum,12700
7,Fenerbahçe Beko,Ülker Sports and Event Hall,13059
8,LDLC ASVEL,Astroballe,5556
9,Maccabi Playtika Tel Aviv,Menora Mivtachim Arena,10383


In [79]:
df.reset_index(drop=True, inplace = True)

In [80]:
df.to_csv('capacity_stadium.csv', index=False)

In [81]:
df = pd.read_csv('capacity_stadium.csv')

## Average Attendance

We do exactly the same with Average attendance.

In [82]:

url = "https://en.wikipedia.org/wiki/2022%E2%80%9323_EuroLeague"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

attendance = soup.find('span', {'id': 'Attendances'})

table = attendance.find_next('table', {'class': 'wikitable'})

data = {'Team': [], 'Average': []}

for row in table.find_all('tr')[1:]:
    columns = row.find_all(['td', 'th'])

    data['Team'].append(columns[1].get_text(strip=True) if columns[0] else None)
    data['Average'].append(columns[5].get_text(strip=True) if columns[1] else None)

df2 = pd.DataFrame(data)
    
df2['Average'] = df2['Average'].str.replace(",", "")
df2['Average'] = df2['Average'].astype(int)

df2

Unnamed: 0,Team,Average
0,Partizan Mozzart Bet,17938
1,Žalgiris,14836
2,Anadolu Efes,13099
3,Final Four in Kaunas,10871
4,Fenerbahçe Beko,10476
5,Olympiacos,10454
6,Maccabi Playtika Tel Aviv,10338
7,EA7 Emporio Armani Milan,9443
8,Cazoo Baskonia,8919
9,ALBA Berlin,8877


In [83]:
df2.to_csv('average_attendance.csv', index=False)

In [84]:
df2 = pd.read_csv('average_attendance.csv')

## League Table

League Standings are also needed to analyze team performances:

In [85]:
url = "https://en.wikipedia.org/wiki/2022%E2%80%9323_EuroLeague"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml') 

league_table = soup.find('span', {'id': 'League_table'})


table = league_table.find_next('table', {'class': 'wikitable'})

data = {'Team': [], 'W': [], 'L': [], 'Position': ['Final Four', 'Final Four','Final Four','Final Four','Playoff','Playoff','Playoff','Playoff','Out','Out','Out','Out','Out','Out','Out','Out','Out','Out']}

for row in table.find_all('tr')[1:]:
    columns = row.find_all(['td', 'th'])

    data['Team'].append(columns[1].get_text(strip=True) if columns[0] else None)
    data['W'].append(columns[3].get_text(strip=True) if columns[1] else None)
    data['L'].append(columns[4].get_text(strip=True) if columns[2] else None)
    
df3 = pd.DataFrame(data)
    
df3.loc[16] =  ["Panathinaikos","9","23", "Out"]
df3

Unnamed: 0,Team,W,L,Position
0,Olympiacos,24,10,Final Four
1,Barcelona,23,11,Final Four
2,Real Madrid,23,11,Final Four
3,AS Monaco,21,13,Final Four
4,Maccabi Playtika Tel Aviv,20,14,Playoff
5,Partizan Mozzart Bet,20,14,Playoff
6,Žalgiris,19,15,Playoff
7,Fenerbahçe Beko,19,15,Playoff
8,Cazoo Baskonia,18,16,Out
9,Crvena zvezda Meridianbet,17,17,Out


In [86]:
df3.to_csv('league_standings.csv', index=False)

In [87]:
df3 = pd.read_csv('league_standings.csv')

## Individual Stats Database

We create a csv file with all individual season stats.

In [88]:
euroleague_stats = '/Ironhack/projects/Project-II/euroleague_player_stats.csv'
euroleague_stats = pd.read_csv("euroleague_player_stats.csv")
euroleague_stats

Unnamed: 0,Player,Team,GP,MPG,PTS,FGM,FGA,FG_Percentage,3PM,3PA,...,FTA,FT_Percentage,OREB,DREB,REB,AST,TOV,STL,BLK,PF
0,Sasha Vezenkov,OLY,40,29.2,17.6,6.5,12.2,0.536,2.0,5.2,...,2.9,0.879,1.6,5.3,6.8,1.9,0.9,0.3,1.0,1.6
1,Wade Baldwin IV,MAC,34,28.1,17.2,6.0,13.8,0.438,1.9,5.8,...,3.9,0.821,0.5,3.2,3.8,5.1,0.9,0.3,2.6,2.8
2,Will Clyburn,EFE,34,33.1,16.7,5.6,12.9,0.436,2.1,5.7,...,4.0,0.830,1.0,4.7,5.7,2.1,1.0,0.2,1.6,2.1
3,Dwayne Bacon,PAN,27,30.8,16.6,5.6,14.4,0.386,1.7,5.7,...,4.7,0.810,0.7,2.8,3.5,2.0,0.7,0.1,1.3,2.6
4,Lorenzo Brown,MAC,35,31.6,16.4,6.1,13.7,0.444,1.9,5.3,...,2.6,0.911,0.3,2.7,3.0,5.5,1.1,0.1,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,Rafi Manco,MAC,22,5.4,1.2,0.5,1.2,0.370,0.2,0.8,...,0.1,1.000,0.4,0.5,0.9,0.3,0.2,0.0,0.2,1.1
239,Eli John N'Diaye,RMB,15,5.7,1.1,0.4,0.9,0.462,0.1,0.4,...,0.3,1.000,0.4,0.8,1.2,0.1,0.2,0.3,0.3,0.9
240,Tristan Vukcevic,PAR,14,6.2,1.1,0.4,1.4,0.300,0.1,0.6,...,0.1,1.000,0.1,0.9,1.1,0.5,0.3,0.2,0.4,1.7
241,Erten Gazi,EFE,11,4.7,0.7,0.2,1.0,0.182,0.0,0.6,...,0.5,0.800,0.3,0.5,0.7,0.1,0.3,0.0,0.1,1.0


### Cleaning Players' Database

In [89]:
euroleague_stats = euroleague_stats[["Player", "Team", "PTS"]]
euroleague_stats

Unnamed: 0,Player,Team,PTS
0,Sasha Vezenkov,OLY,17.6
1,Wade Baldwin IV,MAC,17.2
2,Will Clyburn,EFE,16.7
3,Dwayne Bacon,PAN,16.6
4,Lorenzo Brown,MAC,16.4
...,...,...,...
238,Rafi Manco,MAC,1.2
239,Eli John N'Diaye,RMB,1.1
240,Tristan Vukcevic,PAR,1.1
241,Erten Gazi,EFE,0.7


### Filtering Elite Players

We will filter the whole players' database to understand how individual performances affect team results. We will considerate a player as a Star if his performance is in the top 10% overall. This way, only 24 players made the cut.

In [90]:
star_players = euroleague_stats.head(24)     
star_players

Unnamed: 0,Player,Team,PTS
0,Sasha Vezenkov,OLY,17.6
1,Wade Baldwin IV,MAC,17.2
2,Will Clyburn,EFE,16.7
3,Dwayne Bacon,PAN,16.6
4,Lorenzo Brown,MAC,16.4
5,Kevin Punter,PAR,16.1
6,Vasilije Micic,EFE,16.0
7,Mike James,ASM,15.9
8,Keenan Evans,ZAL,15.9
9,Nikola Mirotic,FCB,15.4


## Count of star players per team

In [92]:
star_players_teams = star_players['Team'].value_counts()

star_players_teams = pd.DataFrame({'Team': star_players_teams.index, 'Players Count': star_players_teams.values})
star_players_teams_rename = {
    'ASM': 'AS Monaco',
    'MAC': 'Maccabi Playtika Tel Aviv',
    'EFE': 'Anadolu Efes',
    'PAN': 'Panathinaikos',
    'PAR': 'Partizan Mozzart Bet',
    'FEN': 'Fenerbahçe Beko',
    'BASK': 'Cazoo Baskonia',
    'ZVE': 'Crvena zvezda Meridianbet',
    'OLY': 'Olympiacos',
    'ZAL': 'Žalgiris',
    'FCB': 'Barcelona',
    'MIL': 'EA7 Emporio Armani Milan',
    'RMB': 'Real Madrid',
    'LYV': 'LDLC ASVEL',
    'VAL': 'Valencia Basket'
}
 
star_players_teams.replace({'Team': star_players_teams_rename}, inplace=True)

star_players_teams['Position'] = star_players_teams['Team'].map({
    'AS Monaco' :  'Final Four',
    'Maccabi Playtika Tel Aviv' : 'Playoff',
    'Anadolu Efes' :'Out',
    'Panathinaikos' : 'Out',
    'Partizan Mozzart Bet' : 'Playoff',
    'Fenerbahçe Beko' : 'Playoff',
    'Cazoo Baskonia' : 'Out',
    'Crvena zvezda Meridianbet' : 'Out',
    'Olympiacos' : 'Final Four',
    'Žalgiris' : 'Playoff',
    'Barcelona' : 'Final Four',
    'EA7 Emporio Armani Milan' : 'Out',
    'Real Madrid' : 'Final Four',
    'LDLC ASVEL' : 'Out',
    'Valencia Basket' : 'Out'})

star_players_teams

Unnamed: 0,Team,Players Count,Position
0,AS Monaco,3,Final Four
1,Maccabi Playtika Tel Aviv,2,Playoff
2,Anadolu Efes,2,Out
3,Panathinaikos,2,Out
4,Partizan Mozzart Bet,2,Playoff
5,Fenerbahçe Beko,2,Playoff
6,Cazoo Baskonia,2,Out
7,Crvena zvezda Meridianbet,2,Out
8,Olympiacos,1,Final Four
9,Žalgiris,1,Playoff


In [93]:
star_players_teams.to_csv('star_count.csv', index = False)

In [94]:
merge_1 = pd.merge(df3,df2, on = 'Team')
attendance_df = pd.merge(merge_1, df, on= 'Team')
attendance_df.drop('Arena', axis = 1, inplace = True)
attendance_df.drop('W', axis = 1, inplace = True)
attendance_df.drop('L', axis = 1, inplace = True)

attendance_df["Percentage of attendance"] = attendance_df["Average"] / attendance_df["Capacity"]
attendance_df

Unnamed: 0,Team,Position,Average,Capacity,Percentage of attendance
0,Olympiacos,Final Four,10454,11847,0.882417
1,Barcelona,Final Four,6357,7585,0.838102
2,Real Madrid,Final Four,8141,13109,0.621024
3,AS Monaco,Final Four,4392,5000,0.8784
4,Maccabi Playtika Tel Aviv,Playoff,10338,10383,0.995666
5,Partizan Mozzart Bet,Playoff,17938,19394,0.924925
6,Žalgiris,Playoff,14836,15415,0.962439
7,Fenerbahçe Beko,Playoff,10476,13059,0.802205
8,Cazoo Baskonia,Out,8919,15504,0.575271
9,Anadolu Efes,Out,13099,16000,0.818688


In [95]:
attendance_df.to_csv('attendance.csv', index=False)