<a href="https://colab.research.google.com/github/slehkyi/notebooks-for-articles/blob/master/Web_Scrapping_Understats_com_for_xG_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# print(os.listdir("../input"))

import requests
from bs4 import BeautifulSoup

In [0]:
# Build the URL of one league/season page on understat.com and download it.
# NOTE: only a single page is fetched here (league[0] / season[4]); the lists
# hold the other league and season path segments that could be substituted.
base_url = 'https://understat.com/league'
league = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1']
season = ['2014', '2015', '2016', '2017', '2018']

url = '/'.join([base_url, league[0], season[4]])

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of as a confusing
# parsing failure in a later cell.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results could vary between machines.
soup = BeautifulSoup(res.content, 'html.parser')

# Based on the structure of the webpage, the data is stored in JSON variables inside <script> tags
scripts = soup.find_all('script')

# Check our <script> tags
# for el in scripts:
#   print('*'*50)
#   print(el.text)

In [0]:
import json

string_with_json_obj = ''

# Find the <script> element that mentions teamsData
# (if several match, the last one wins, as before).
for el in scripts:
  if 'teamsData' in el.text:
    string_with_json_obj = el.text.strip()

# Fail with a clear message instead of the opaque "substring not found"
# that str.index() would raise on the empty string.
if not string_with_json_obj:
  raise ValueError("No <script> tag containing 'teamsData' was found on the page")

# print(string_with_json_obj)

# Strip unnecessary symbols and keep only the JSON payload, i.e. the text
# between "('" and "')". Presumably the script has the form
#   var teamsData = JSON.parse('<escaped JSON>')   -- TODO confirm against the page.
# The search starts at the teamsData name so that an earlier call in the same
# script cannot be picked up by mistake, and the closing "')" is only looked
# for after the opening one.
var_pos = string_with_json_obj.index('teamsData')
ind_start = string_with_json_obj.index("('", var_pos) + 2
ind_end = string_with_json_obj.index("')", ind_start)
json_data = string_with_json_obj[ind_start:ind_end]

# Resolve backslash escapes (\xNN, \uNNNN, ...) in the payload.
# 'unicode_escape' decodes its input bytes as Latin-1, so encoding with UTF-8
# first would turn any raw non-ASCII character into mojibake. Encoding as
# Latin-1 with 'backslashreplace' keeps characters up to U+00FF intact and
# converts everything else into escapes that the decoder restores.
json_data = json_data.encode('latin-1', 'backslashreplace').decode('unicode_escape')

In [0]:
# Convert the JSON text into a Python dictionary.
# Later cells index `data` by team id and read each entry's 'title' and 'history'.
data = json.loads(json_data)

# Print pretty JSON data to check out what we have there
# s = json.dumps(data, indent=4, sort_keys=True)
# print(s)

In [0]:
# Map each team id to its team name ('title') in a separate dictionary.
# A dict comprehension avoids a module-level loop variable called `id`,
# which would shadow the built-in id() for the rest of the notebook.
teams = {team_id: team_info['title'] for team_id, team_info in data.items()}

In [0]:
# EDA to get a feeling of how the JSON is structured.
# Column names are assumed to be the same for every match record, so the first
# match of the first team serves as the sample.
first_team = next(iter(data.values()), None)
sample_match = first_team['history'][0] if first_team is not None else {}

# Column names of a match record
columns = list(sample_match.keys())
# Sample of values per each column
values = list(sample_match.values())

print(columns)
print(values)

['h_a', 'xG', 'xGA', 'npxG', 'npxGA', 'ppda', 'ppda_allowed', 'deep', 'deep_allowed', 'scored', 'missed', 'xpts', 'result', 'date', 'wins', 'draws', 'loses', 'pts', 'npxGD']
['a', 3.34997, 2.39239, 3.34997, 1.64976, {'att': 252, 'def': 19}, {'att': 263, 'def': 20}, 12, 5, 4, 1, 1.9829999999999999, 'w', '2018-08-19 21:15:00', 1, 0, 0, 3, 1.70021]


In [0]:
# Sevilla is stored under id '138'. Its frame is built first, as a single-team
# trial of the steps that are later repeated for every team in the league.
sevilla_history = data['138']['history']
# One list of values per match record, in the record's own key order.
sevilla_data = [list(game.values()) for game in sevilla_history]

df = pd.DataFrame(data=sevilla_data, columns=columns)
df.head(n=2)

Unnamed: 0,h_a,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,deep_allowed,scored,missed,xpts,result,date,wins,draws,loses,pts,npxGD
0,a,3.34997,2.39239,3.34997,1.64976,"{'att': 252, 'def': 19}","{'att': 263, 'def': 20}",12,5,4,1,1.983,w,2018-08-19 21:15:00,1,0,0,3,1.70021
1,h,1.97161,0.671429,1.97161,0.671429,"{'att': 262, 'def': 16}","{'att': 237, 'def': 26}",11,3,0,0,2.3331,d,2018-08-26 21:15:00,0,1,0,1,1.300181


In [0]:
# Build one DataFrame of match records per team, keyed by team name.
# The loop variable is called `team_id` rather than `id` so that the built-in
# id() is not shadowed.
dataframes = {}
for team_id, team in teams.items():
  # One list of values per match record; the values are labelled with `columns`,
  # which assumes every record shares the sample record's key order.
  teams_data = [list(row.values()) for row in data[team_id]['history']]

  df = pd.DataFrame(teams_data, columns=columns)
  dataframes[team] = df
  print('Added data for {}.'.format(team))
  

Added data for Sevilla.
Added data for Real Sociedad.
Added data for Espanyol.
Added data for Getafe.
Added data for Atletico Madrid.
Added data for Rayo Vallecano.
Added data for Valencia.
Added data for Athletic Club.
Added data for Barcelona.
Added data for Real Madrid.
Added data for Levante.
Added data for Celta Vigo.
Added data for Real Betis.
Added data for Villarreal.
Added data for Eibar.
Added data for Alaves.
Added data for Leganes.
Added data for Girona.
Added data for Real Valladolid.
Added data for SD Huesca.


In [0]:
# Sample check of our newly created DataFrames: show the first two match rows stored for Barcelona
dataframes['Barcelona'].head(2)

Unnamed: 0,h_a,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,deep_allowed,scored,missed,xpts,result,date,wins,draws,loses,pts,npxGD
0,h,3.26753,0.248353,3.26753,0.248353,"{'att': 118, 'def': 17}","{'att': 407, 'def': 13}",20,0,3,0,2.9009,w,2018-08-18 23:15:00,1,0,0,3,3.019177
1,a,1.20392,0.510742,1.20392,0.510742,"{'att': 163, 'def': 16}","{'att': 316, 'def': 15}",15,4,1,0,1.9865,w,2018-08-25 23:15:00,1,0,0,3,0.693178


In [0]:
def _ppda_ratio(ppda):
  """Return ppda['att'] / ppda['def'], or NaN when 'def' is zero.

  `ppda` is one cell of the 'ppda' / 'ppda_allowed' columns: a dict with
  'att' and 'def' entries. NaN (instead of a ZeroDivisionError) lets the
  later mean() simply skip a match without a usable ratio.
  """
  return ppda['att'] / ppda['def'] if ppda['def'] else float('nan')


# Add the two derived ratio columns to every team's frame.
# `df` is the very object stored in `dataframes[team]`, so assigning to it
# updates the dictionary entry directly.
# NOTE(review): the saved output of this cell also lists a `ppda_allowed_coef`
# column that this code does not create - presumably left over from an earlier
# run in the same kernel. Re-run the notebook on a fresh kernel to refresh it.
for team, df in dataframes.items():
  df['ppda_coef'] = df['ppda'].apply(_ppda_ratio)
  df['oppda_coef'] = df['ppda_allowed'].apply(_ppda_ratio)

# And check how our new dataframes look based on Sevilla dataframe
dataframes['Sevilla'].head(2)

Unnamed: 0,h_a,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,deep_allowed,scored,missed,xpts,result,date,wins,draws,loses,pts,npxGD,ppda_coef,ppda_allowed_coef,oppda_coef
0,a,3.34997,2.39239,3.34997,1.64976,"{'att': 252, 'def': 19}","{'att': 263, 'def': 20}",12,5,4,1,1.983,w,2018-08-19 21:15:00,1,0,0,3,1.70021,13.263158,13.15,13.15
1,h,1.97161,0.671429,1.97161,0.671429,"{'att': 262, 'def': 16}","{'att': 237, 'def': 26}",11,3,0,0,2.3331,d,2018-08-26 21:15:00,0,1,0,1,1.300181,16.375,9.115385,9.115385


In [0]:
# Per-match columns that get added up over each team's matches.
cols_to_sum = [
    'xG', 'xGA', 'npxG', 'npxGA',
    'deep', 'deep_allowed',
    'scored', 'missed',
    'xpts',
    'wins', 'draws', 'loses', 'pts',
    'npxGD',
]
# Per-match ratios that get averaged over each team's matches instead.
cols_to_mean = ['ppda_coef', 'oppda_coef']

In [0]:
def _season_row(club, matches):
  """Collapse one team's match frame into a single summary row.

  Columns in `cols_to_sum` are totalled, columns in `cols_to_mean` are
  averaged, and the team name is appended as a 'team' column.
  """
  totals = pd.DataFrame(matches[cols_to_sum].sum()).T
  averages = pd.DataFrame(matches[cols_to_mean].mean()).T
  row = totals.join(averages)
  row['team'] = club
  return row


# One summary row per team, stacked into a single table.
frames = [_season_row(club, matches) for club, matches in dataframes.items()]

full_stat = pd.concat(frames)

In [0]:
# Matches played = wins + draws + losses. Deriving the figure from the results
# (instead of hard-coding 38.0) keeps it right for leagues with a different
# number of rounds and for seasons that are still in progress.
full_stat['matches'] = full_stat[['wins', 'draws', 'loses']].sum(axis=1)

column_order = [
    'team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts',
    'xG', 'npxG', 'xGA', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef',
    'deep', 'deep_allowed', 'xpts',
]

# Reorder the columns, rank the teams by points and renumber the rows from 0.
# Chained calls return new frames rather than mutating one in place.
full_stat = (
    full_stat[column_order]
    .sort_values('pts', ascending=False)
    .reset_index(drop=True)
)
full_stat.head(10)

Unnamed: 0,team,matches,wins,draws,loses,scored,missed,pts,xG,npxG,xGA,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts
0,Barcelona,38.0,26.0,9.0,3.0,90.0,36.0,87.0,83.279534,76.584704,44.93095,43.4444,33.140304,9.015264,16.404885,417.0,171.0,73.9604
1,Atletico Madrid,38.0,22.0,10.0,6.0,55.0,29.0,76.0,51.872324,48.730936,41.434059,37.717794,11.013142,11.066205,11.104778,252.0,190.0,59.4276
2,Real Madrid,38.0,21.0,5.0,12.0,63.0,46.0,68.0,68.654773,61.965525,48.675753,42.729664,19.235861,8.896578,14.782501,341.0,168.0,64.7728
3,Valencia,38.0,15.0,16.0,7.0,51.0,35.0,61.0,61.884949,56.569019,42.854367,36.90841,19.660609,12.964658,9.474007,278.0,215.0,65.1616
4,Sevilla,38.0,17.0,8.0,13.0,62.0,47.0,59.0,69.162306,64.542453,46.710732,41.508815,23.033638,10.652031,10.02062,321.0,211.0,65.0781
5,Getafe,38.0,15.0,14.0,9.0,48.0,35.0,59.0,47.034178,42.578151,44.226041,39.02311,3.555041,8.766903,5.700771,186.0,196.0,53.1872
6,Espanyol,38.0,14.0,11.0,13.0,48.0,50.0,53.0,50.157346,47.184228,54.616086,48.548747,-1.364519,9.856489,9.819171,241.0,241.0,50.0883
7,Athletic Club,38.0,13.0,14.0,11.0,41.0,45.0,53.0,44.441324,38.917079,47.158818,43.442428,-4.525349,8.304492,11.302558,221.0,185.0,50.0122
8,Real Sociedad,38.0,13.0,11.0,14.0,45.0,46.0,50.0,47.987101,40.554466,48.09444,45.68397,-5.129504,9.943556,9.488771,194.0,208.0,51.1255
9,Alaves,38.0,13.0,11.0,14.0,39.0,50.0,50.0,40.873013,38.643174,54.532088,50.072421,-11.429247,11.22823,7.09671,129.0,270.0,44.0162
