#1. Scraping Premier League Data 
> Thanks to Premier League Webiste https://www.premierleague.com <br/>

> The collected data is finally put into a Google Data Studio dashboard<br/>

> Uses two techniques to collect data 1. Selenium - for dynamically loaded pages 2. Opta API<br/>

> Scraping to get player data, team related data and the player statistics<br/>

> Collects data between 2015-2016 and 2019-2020 season


## 1.1 Loading Modules

In [0]:
%%capture
!pip install selenium
from selenium import webdriver
!wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
!tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
!cp phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin
#!ls -al
driver = webdriver.PhantomJS('phantomjs')
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

##1.2 Makes use of API to scrape data

In [0]:
import requests
import pandas as pd
import json
from tqdm import tqdm

class ScrapPremData:
  """ This class gets the data from the premier league website """
 
  def getPlayersData(self, season_details, url, header, queryParams):

    """ Gets player bio data"""

    column_names = ['Season_Id', 'Player_Id', 'Position', 'ShirtNum', 'PositionInfo', 'Country', 'Club', 'Team_Id', 'DOB', 'Age', 'Name']
    season_full = []

    for season in tqdm(season_details.keys()):

        s = season_details[season]

        for season_page in range(0,10):
          ## Start of block 1 ##  
          season_temp = []
          queryParams['page'] = season_page
          try:
            response = requests.get(url = url, headers = header, params = queryParams)
            if response.status_code == 200:
              data = json.loads(response.text)
            else:
              break
          except Exception as e: print(e)
          ## End of block 1 ##

          ## Start of block 2 ##
          
          if len(data['content']) > 0:
            for player in data["content"]:
              try:  
                  player_id = int(player['id'])
                  position = player['info']['position']
                  shirtNum = int(player['info']['shirtNum'])
                  positionInfo = player['info']['positionInfo']
                  country = player['nationalTeam']['country']
                  club = player['currentTeam']['name']
                  team_id = int(player['currentTeam']['id'])
                  dob = player['birth']['date']['label']
                  age = player['age']
                  name = player['name']['display']
                  season_temp.append([s, player_id, position, shirtNum, positionInfo, country, club, team_id, dob, age, name])
              except: pass
          else:
            break
          season_full += season_temp
          ## End of block 2 ##
      
    df = pd.DataFrame(season_full, columns = column_names)
    return df

  def getTeamsData(self, season_details, url, header, params):

    """ Gets teams related data"""

    column_names = ['Season_Id','Team_Name', 'Team_Id', 'Ground', 'City', 'Capacity']
    season_full = []
    for season in tqdm(season_details.keys()):
      s = season_details[season]
      ## Start of block 1 ##
      try:
        response = requests.get(url = url, headers = header, params = queryParams)
        if response.status_code == 200:
          data = json.loads(response.text)
        else:
          break
      except Exception as e: print(e)
      ## End of block 1 ##

      ## Start of block 2 ##
          
      if len(data['content']) > 0:
        for team in data['content']:
          try:
            Team_Name = team['name']
            Team_Id = int(team['club']['id'])
            ground = team['grounds'][0]['name']
            city = team['grounds'][0]['city']
            capacity = int(team['grounds'][0]['capacity'])
            season_full.append([s, Team_Name, Team_Id, ground, city, capacity])
          except: pass
    return pd.DataFrame(season_full, columns = column_names)

  def getPlayerStats(self, season_details, player_df, header, queryParams):

    """ Gets player statistics data 
        Empty columns were handles as -1 for clarity
    """

    full_details = []
    column_names = []
    for index, row in tqdm(player_df.iterrows()):
      url = "https://footballapi.pulselive.com/football/stats/player/"+str(row['Player_Id'])
      queryParams['compSeasons'] = list(season_details.keys())[list(season_details.values()).index(row['Season_Id'])]
      response = requests.get(url = url, headers = header, params = queryParams)
      if response.status_code == 200:
        try:
          data = json.loads(response.text)
          if len(data['stats']) > 0:
            data1 = {i['name'] : i['value'] for i in data['stats']}
            data1['Player_Id'] = row['Player_Id']; data1['Season_Id'] = season_details[queryParams['compSeasons']]
            full_details.append(data1)
          else:
            pass
        except Exception as e:
          pass
      else:
          pass
    return pd.DataFrame.from_dict(full_details).fillna(-1)

  

## 1.3 Scraping player's bio data

In [0]:
header = {
      "content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      "DNT": "1",
      "Origin": "https://www.premierleague.com",
      "Referer": "https://www.premierleague.com/players",
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
queryParams = {
    "pageSize": 200,
    "compSeasons": 274,
    "altIds": True,
    "type": "player",
    "id": -1,
    "compSeasonId": 274
}
season_details = {42 : '2015-2016', 54 : '2016-2017', 79 : '2017-2018', 210 : '2018-2019', 274 : '2019-2020'}
player_details = ScrapPremData().getPlayersData(season_details, "https://footballapi.pulselive.com/football/players", header, queryParams)



player_details.to_csv('Prem_players.csv', sep = '|', index = False)

#from google.colab import drive
#drive.mount('/content/drive')
#!cp Prem_players.csv "/content/drive/My Drive/"

100%|██████████| 5/5 [00:32<00:00,  6.45s/it]


## 1.4 Scraping team related data

In [0]:
header = {
      "content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      "DNT": "1",
      "Origin": "https://www.premierleague.com",
      "Referer": "https://www.premierleague.com/players",
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
queryParams = {
    "pageSize": 100,
    "altIds": True,
    "page": 0,
    "type": "player",
    "compCodeForActivePlayer": -1,
    "comps" : 1
}

team_details = ScrapPremData().getTeamsData(season_details, "https://footballapi.pulselive.com/football/teams", header, queryParams)

team_details.to_csv('team_details.csv', sep = '|', index = False)

#from google.colab import drive
#drive.mount('/content/drive')
#!cp team_details.csv "/content/drive/My Drive/"

100%|██████████| 5/5 [00:00<00:00,  7.23it/s]


##1.5 Scraping player's statistics

In [0]:
header = {
      "content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      "DNT": "1",
      "Origin": "https://www.premierleague.com",
      "Referer": "https://www.premierleague.com/players",
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
queryParams = {
    "co": 1
}

player_stats = ScrapPremData().getPlayerStats(season_details, player_details, header, queryParams)

player_stats.to_csv('player_stats.csv', sep = '|', index = False)

#from google.colab import drive
#drive.mount('/content/drive')
#!cp team_details.csv "/content/drive/My Drive/"

##1.6 Post processing player statistics with (football knowledge)
> A few post processing techniques for easy data loading

In [0]:
player_stats_processed = pd.DataFrame()
player_stats_processed['Player_Id'] = player_stats['Player_Id']
player_stats_processed['Season_Id'] = player_stats['Season_Id']
# Attack
player_stats_processed['pass_percent'] = (player_stats['accurate_pass'] / player_stats['total_pass'])*100
player_stats_processed['cross_percent'] = (player_stats['accurate_cross']/player_stats['total_cross'])*100
player_stats_processed['free_kicks_goals'] = player_stats['att_freekick_goal']
player_stats_processed['big_chances_created'] =  player_stats['big_chance_created'] + player_stats['big_chance_scored']
player_stats_processed['big_chances_missed'] =   player_stats['big_chance_missed']
player_stats_processed['goals'] = player_stats['goals']
player_stats_processed['goal_assist'] = player_stats['goal_assist']
player_stats_processed['touches'] = player_stats['touches']
# Defence
player_stats_processed['ariel_success_percent'] = (player_stats['aerial_won']/ (player_stats['aerial_won'] + player_stats['aerial_lost']))*100
player_stats_processed['total_recoveries_blocks'] = player_stats['ball_recovery'] + player_stats['blocked_cross'] + player_stats['blocked_pass'] + player_stats['blocked_scoring_att']
player_stats_processed['interception'] = player_stats['interception_won'] + player_stats['interceptions_in_box']
player_stats_processed['successful_contest_percent'] = (player_stats['won_contest'] / player_stats['total_contest'])*100
player_stats_processed['successful_tackles_percent'] = (player_stats['won_tackle'] / player_stats['total_tackle'])*100
player_stats_processed['last_man_tackle'] = player_stats['last_man_tackle']
player_stats_processed['clean_sheet'] = player_stats['clean_sheet']
player_stats_processed['effective_clearances'] = player_stats['effective_clearance'] + player_stats['effective_head_clearance']
# GK
player_stats_processed['successful_dive_saves'] = player_stats['dive_catch'] + player_stats['dive_save'] + player_stats['diving_save']
player_stats_processed['keeper_throws'] = player_stats['keeper_throws']
player_stats_processed['penalty_faced'] = player_stats['penalty_faced']
player_stats_processed['penalty_save'] = player_stats['penalty_save']
player_stats_processed['pen_goals_conceded'] = player_stats['pen_goals_conceded']
player_stats_processed['punches'] = player_stats['punches']
player_stats_processed['saves'] = player_stats['saves']
player_stats_processed['standing_saves'] = player_stats['stand_catch'] + player_stats['stand_save']
player_stats_processed['accurate_goal_kicks'] = player_stats['accurate_goal_kicks']
player_stats_processed['accurate_keeper_sweeper'] = player_stats['accurate_keeper_sweeper']
player_stats_processed['accurate_keeper_throws'] = player_stats['accurate_keeper_throws']
player_stats_processed['keeper_pick_up'] = player_stats['keeper_pick_up']

#General
player_stats_processed['appearances'] = player_stats['appearances']
player_stats_processed['draws'] = player_stats['draws']
player_stats_processed['game_started'] = player_stats['game_started']
player_stats_processed['losses'] = player_stats['losses']
player_stats_processed['mins_played'] = player_stats['mins_played']
player_stats_processed['wins'] = player_stats['wins']
player_stats_processed['total_sub_off'] = player_stats['total_sub_off']
player_stats_processed['total_sub_on'] = player_stats['total_sub_on']
player_stats_processed['total_distance_in_m'] = player_stats['total_distance_in_m']
player_stats_processed['was_fouled'] = player_stats['was_fouled']
#Discipline
player_stats_processed['dangerous_play'] = player_stats['dangerous_play']
player_stats_processed['dispossessed'] = player_stats['dispossessed']
player_stats_processed['fouls'] = player_stats['fouls']
player_stats_processed['own_goals'] = player_stats['own_goals']
player_stats_processed['red_card'] = player_stats['red_card']
player_stats_processed['yellow_card'] = player_stats['yellow_card']

In [0]:
player_stats_final = pd.DataFrame()
player_stats_final['Season_Id'] = player_stats_processed['Season_Id']
num = player_stats_processed._get_numeric_data()
num = num.clip(lower = 0)
player_stats_final = pd.concat([player_stats_final, num], axis = 1)
player_stats_final_temp = player_stats_final[['pass_percent',	'cross_percent', 'ariel_success_percent', 'successful_contest_percent', 'successful_tackles_percent']]
player_stats_final.drop(columns = ['pass_percent',	'cross_percent', 'ariel_success_percent', 'successful_contest_percent', 'successful_tackles_percent'], inplace = False)
player_stats_final_temp[player_stats_final_temp >= 100] = 0
player_stats_final_temp = player_stats_final_temp.apply(lambda x : round(x,2))
player_stats_final = pd.concat([player_stats_final, player_stats_final_temp], axis = 1)
player_stats_final.to_csv('player_stats_final.csv', sep = '|', index = False)

## 1.7 Getting some more data related to player's history and psychique - Uses SELENIUM

In [0]:
from collections import namedtuple
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import warnings 
warnings.filterwarnings("ignore")

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
"(KHTML, like Gecko) Chrome/15.0.87")

driver = webdriver.PhantomJS(desired_capabilities=dcap)

season_overview_stats = namedtuple('season_overview_stats',['Appearances', 'Goals', 'Losses', 'Wins'])
player_basic_details = namedtuple('player_basic_details',['Player_Id', 'DP', 'Country', 'DOB', 'Height'])

def get_basic_player_details(URL):
  driver.get(URL)
  bpd = driver.find_element_by_xpath('//*[@id="mainContent"]/section/div[2]/div[1]')
  time.sleep(1)
  p_id = re.findall(r'\d+', URL)[0]
  dp = BeautifulSoup(bpd.get_attribute('innerHTML')).findAll('img')[0]['src']
  pl = driver.find_element_by_class_name('personalLists')
  try:
    c, d, h, w = [i.text.replace('\n', '').strip() for i in BeautifulSoup(pl.get_attribute('innerHTML')).findAll("div", {'class': 'info'})]
  except:
    c, d, h = [i.text.replace('\n', '').strip() for i in BeautifulSoup(pl.get_attribute('innerHTML')).findAll("div", {'class': 'info'})]
  return player_basic_details(p_id, dp, c, d, h)
  
#get_basic_player_details('https://www.premierleague.com/players/13279/player/overview')

def season_wise_overview(URL):
  driver = webdriver.PhantomJS(desired_capabilities=dcap)
  driver.get(URL)
  pg = driver.find_element_by_xpath('//*[@id="mainContent"]/div[3]/div/div/div[3]/table/tbody')
  team_details = BeautifulSoup(pg.get_attribute('innerHTML'))
  season = [i.text for i in team_details.findAll('td', {'class': 'season'})]
  season_teams = {s:a for s, a in zip(season, [i.text.strip('\n').strip() for i in team_details.findAll('span', {'class': 'long'})])}
  p_id = re.findall(r'\d+', URL)[0]
  return season_teams
#season_wise_overview('https://www.premierleague.com/players/13279/player/overview')

season_overview_stats = namedtuple('season_overview_stats',['Appearances', 'Goals', 'Losses', 'Wins'])
"""---------------------------------Unused-------------------------------------"""
def player_season_stats(URL):
  driver.get(URL)
  time.sleep(2)
  ss= driver.find_element_by_xpath('//*[@id="mainContent"]/div[3]/div/div/div[2]/div/div/div')
  return season_overview_stats(*(i.text.replace('\n','').split()[1] for i in BeautifulSoup(ss.get_attribute('innerHTML')).findAll('span', {'class':'stat'})))

#player_season_stats('https://www.premierleague.com/players/13279/player/stats?co=1&se=274')


def full_player_stats(URL):
  driver.get(URL)
  time.sleep(2)
  ss= driver.find_element_by_xpath('//*[@id="mainContent"]/div[3]/div/div/div[2]/div/div/ul')
  return {re.split(r'\s{2,}',i.text.replace('\n','').strip())[0]:re.split(r'\s{2,}',i.text.replace('\n','').strip())[1] for i in BeautifulSoup(ss.get_attribute('innerHTML')).findAll('div', {'class':'normalStat'})}
#full_player_stats('https://www.premierleague.com/players/13279/player/stats?co=1&se=274')
"""---------------------------------Unused-------------------------------------"""

In [0]:
def create_player_stats_data(player_ids):
  basic_player_data = []
  season_wise_overview_data = dict()
  for player_id in tqdm(player_ids):
    try:
      basic_player_data.append(get_basic_player_details('https://www.premierleague.com/players/{0}/player/overview'.format(player_id)))
      season_wise_overview_data[player_id] = season_wise_overview('https://www.premierleague.com/players/{0}/player/overview'.format(player_id))
    except:
      pass
  return basic_player_data, season_wise_overview_data

basic_player_data, season_wise_overview_data = create_player_stats_data(set(player_details.Player_Id))

100%|██████████| 659/659 [1:11:41<00:00,  6.53s/it]


In [0]:
#------------------------- Some data cleaning in the basic data ------------------------------------
basic_player_data_df = pd.DataFrame(basic_player_data)
basic_player_data_df.Player_Id = basic_player_data_df.Player_Id.apply(int)

In [0]:
#------------------------- Merging columns to make it unnormalized for data loading into data studio ------------------------------------
season_wise_overview_data_df = pd.DataFrame.from_dict(season_wise_overview_data, orient = 'index')
season_wise_overview_data_df.index.rename('Player_Id', inplace = True)
season_wise_overview_data_df = season_wise_overview_data_df[sorted(season_wise_overview_data_df.columns)[-5:]].reset_index()
season_wise_overview_data_df = pd.melt(season_wise_overview_data_df, id_vars=['Player_Id'])
season_wise_overview_data_df.variable = season_wise_overview_data_df.variable.apply(lambda x : x.replace('/', '-'))

In [0]:
#------------------------- A few more added details and exporting the unnormalized data ------------------------------------
player_stats_final = pd.read_csv('player_stats_final.csv', sep = ',')
unormalized_data = player_stats_final.merge(basic_player_data_df, how = 'left', on = 'Player_Id')
unormalized_data = unormalized_data.merge(player_details[['Player_Id', 'Name', 'PositionInfo', 'ShirtNum']], how = 'left', on = 'Player_Id').drop_duplicates()
unormalized_data = unormalized_data.merge(season_wise_overview_data_df, how = 'left', left_on = ['Player_Id', 'Season_Id'], right_on = ['Player_Id', 'variable'])
unormalized_data.to_csv('data.csv', sep = ',', index = False)