In [2]:
import numpy as np
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains

from typing import List, Tuple, Dict
import sys
sys.path.append("../src/")
import player_data_scraper as pds
import csv

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def mls_fantasy_login(login_id: str,
                     password: str, 
                     mls_fantasy_url: str = "https://fantasy.mlssoccer.com/#") -> Chrome:
    '''
    Will log into the MLS fantasy page with login_id and password inputted from the user.
    Returns a Chrome webdriver which can be used to logout.
    '''
    
    # open a chrome browser and go to the mls fantasy landing page
    driver = Chrome()
    driver.get(mls_fantasy_url)
    driver.find_element_by_link_text('LOG IN').click()
    # send in login id and password and go into the browser
    time.sleep(3)
    username = driver.find_element_by_name('username')
    username.clear()
    username.send_keys(login_id)
    time.sleep(3)
    passcode = driver.find_element_by_name('password')
    passcode.clear()
    passcode.send_keys(password)
    
    driver.find_element_by_class_name('gigya-input-submit').click()
    time.sleep(2)
    driver.find_element_by_link_text('STATS CENTER').click()
    
    return driver

In [4]:
def logout(web_driver: Chrome) -> None:
    '''
    Will logout out of the fantasy page when done with work.
    web_driver in this case is the browser that is launched with the mls fantasy page
    '''
    
    action = ActionChains(web_driver)
    
    first_menu = web_driver.find_element_by_class_name('my-account')
    action.move_to_element(first_menu).perform()
    time.sleep(2)
    
    second_menu = web_driver.find_element_by_css_selector('.fa.fa-power-off')
    action.move_to_element(second_menu)
    time.sleep(2)
    
    second_menu.click()

In [5]:
# urls, passwords, team names
mls_fantasy_url = "https://fantasy.mlssoccer.com/#"
login_id = 'rsherer@gmail.com'
pwd = 'fakepassword'

teams = {1: 'Atlanta United FC', 
 2: 'Chicago Fire FC', 
 3: 'FC Cincinnati', 
 4: 'Columbus Crew SC', 
 5: 'D.C. United', 
 6: 'Inter Miami CF', 
 7: 'Montreal Impact', 
 8: 'New England Revolution', 
 9: 'New York City FC', 
 10: 'New York Red Bulls', 
 11: 'Orlando City SC', 
 12: 'Philadelphia Union', 
 13: 'Toronto FC', 
 14: 'Colorado Rapids', 
 15: 'FC Dallas', 
 16: 'Houston Dynamo', 
 17: 'LA Galaxy', 
 18: 'Los Angeles FC', 
 19: 'Minnesota United FC', 
 20: 'Nashville SC', 
 21: 'Portland Timbers', 
 22: 'Real Salt Lake', 
 23: 'San Jose Earthquakes', 
 24: 'Seattle Sounders FC', 
 25: 'Sporting Kansas City', 
 26: 'Vancouver Whitecaps FC'}

In [13]:
driver = mls_fantasy_login(login_id, pwd)

In [6]:
logout(driver)

In [6]:
def get_player_ids(web_driver: Chrome) -> List[List[str]]:
    '''Function to build a List of the players in MLS, with their MLS Soccer fantasy ID as the key, then
    a list of their name, and the team the play for as the value.
    
    The web_driver must be one that is logged into the MLS site, else the stat center can't be accessed.
    '''
    #player_ids = {}
    
    player_ids = []
    
    select_team = Select(web_driver.find_element_by_id('js-filter-squads'))
    #select_team.select_by_visible_text(teams[1])
    
    for team in range(1, 27):
        select_team.select_by_visible_text(teams[team])

        html = web_driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        for tag in soup.select('a.player-name.js-player-modal'):
            player_ids.append([tag['data-player_id'],
                              ' '.join(tag.text.strip('\n').split()[:-6]),
                              teams[team]])
#             player_ids[tag['data-player_id']] = \
#                 [' '.join(tag.text.strip('\n').split()[:-6]), teams[team]]

    return player_ids


In [27]:
with open('../data/mls_player_ids.csv') as f:
    player_ids = f.read().splitlines()

In [28]:
player_ids

['231969,E. Barco,Atlanta United FC',
 '122342,E. Hyndman,Atlanta United FC',
 '170797,P. Martinez,Atlanta United FC',
 '57769,F. Meza,Atlanta United FC',
 '193891,F. Escobar,Atlanta United FC',
 '41705,B. Guzan,Atlanta United FC',
 '194102,E. Remedi,Atlanta United FC',
 '158536,A. Walkes,Atlanta United FC',
 '201651,J. Mulraney,Atlanta United FC',
 '178386,B. Lennon,Atlanta United FC',
 '426797,G. Campbell,Atlanta United FC',
 '111686,J. Martinez,Atlanta United FC',
 '152253,A. Jahn,Atlanta United FC',
 '41551,J. Larentowicz,Atlanta United FC',
 '200735,M. Adams,Atlanta United FC',
 '426771,G. Bello,Atlanta United FC',
 '220142,A. Carleton,Atlanta United FC',
 '53477,E. Castillo,Atlanta United FC',
 '169644,M. Castro,Atlanta United FC',
 '227656,L. Fernando,Atlanta United FC',
 '234618,J. Gallagher,Atlanta United FC',
 '156919,A. Kann,Atlanta United FC',
 '249242,L. Kunga,Atlanta United FC',
 '205550,B. Moore,Atlanta United FC',
 '241500,M. Robinson,Atlanta United FC',
 '221678,M. Ros

In [29]:
len(player_ids)

712

In [31]:
player_ids[149]

'34160,R. Torres,Inter Miami CF'

In [18]:
mls_players = pd.read_csv('../data/mls_player_ids.csv', names=['id', 'player', 'team'])

In [21]:
mls_players.head()

Unnamed: 0,id,player,team
0,231969,E. Barco,Atlanta United FC
1,122342,E. Hyndman,Atlanta United FC
2,170797,P. Martinez,Atlanta United FC
3,57769,F. Meza,Atlanta United FC
4,193891,F. Escobar,Atlanta United FC


In [58]:
np.array(mls_players)[633]

array([98899, 'R. Ruidiaz', 'Seattle Sounders FC'], dtype=object)

In [1]:
len(mls_players)

NameError: name 'mls_players' is not defined

In [223]:
# with open('mls_player_ids.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_players)

In [35]:
# can use a generic page link, and concat each player's id to the end to pull up their page and scrape stats
page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

In [50]:
# taking the list of mls_players, and adding the ID to the end of the page_link string in order to navigate to
# that page. Can use this to cycle through all the player pages to amass weekly stats
driver.get(page_link + str(np.array(mls_players)[242][0]))

In [51]:
# on the specific player page, create a page object for beautifulsoup to parse for the content
html_players = driver.page_source
soup_players = BeautifulSoup(html_players, 'html.parser')

In [52]:
# using the beautiful soup object to get the specific player details. will use this over and over to scrape and
# store all the players data

#table = soup_players.select('div.row-table') # this provides game by game data
table = soup_players.select('div.profile-top-stats') # this provides top table stats
#table = soup_players.select('div.player-info-wrapper') # this provides meta data like salary, team, position
table_text = [stats.text for stats in table]

In [53]:
table_text, pds.clean_data(table_text[0])

(['\n\n\n\n\nGAMES PLAYED\n1\n\n\nAVG FANTASY PTS\n-2\n\n\nTOTAL FANTASY PTS\n-2\n\n\nLAST WK FANTASY PTS\nDNP\n\n\n\n\n3 WK AVG\n1\n\n\n5 WK AVG\n1.8\n\n\nHIGH SCORE\n-2\n\n\nLOW SCORE\n-2\n\n\n\n\nOWNED BY\n1\n\n\n$/POINT\n $-2,650K\n\n\nRD 2 RANK\n0\n\n\nSEASON RANK\n721\n\n\n\n\n'],
 ['GAMES PLAYED',
  '1',
  'AVG FANTASY PTS',
  '-',
  '2',
  'TOTAL FANTASY PTS',
  '-',
  '2',
  'LAST WK FANTASY PTS',
  'DNP',
  '3 WK AVG',
  '1',
  '5 WK AVG',
  '1.8',
  'HIGH SCORE',
  '-',
  '2',
  'LOW SCORE',
  '-',
  '2',
  'OWNED BY',
  '1',
  '$/POINT',
  ' $-2,650K',
  'RD 2 RANK',
  '0',
  'SEASON RANK',
  '721'])

In [54]:
pds.clean_data(table_text[0])

['GAMES PLAYED',
 '1',
 'AVG FANTASY PTS',
 '-',
 '2',
 'TOTAL FANTASY PTS',
 '-',
 '2',
 'LAST WK FANTASY PTS',
 'DNP',
 '3 WK AVG',
 '1',
 '5 WK AVG',
 '1.8',
 'HIGH SCORE',
 '-',
 '2',
 'LOW SCORE',
 '-',
 '2',
 'OWNED BY',
 '1',
 '$/POINT',
 ' $-2,650K',
 'RD 2 RANK',
 '0',
 'SEASON RANK',
 '721']

In [55]:
# 'owned by' refers to the percent of all fantasy owners with the player on their team

pds.clean_data(table_text[0])[16:18] 

['2', 'LOW SCORE']

In [56]:
len(pds.clean_data(table_text[0]))

28

In [187]:
# creating hard-coded specific row for Miguel Ibarra since his web page is broken
miguel_ibarra_1 = mls_players[649] + pds.clean_data(table_text[1]) + ['3'] + pds.clean_data(table_text[38])[0::2]

In [188]:
# creating hard_coded specific row for Miguel Ibarra since his web page is broken
miguel_ibarra_2 = mls_players[649] + pds.clean_data(table_text[2]) + ['1'] + pds.clean_data(table_text[39])[0::2]

In [44]:
player_columns = pds.clean_data(table_text[0]) + pds.clean_data(table_text[37])

In [46]:
player_columns.insert(1,'HOME_AWAY')

29

In [51]:
player_columns = ['ID', 'NAME', 'TEAM'] + player_columns

In [52]:
len(player_columns)

32

In [53]:
player_columns

['ID',
 'NAME',
 'TEAM',
 'RD',
 'HOME_AWAY',
 'OPPONENT',
 'PTS',
 'MIN',
 'GF',
 'A',
 'CS',
 'PS',
 'PE',
 'PM',
 'GA',
 'SV',
 'Y',
 'R',
 'OG',
 'T',
 'P',
 'KP',
 'CRS',
 'BC',
 'CL',
 'BLK',
 'INT',
 'BR',
 'ELG',
 'OGA',
 'SH',
 'WF']

In [None]:
mls_player_ids = [player[0] for player in mls_players]

In [None]:
mls_player_ids[0]

In [None]:
mls_players[0][0], mls_players[0][1], mls_players[0][2]

In [261]:
# todo - refactor function to take appropriate div class, based on scraping for weekly data, updated metadata,
# or updated top stats data

def get_all_player_stats(web_driver: Chrome, 
                         player_ids: List[List[str]], 
                         week_first: int, 
                         week_last: int) -> List[List[str]]:
    '''Function to go to the web and pull all players and stats for the players, by week, from the MLS
    Fantasy League website. 
    Must use a driver that is logged in to the site.
    Must use the full mls_player_ids which includes ID, player's name, and team for each player.
    
    For player data, use string 'div.row-table'.
    For player metadata, use string 'div.player-info-wrapper'.
    For player top stats, use string 'div.profile-top-stats'.
    '''
    player_stats = []
    
    page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

    # taking the list of mls_players, and adding the ID to the end of the page_link string in order to navigate to
    # that page. Can use this to cycle through all the player pages to amass weekly stats
    for player in player_ids:
        #print(page_link)
        #print(player[0])
        web_driver.get(page_link + player[0])
        time.sleep(5)
        
        # on the specific player page, create a page object for beautifulsoup to parse for the content
        html = web_driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        # using the beautiful soup object to get the specific player details. will use this over and over to scrape and
        # store all the players data
        table = soup.select('div.row-table')
        table_text = [stats.text for stats in table]
        
        # in table_text, index 1 is the first game of the season in terms of info on the game, and index 38
        # (so index i + 37) is the associated per category stats for the match
        # each row will have the player's id, player name, team, information regarding the specific match, and then
        # respective category totals for that match
        for week in range(week_first, week_last + 1):
            if len(player_stats) % 50 == 0:
                print(f"Scraped {round(50 * len(player_stats) / len(player_ids), 2)}% so far")
            player_stats.append([player[0]] +
                                [player[1]] +
                                [player[2]] +
                                pds.update_negative_scores(pds.clean_data(table_text[week])) + 
                                [stat for stat in pds.clean_data(table_text[week + 37])[0::2]])
            
    return player_stats

In [209]:
mls_player_stats = get_all_player_stats(driver, mls_players, 1, 2)

Scraped 0.0% so far
Scraped 3.51% so far
Scraped 7.02% so far
Scraped 10.53% so far
Scraped 14.04% so far
Scraped 17.56% so far
Scraped 21.07% so far
Scraped 24.58% so far


IndexError: list index out of range

In [11]:
# mls_player_stats_100 = get_all_player_stats(driver, mls_players[:100], 1, 2)

Scraped 0.0% so far
Scraped 50.0% so far
Scraped 100.0% so far
Scraped 150.0% so far


In [25]:
# mls_player_stats_300 = get_all_player_stats(driver, mls_players[100:300], 1, 2)

Scraped 0.0% so far
Scraped 12.5% so far
Scraped 25.0% so far
Scraped 37.5% so far
Scraped 50.0% so far
Scraped 62.5% so far
Scraped 75.0% so far
Scraped 87.5% so far


In [27]:
# with open('mls_player_stats_300.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_300)

In [59]:
# mls_player_stats_500 = get_all_player_stats(driver, mls_players[300:500], 1, 2)

Scraped 0.0% so far
Scraped 12.5% so far
Scraped 25.0% so far
Scraped 37.5% so far
Scraped 50.0% so far
Scraped 62.5% so far
Scraped 75.0% so far
Scraped 87.5% so far


In [60]:
len(mls_player_stats_500)

400

In [61]:
# with open('mls_player_stats_500.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_500)

In [65]:
# mls_player_stats_600 = get_all_player_stats(driver, mls_players[500:600], 1, 2)

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [66]:
len(mls_player_stats_600)

200

In [67]:
# with open('mls_player_stats_600.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_600)

In [83]:
# mls_player_stats_rest = get_all_player_stats(driver, mls_players[600:649] + mls_players[650:], 1, 2)

Scraped 0.0% so far
Scraped 22.52% so far
Scraped 45.05% so far
Scraped 67.57% so far
Scraped 90.09% so far


In [179]:
# adding in hard-coded miguel ibarra rows given the omitted data on his webpage
# mls_player_stats_rest.append(miguel_ibarra_1)
# mls_player_stats_rest.append(miguel_ibarra_2)

In [185]:
len(mls_player_stats_rest)

224

In [186]:
# with open('mls_player_stats_rest.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_rest)

In [22]:
metadata = pd.read_csv('../data/mls_player_metadata_all.csv', names=['id', 'name', 'team', 'position', 'salary'])

In [23]:
metadata.head()

Unnamed: 0,id,name,team,position,salary
0,231969,E. Barco,Atlanta United FC,M,10.5
1,122342,E. Hyndman,Atlanta United FC,M,9.5
2,170797,P. Martinez,Atlanta United FC,M,9.3
3,57769,F. Meza,Atlanta United FC,D,6.9
4,193891,F. Escobar,Atlanta United FC,D,6.9


In [25]:
stats_column = ['ID','NAME',
 'TEAM','RD', 'HOME_AWAY', 'OPPONENT', 'PTS', 'MIN', 'GF', 'A', 'CS', 'PS', 'PE', 'PM', 'GA', 'SV', 'Y', 'R', 
'OG', 'T', 'P', 'KP', 'CRS', 'BC', 'CL', 'BLK', 'INT', 'BR', 'ELG', 'OGA', 'SH', 'WF']

In [26]:
stats = pd.read_csv('../data/mls_player_stats_all.csv', names=stats_column)

In [27]:
stats.head()

Unnamed: 0,ID,NAME,TEAM,RD,HOME_AWAY,OPPONENT,PTS,MIN,GF,A,...,CRS,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF
0,231969,E. Barco,Atlanta United FC,1,@,Nashville SC,7,90,1,0,...,0,0,0,0,0,3,0,0,2,3
1,231969,E. Barco,Atlanta United FC,2,vs,FC Cincinnati,13,90,1,1,...,0,0,0,0,0,5,0,0,4,5
2,122342,E. Hyndman,Atlanta United FC,1,@,Nashville SC,8,90,1,0,...,0,0,0,0,1,3,0,0,1,0
3,122342,E. Hyndman,Atlanta United FC,2,vs,FC Cincinnati,9,90,1,0,...,0,1,0,1,2,4,0,0,1,0
4,170797,P. Martinez,Atlanta United FC,1,@,Nashville SC,3,81,0,0,...,0,0,1,0,0,3,0,0,1,4


In [287]:
#Now get all the player specific data, id, player name, team, position and current salary

def get_all_player_meta_data(web_driver: Chrome, 
                         player_ids: List[List[str]]) -> List[List[str]]:
    '''Function to go to the web and pull all players and stats for the players, by week, from the MLS
    Fantasy League website. 
    Must use a driver that is logged in to the site.
    Must use the full mls_player_ids which includes ID, player's name, and team for each player.
    
    For player data, use string 'div.row-table'.
    For player metadata, use string 'div.player-info-wrapper'.
    For player top stats, use string 'div.profile-top-stats'.'''
    
    player_data = []
    
    page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

    # taking the list of mls_players, and adding the ID to the end of the page_link string in order to navigate to
    # that page. Can use this to cycle through all the player pages to amass weekly stats
    for player in player_ids:
        #print(page_link)
        #print(player[0])
        web_driver.get(page_link + player[0])
        time.sleep(5)
        
        # on the specific player page, create a page object for beautifulsoup to parse for the content
        html = web_driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        # using the beautiful soup object to get the specific player details. will use this over and over to scrape and
        # store all the players data
        player_metadata = soup.select('div.player-info-wrapper')
        metadata_text = [meta.text for meta in player_metadata]
        # now we go through and clean up the meta data, and include it in the new table with each player
        if len(player_data) % 25 == 0:
            print(f"Scraped {100 * round(len(player_data) / len(player_ids), 2)}% so far")
        player_data.append([player[0]] +
                   [player[1]] +
                   [player[2]] +
                   [pds.get_player_position(pds.clean_data(metadata_text[0]))] +
                   [pds.get_player_salary(pds.clean_data(metadata_text[0]))])
    return player_data

In [280]:
mls_meta_150 = get_all_player_meta_data(driver, mls_players[:150])

Scraped 0.0% so far
Scraped 0.67% so far
Scraped 1.33% so far


In [309]:
len(mls_meta_150)

150

In [303]:
mls_meta_150[:5]

[['231969', 'E. Barco', 'Atlanta United FC', 'M', 10.5],
 ['122342', 'E. Hyndman', 'Atlanta United FC', 'M', 9.5],
 ['170797', 'P. Martinez', 'Atlanta United FC', 'M', 9.3],
 ['57769', 'F. Meza', 'Atlanta United FC', 'D', 6.9],
 ['193891', 'F. Escobar', 'Atlanta United FC', 'D', 6.9]]

In [296]:
mls_meta_300 = get_all_player_meta_data(driver, mls_players[150:300])

Scraped 0.0% so far
Scraped 17.0% so far
Scraped 33.0% so far
Scraped 50.0% so far
Scraped 67.0% so far
Scraped 83.0% so far


In [308]:
len(mls_meta_300)

150

In [304]:
mls_meta_450 = get_all_player_meta_data(driver, mls_players[300:450])

Scraped 0.0% so far
Scraped 17.0% so far
Scraped 33.0% so far
Scraped 50.0% so far
Scraped 67.0% so far
Scraped 83.0% so far


In [305]:
len(mls_meta_450), mls_meta_450[:5]

(150,
 [['223964', 'J. Glesnes', 'Philadelphia Union', 'D', 5.2],
  ['163906', 'A. Blake', 'Philadelphia Union', 'G', 6.9],
  ['110575', 'W. Creavalle', 'Philadelphia Union', 'M', 5.3],
  ['61770', 'A. Bedoya', 'Philadelphia Union', 'M', 6.0],
  ['477965', 'J. Martinez', 'Philadelphia Union', 'M', 5.3]])

In [306]:
mls_meta_600 = get_all_player_meta_data(driver, mls_players[450: 600])

Scraped 0.0% so far
Scraped 17.0% so far
Scraped 33.0% so far
Scraped 50.0% so far
Scraped 67.0% so far
Scraped 83.0% so far


In [307]:
len(mls_meta_600)

150

In [310]:
mls_meta_rest = get_all_player_meta_data(driver, mls_players[600:])

Scraped 0.0% so far
Scraped 22.0% so far
Scraped 45.0% so far
Scraped 67.0% so far
Scraped 89.0% so far


In [311]:
len(mls_meta_rest)

112

In [312]:
mls_meta_rest[:5]

[['427221', 'J. Vazquez', 'Real Salt Lake', 'F', 4.0],
 ['60862', 'O. Alanis', 'San Jose Earthquakes', 'D', 8.0],
 ['88940', 'Vako', 'San Jose Earthquakes', 'M', 8.8],
 ['170799', 'C. Espinoza', 'San Jose Earthquakes', 'M', 9.6],
 ['55111', 'A. Rios', 'San Jose Earthquakes', 'F', 7.5]]

In [313]:
mls_meta_all = mls_meta_150 + mls_meta_300 + mls_meta_450 + mls_meta_600 + mls_meta_rest

In [317]:
mls_meta_all[:5]

[['231969', 'E. Barco', 'Atlanta United FC', 'M', 10.5],
 ['122342', 'E. Hyndman', 'Atlanta United FC', 'M', 9.5],
 ['170797', 'P. Martinez', 'Atlanta United FC', 'M', 9.3],
 ['57769', 'F. Meza', 'Atlanta United FC', 'D', 6.9],
 ['193891', 'F. Escobar', 'Atlanta United FC', 'D', 6.9]]

In [318]:
with open('mls_player_metadata_all.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(mls_meta_all)

In [319]:
pd.read_csv('../data/mls_player_metadata_all.csv', names=['ID', 'name', 'team', 'position', 'salary'])

Unnamed: 0,ID,name,team,position,salary
0,231969,E. Barco,Atlanta United FC,M,10.5
1,122342,E. Hyndman,Atlanta United FC,M,9.5
2,170797,P. Martinez,Atlanta United FC,M,9.3
3,57769,F. Meza,Atlanta United FC,D,6.9
4,193891,F. Escobar,Atlanta United FC,D,6.9
...,...,...,...,...,...
707,107672,B. Meredith,Vancouver Whitecaps FC,G,4.5
708,421351,P. Metcalfe,Vancouver Whitecaps FC,M,4.5
709,424258,G. Mukumbilwa,Vancouver Whitecaps FC,D,4.0
710,437697,D. Pecile,Vancouver Whitecaps FC,M,4.0


In [193]:
pd.read_csv('mls_player_stats_100.csv', names=player_columns).head()

Unnamed: 0,ID,NAME,TEAM,RD,HOME_AWAY,OPPONENT,PTS,MIN,GF,A,...,CRS,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF
0,231969,E. Barco,Atlanta United FC,1,@,Nashville SC,7,90,1,0,...,0,0,0,0,0,3,0,0,2,3
1,231969,E. Barco,Atlanta United FC,2,vs,FC Cincinnati,13,90,1,1,...,0,0,0,0,0,5,0,0,4,5
2,122342,E. Hyndman,Atlanta United FC,1,@,Nashville SC,8,90,1,0,...,0,0,0,0,1,3,0,0,1,0
3,122342,E. Hyndman,Atlanta United FC,2,vs,FC Cincinnati,9,90,1,0,...,0,1,0,1,2,4,0,0,1,0
4,170797,P. Martinez,Atlanta United FC,1,@,Nashville SC,3,81,0,0,...,0,0,1,0,0,3,0,0,1,4


In [210]:
all_players = mls_player_stats_100 + mls_player_stats_300 + mls_player_stats_500 + mls_player_stats_600 + mls_player_stats_rest

In [219]:
with open('mls_player_stats_all.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(all_players)

In [316]:
len(all_players[0])

32

In [220]:
pd.read_csv('mls_player_stats_all.csv', names=player_columns)

Unnamed: 0,ID,NAME,TEAM,RD,HOME_AWAY,OPPONENT,PTS,MIN,GF,A,...,CRS,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF
0,231969,E. Barco,Atlanta United FC,1,@,Nashville SC,7,90,1,0,...,0,0,0,0,0,3,0,0,2,3
1,231969,E. Barco,Atlanta United FC,2,vs,FC Cincinnati,13,90,1,1,...,0,0,0,0,0,5,0,0,4,5
2,122342,E. Hyndman,Atlanta United FC,1,@,Nashville SC,8,90,1,0,...,0,0,0,0,1,3,0,0,1,0
3,122342,E. Hyndman,Atlanta United FC,2,vs,FC Cincinnati,9,90,1,0,...,0,1,0,1,2,4,0,0,1,0
4,170797,P. Martinez,Atlanta United FC,1,@,Nashville SC,3,81,0,0,...,0,0,1,0,0,3,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,437697,D. Pecile,Vancouver Whitecaps FC,2,@,LA Galaxy,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1420,450421,R. Veselinovic,Vancouver Whitecaps FC,1,vs,Sporting Kansas City,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1421,450421,R. Veselinovic,Vancouver Whitecaps FC,2,@,LA Galaxy,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1422,110596,M. Ibarra,Seattle Sounders FC,1,vs,Chicago Fire FC,3,73,0,0,...,0,0,0,0,1,2,0,0,6,0


In [92]:
# to get player ids in a list format
with open('../data/mls_player_ids.csv', newline='') as f:
    reader = csv.reader(f)
    player_ids = list(reader)

In [101]:
len(player_ids)

712

In [102]:
player_ids[:5]

[['231969', 'E. Barco', 'Atlanta United FC'],
 ['122342', 'E. Hyndman', 'Atlanta United FC'],
 ['170797', 'P. Martinez', 'Atlanta United FC'],
 ['57769', 'F. Meza', 'Atlanta United FC'],
 ['193891', 'F. Escobar', 'Atlanta United FC']]

In [117]:
# todo - refactor function to take appropriate div class, based on scraping for weekly data, updated metadata,
# or updated top stats data

def get_all_player_top_stats(web_driver: Chrome, 
                         player_ids: List[List[str]]) -> List[List[str]]:
    '''Function to go to the web and pull all players and stats for the players, by week, from the MLS
    Fantasy League website. 
    Must use a driver that is logged in to the site.
    Must use the full mls_player_ids which includes ID, player's name, and team for each player.
    
    For player data, use string 'div.row-table'.
    For player metadata, use string 'div.player-info-wrapper'.
    For player top stats, use string 'div.profile-top-stats'.
    '''
    player_stats = []
    
    page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

    # taking the list of mls_players, and adding the ID to the end of the page_link string in order to navigate to
    # that page. Can use this to cycle through all the player pages to amass weekly stats
    for player in player_ids:
        #print(page_link)
        #print(player[0])
        web_driver.get(page_link + player[0])
        time.sleep(5)
        
        # on the specific player page, create a page object for beautifulsoup to parse for the content
        html = web_driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        # using the beautiful soup object to get the specific player details. will use this over and over to scrape and
        # store all the players data
        table = soup.select('div.profile-top-stats')
        table_text = [stats.text for stats in table]
        
        if len(player_stats) % 25 == 0:
            print(f"Scraped {round(100 * len(player_stats) / len(player_ids), 2)}% so far")
        player_stats.append([player[0]] +
                            [player[1]] +
                            [player[2]] +
                            pds.clean_data(table_text[0]))

    return player_stats

In [104]:
get_all_player_top_stats(driver, player_ids[:5])

Scraped 0.0% so far


[['231969',
  'E. Barco',
  'Atlanta United FC',
  'GAMES PLAYED',
  '2',
  'AVG FANTASY PTS',
  '10',
  'TOTAL FANTASY PTS',
  '20',
  'LAST WK FANTASY PTS',
  '13',
  '3 WK AVG',
  '9',
  '5 WK AVG',
  '7',
  'HIGH SCORE',
  '13',
  'LOW SCORE',
  '7',
  'OWNED BY',
  '23.61',
  '$/POINT',
  ' $525K',
  'RD 2 RANK',
  '3',
  'SEASON RANK',
  '5'],
 ['122342',
  'E. Hyndman',
  'Atlanta United FC',
  'GAMES PLAYED',
  '2',
  'AVG FANTASY PTS',
  '8.5',
  'TOTAL FANTASY PTS',
  '17',
  'LAST WK FANTASY PTS',
  '9',
  '3 WK AVG',
  '6.33',
  '5 WK AVG',
  '5.4',
  'HIGH SCORE',
  '9',
  'LOW SCORE',
  '8',
  'OWNED BY',
  '3.96',
  '$/POINT',
  ' $559K',
  'RD 2 RANK',
  '21',
  'SEASON RANK',
  '17'],
 ['170797',
  'P. Martinez',
  'Atlanta United FC',
  'GAMES PLAYED',
  '2',
  'AVG FANTASY PTS',
  '7',
  'TOTAL FANTASY PTS',
  '14',
  'LAST WK FANTASY PTS',
  '11',
  '3 WK AVG',
  '5.67',
  '5 WK AVG',
  '7',
  'HIGH SCORE',
  '11',
  'LOW SCORE',
  '3',
  'OWNED BY',
  '20.57',
  '$

In [107]:
top_stats_100 = get_all_player_top_stats(driver, player_ids[:100])

Scraped 0.0% so far
Scraped 12.5% so far
Scraped 25.0% so far
Scraped 37.5% so far


In [112]:
top_stats_100[0]

['231969',
 'E. Barco',
 'Atlanta United FC',
 'GAMES PLAYED',
 '2',
 'AVG FANTASY PTS',
 '10',
 'TOTAL FANTASY PTS',
 '20',
 'LAST WK FANTASY PTS',
 '13',
 '3 WK AVG',
 '9',
 '5 WK AVG',
 '7',
 'HIGH SCORE',
 '13',
 'LOW SCORE',
 '7',
 'OWNED BY',
 '23.61',
 '$/POINT',
 ' $525K',
 'RD 2 RANK',
 '3',
 'SEASON RANK',
 '5']

In [113]:
with open('../data/top_stats/mls_player_top_stats_100.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(top_stats_100)

In [116]:
top_stats_200 = get_all_player_top_stats(driver, player_ids[100:200])

Scraped 0.0% so far
Scraped 6.25% so far
Scraped 12.5% so far
Scraped 18.75% so far


In [118]:
top_stats_300 = get_all_player_top_stats(driver, player_ids[200:300])

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [120]:
top_stats_400 = get_all_player_top_stats(driver, player_ids[300:400])

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [124]:
top_stats_500 = get_all_player_top_stats(driver, player_ids[400:500])

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [125]:
top_stats_600 = get_all_player_top_stats(driver, player_ids[500:600])

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [127]:
top_stats_rest = get_all_player_top_stats(driver, player_ids[600:])

Scraped 0.0% so far
Scraped 22.32% so far
Scraped 44.64% so far
Scraped 66.96% so far
Scraped 89.29% so far


In [128]:
top_stats_all = top_stats_100 + top_stats_200 + top_stats_300 + top_stats_400 + top_stats_500 + top_stats_600 + top_stats_rest

In [129]:
with open('../data/top_stats/mls_player_top_stats_all.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(top_stats_all)

In [130]:
len(top_stats_all)

712

In [137]:
top_stats_all[630]

['172483',
 'J. Morris',
 'Seattle Sounders FC',
 'GAMES PLAYED',
 '2',
 'AVG FANTASY PTS',
 '7',
 'TOTAL FANTASY PTS',
 '14',
 'LAST WK FANTASY PTS',
 '2',
 '3 WK AVG',
 '5.33',
 '5 WK AVG',
 '5.2',
 'HIGH SCORE',
 '12',
 'LOW SCORE',
 '2',
 'OWNED BY',
 '20.14',
 '$/POINT',
 ' $729K',
 'RD 2 RANK',
 '216',
 'SEASON RANK',
 '32']

In [None]:
pds.clean_data(table_text[2])

In [None]:
pds.update_negative_scores(pds.clean_data(table_text[1]))

In [None]:
# this will provide the stats for each category for each week, this example is week2
[int(stat) for stat in pds.clean_data(table_text[39])[0::2]]

In [None]:
# this provides the points total based on the stats for each week, this example is week2
pds.clean_data(table_text[39])[1::2]

In [None]:
# game summary headline columns, opponent column split based on home ('vs') or away ('@')
pds.clean_data(table_text[0])

In [None]:
# game 1
print(f"game 1: {pds.clean_data(table_text[1])}")
# game 2
print(f"game 2: {pds.clean_data(table_text[2])}")
# game 34
print(f"game 34: {pds.clean_data(table_text[34])}")

In [None]:
# row 35 - summary row at the bottom of the page
print(f"row 35: {pds.clean_data(table_text[35])}"'\n')
# row 36 this is the header row for columns for the in-game stats
print(f"row 36: {pds.clean_data(table_text[36])}"'\n')
# row 37 this is the abbreviation header row for columns for the in-game stats
print(f"row 37: {pds.clean_data(table_text[37])}"'\n')
# row 38, this is in-game stats for game 1
print(f"row 38: {pds.clean_data(table_text[38])}")

In [None]:
pds.update_negative_scores(pds.clean_data(table_text[2])) + [int(stat) for stat in pds.clean_data(table_text[39])[0::2]]

In [None]:
# creates a list of points based on the row 
[int(point[1:-1]) for point in pds.clean_data(table_text[39])[1::2] if point.strip('-')]

In [None]:
# this is the sum of the points from the list above
sum([int(point[1:-1]) for point in pds.clean_data(table_text[39])[1::2] if point.strip('-')])

In [None]:
# combining the first part of the table, that shows week number, home ('vs') or away ('@'), and the opponent
# next is total points, then the number of minutes followed by all the stats for the week (ex: week 1)
pds.update_negative_scores(pds.clean_data(table_text[1])) + [int(entry) for entry in pds.clean_data(table_text[38])[0::2]]

In [None]:
# combining the first part of the table, that shows week number, home ('vs') or away ('@'), and the opponent
# next is total points, then the number of minutes followed by all the stats for the week (ex: week 2)
pds.update_negative_scores(pds.clean_data(table_text[2])) + [int(entry) for entry in pds.clean_data(table_text[39])[0::2]]