In [1]:
import csv
import os
import sys
import time
from typing import List, Tuple, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

# make the project's local scraping helpers importable
sys.path.append("../src/")
import player_data_scraper as pds

%matplotlib inline

In [2]:
def mls_fantasy_login(login_id: str,
                     password: str, 
                     mls_fantasy_url: str = "https://fantasy.mlssoccer.com/#") -> Chrome:
    '''
    Log into the MLS fantasy site and navigate to the Stats Center.

    Parameters
    ----------
    login_id : str
        Value typed into the 'username' field of the login form.
    password : str
        Value typed into the 'password' field of the login form.
    mls_fantasy_url : str
        Landing page that carries the 'LOG IN' link.

    Returns
    -------
    Chrome
        The logged-in webdriver; pass it to the scraping functions and to
        logout() when done.

    Raises
    ------
    Exception
        Any Selenium error raised while logging in is re-raised after the
        browser has been closed, so a failed login does not leave an orphaned
        Chrome window behind.
    '''

    # open a chrome browser and go to the mls fantasy landing page
    driver = Chrome()
    try:
        driver.get(mls_fantasy_url)
        driver.find_element(By.LINK_TEXT, 'LOG IN').click()

        # fixed pauses before each field is looked up (presumably to let the
        # login form finish rendering)
        time.sleep(3)
        username = driver.find_element(By.NAME, 'username')
        username.clear()
        username.send_keys(login_id)

        time.sleep(3)
        passcode = driver.find_element(By.NAME, 'password')
        passcode.clear()
        passcode.send_keys(password)

        driver.find_element(By.CLASS_NAME, 'gigya-input-submit').click()
        time.sleep(2)
        driver.find_element(By.LINK_TEXT, 'STATS CENTER').click()
    except Exception:
        # the caller never receives the driver on failure, so close it here
        driver.quit()
        raise

    return driver

In [3]:
def logout(web_driver: Chrome) -> None:
    '''
    Log out of the fantasy page when done with work.

    Hovers over the 'my-account' menu, then hovers over and clicks the
    power-off icon. The browser itself is left open.

    Parameters
    ----------
    web_driver : Chrome
        The browser that was launched with the mls fantasy page.
    '''

    account_menu = web_driver.find_element(By.CLASS_NAME, 'my-account')
    ActionChains(web_driver).move_to_element(account_menu).perform()
    # pause before looking up the icon (presumably so the hover menu can appear)
    time.sleep(2)

    power_icon = web_driver.find_element(By.CSS_SELECTOR, '.fa.fa-power-off')
    # a fresh chain that is actually performed: the queued move was previously
    # never executed because .perform() was missing
    ActionChains(web_driver).move_to_element(power_icon).perform()
    time.sleep(2)

    power_icon.click()

In [4]:
# urls, credentials, team names
mls_fantasy_url = "https://fantasy.mlssoccer.com/#"

# Credentials are read from the environment so they are never stored in the notebook.
# NOTE(review): a real password used to be committed here - it should be rotated.
login_id = os.environ.get('MLS_FANTASY_LOGIN_ID', '')
pwd = os.environ.get('MLS_FANTASY_PASSWORD', '')
if not (login_id and pwd):
    print('Set MLS_FANTASY_LOGIN_ID and MLS_FANTASY_PASSWORD before calling mls_fantasy_login()')

# team names as passed to the squad filter's select_by_visible_text(), keyed 1..26
teams = {1: 'Atlanta United FC', 
 2: 'Chicago Fire FC', 
 3: 'FC Cincinnati', 
 4: 'Columbus Crew SC', 
 5: 'D.C. United', 
 6: 'Inter Miami CF', 
 7: 'Montreal Impact', 
 8: 'New England Revolution', 
 9: 'New York City FC', 
 10: 'New York Red Bulls', 
 11: 'Orlando City SC', 
 12: 'Philadelphia Union', 
 13: 'Toronto FC', 
 14: 'Colorado Rapids', 
 15: 'FC Dallas', 
 16: 'Houston Dynamo', 
 17: 'LA Galaxy', 
 18: 'Los Angeles FC', 
 19: 'Minnesota United FC', 
 20: 'Nashville SC', 
 21: 'Portland Timbers', 
 22: 'Real Salt Lake', 
 23: 'San Jose Earthquakes', 
 24: 'Seattle Sounders FC', 
 25: 'Sporting Kansas City', 
 26: 'Vancouver Whitecaps FC'}

In [6]:
# log in once; the returned browser session is reused by the scraping cells below
driver = mls_fantasy_login(login_id, pwd)

In [None]:
# NOTE(review): run this only after all scraping is finished - the functions below
# document that they need a driver that is still logged in
logout(driver)

In [7]:
def get_player_ids(web_driver: Chrome) -> List[List[str]]:
    '''Build a list of the MLS players shown in the fantasy stats center.

    For each team in the module-level `teams` dict, the team is chosen in the
    'js-filter-squads' dropdown and the resulting page source is parsed for
    player links.

    Parameters
    ----------
    web_driver : Chrome
        Must be logged into the MLS site, else the stat center can't be accessed.

    Returns
    -------
    List[List[str]]
        One [fantasy player id, player name, team name] entry per player link found.
    '''
    player_ids = []

    select_team = Select(web_driver.find_element(By.ID, 'js-filter-squads'))

    # walk the teams dict itself instead of a hard-coded 1..26 range, so the
    # loop stays correct if teams are added or removed
    for team_id in sorted(teams):
        team_name = teams[team_id]
        select_team.select_by_visible_text(team_name)

        soup = BeautifulSoup(web_driver.page_source, 'html.parser')

        for tag in soup.select('a.player-name.js-player-modal'):
            # the last six whitespace-separated tokens of the link text are dropped
            # NOTE(review): presumably non-name text inside the link - confirm against the page markup
            name = ' '.join(tag.text.strip('\n').split()[:-6])
            player_ids.append([tag['data-player_id'], name, team_name])

    return player_ids


In [8]:
# collect an [id, name, team] entry for every player listed in the stats center
mls_players = get_player_ids(driver)

In [9]:
# number of player entries collected across all teams
len(mls_players)

712

In [121]:
# base url of a player profile page; appending a player's id gives that player's page,
# which is the page the per-match stats are scraped from
page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

In [145]:
# open a single player's profile by appending the player's ID to page_link.
# Index 649 is M. Ibarra, whose rows are built by hand further down because his page is broken.
# The same pattern is used to cycle through all the player pages to amass weekly stats.
driver.get(page_link + mls_players[649][0])

In [146]:
# parse the page currently loaded in the browser (the player profile opened above)
# into a BeautifulSoup object so its content can be searched
html_players = driver.page_source
soup_players = BeautifulSoup(html_players, 'html.parser')

In [147]:
# collect the text of every 'div.row-table' element on the player page; the cells below
# index into table_text to pick out individual header, match and stat rows
table = soup_players.select('div.row-table')
table_text = [stats.text for stats in table]

In [148]:
# confirm which player sits at index 649
mls_players[649]

['110596', 'M. Ibarra', 'Seattle Sounders FC']

In [187]:
# hard-coded round 1 row for Miguel Ibarra since his web page is broken:
# [id, name, team] + cleaned match row (table_text[1]) + ['3'] + every other entry of the stat row (table_text[38])
# NOTE(review): '3' is presumably the PTS value that his page fails to provide - confirm
miguel_ibarra_1 = mls_players[649] + pds.clean_data(table_text[1]) + ['3'] + pds.clean_data(table_text[38])[0::2]

In [188]:
# hard-coded round 2 row for Miguel Ibarra since his web page is broken:
# [id, name, team] + cleaned match row (table_text[2]) + ['1'] + every other entry of the stat row (table_text[39])
# NOTE(review): '1' is presumably the PTS value that his page fails to provide - confirm
miguel_ibarra_2 = mls_players[649] + pds.clean_data(table_text[2]) + ['1'] + pds.clean_data(table_text[39])[0::2]

In [44]:
# column names = cleaned game-summary header row (table_text[0]) followed by the
# cleaned stat-abbreviation header row (table_text[37])
player_columns = pds.clean_data(table_text[0]) + pds.clean_data(table_text[37])

In [46]:
# add the HOME_AWAY column at position 1; the guard keeps this cell idempotent,
# so re-running it does not insert the column a second time
if 'HOME_AWAY' not in player_columns:
    player_columns.insert(1,'HOME_AWAY')

29

In [51]:
# prepend the player identity columns; guarded so that re-running this cell
# does not prepend them again
if player_columns[:3] != ['ID', 'NAME', 'TEAM']:
    player_columns = ['ID', 'NAME', 'TEAM'] + player_columns

In [52]:
# column count; each scraped row should have this many fields
len(player_columns)

32

In [53]:
# show the final list of column names
player_columns

['ID',
 'NAME',
 'TEAM',
 'RD',
 'HOME_AWAY',
 'OPPONENT',
 'PTS',
 'MIN',
 'GF',
 'A',
 'CS',
 'PS',
 'PE',
 'PM',
 'GA',
 'SV',
 'Y',
 'R',
 'OG',
 'T',
 'P',
 'KP',
 'CRS',
 'BC',
 'CL',
 'BLK',
 'INT',
 'BR',
 'ELG',
 'OGA',
 'SH',
 'WF']

In [None]:
# just the ID (first field) of every [id, name, team] entry
mls_player_ids = [player[0] for player in mls_players]

In [None]:
# spot-check the first id
mls_player_ids[0]

In [None]:
# spot-check the id, name and team of the first player entry
mls_players[0][0], mls_players[0][1], mls_players[0][2]

In [207]:
def get_all_player_stats(web_driver: Chrome, 
                         player_ids: List[List[str]], 
                         week_first: int, 
                         week_last: int) -> List[List[str]]:
    '''Scrape per-week stats for every given player from the MLS Fantasy League website.

    Parameters
    ----------
    web_driver : Chrome
        Must be a driver that is logged in to the site. This driver (not a
        notebook-level global) is the one used to load each player page.
    player_ids : List[List[str]]
        Full player entries: [ID, player's name, team] for each player.
    week_first, week_last : int
        Inclusive range of weeks to collect. Week i is read from row i of the
        page's 'div.row-table' elements.

    Returns
    -------
    List[List[str]]
        One row per player per week: id, name, team, the match information,
        then the per-category stats for that match.

    Raises
    ------
    IndexError
        If a player's page has fewer table rows than the requested weeks need.
        The message names the player so the caller can slice around that entry.
    '''
    page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/'

    # in table_text, index 1 is the first game of the season in terms of info on the game,
    # and index 38 (so index i + 37) is the associated per category stats for the match
    stats_row_offset = 37

    weeks = range(week_first, week_last + 1)
    # total rows the finished scrape will hold; used only for the progress message
    total_rows = len(player_ids) * len(weeks)

    player_stats = []

    # add each player's ID to the end of page_link in order to navigate to that player's page
    for player in player_ids:
        web_driver.get(page_link + player[0])
        # fixed pause before reading the page source (presumably so the page can render)
        time.sleep(5)

        # parse the player page and pull the text of every table row
        soup = BeautifulSoup(web_driver.page_source, 'html.parser')
        table_text = [row.text for row in soup.select('div.row-table')]

        for week in weeks:
            if len(player_stats) % 50 == 0:
                # percentage of all expected rows, whatever the number of weeks requested
                print(f"Scraped {round(100 * len(player_stats) / total_rows, 2)}% so far")

            try:
                match_row = table_text[week]
                stats_row = table_text[week + stats_row_offset]
            except IndexError as exc:
                raise IndexError(
                    f"player page for {player} has only {len(table_text)} table rows; "
                    f"cannot read week {week}"
                ) from exc

            # the stat row alternates between entries; only every other one, from index 0, is kept
            player_stats.append([player[0], player[1], player[2]] +
                                pds.update_negative_scores(pds.clean_data(match_row)) +
                                pds.clean_data(stats_row)[0::2])

    return player_stats

In [209]:
# scrape weeks 1-2 for every player in one go.
# NOTE: this run stopped with the IndexError shown below, which is why the cells that
# follow scrape the player list in slices instead
mls_player_stats = get_all_player_stats(driver, mls_players, 1, 2)

Scraped 0.0% so far
Scraped 3.51% so far
Scraped 7.02% so far
Scraped 10.53% so far
Scraped 14.04% so far
Scraped 17.56% so far
Scraped 21.07% so far
Scraped 24.58% so far


IndexError: list index out of range

In [11]:
# mls_player_stats_100 = get_all_player_stats(driver, mls_players[:100], 1, 2)

Scraped 0.0% so far
Scraped 50.0% so far
Scraped 100.0% so far
Scraped 150.0% so far


In [25]:
# mls_player_stats_300 = get_all_player_stats(driver, mls_players[100:300], 1, 2)

Scraped 0.0% so far
Scraped 12.5% so far
Scraped 25.0% so far
Scraped 37.5% so far
Scraped 50.0% so far
Scraped 62.5% so far
Scraped 75.0% so far
Scraped 87.5% so far


In [27]:
# with open('mls_player_stats_300.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_300)

In [59]:
# mls_player_stats_500 = get_all_player_stats(driver, mls_players[300:500], 1, 2)

Scraped 0.0% so far
Scraped 12.5% so far
Scraped 25.0% so far
Scraped 37.5% so far
Scraped 50.0% so far
Scraped 62.5% so far
Scraped 75.0% so far
Scraped 87.5% so far


In [60]:
# NOTE(review): mls_player_stats_500 is only assigned in the commented-out cell above,
# so this fails on a fresh kernel unless that cell is restored
len(mls_player_stats_500)

400

In [61]:
# with open('mls_player_stats_500.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_500)

In [65]:
# mls_player_stats_600 = get_all_player_stats(driver, mls_players[500:600], 1, 2)

Scraped 0.0% so far
Scraped 25.0% so far
Scraped 50.0% so far
Scraped 75.0% so far


In [66]:
# NOTE(review): mls_player_stats_600 is only assigned in the commented-out cell above,
# so this fails on a fresh kernel unless that cell is restored
len(mls_player_stats_600)

200

In [67]:
# with open('mls_player_stats_600.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_600)

In [83]:
# mls_player_stats_rest = get_all_player_stats(driver, mls_players[600:649] + mls_players[650:], 1, 2)

Scraped 0.0% so far
Scraped 22.52% so far
Scraped 45.05% so far
Scraped 67.57% so far
Scraped 90.09% so far


In [179]:
# adding in hard-coded miguel ibarra rows given the omitted data on his webpage
# mls_player_stats_rest.append(miguel_ibarra_1)
# mls_player_stats_rest.append(miguel_ibarra_2)

In [185]:
# NOTE(review): mls_player_stats_rest is only assigned in the commented-out cell above,
# so this fails on a fresh kernel unless that cell is restored
len(mls_player_stats_rest)

224

In [186]:
# with open('mls_player_stats_rest.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(mls_player_stats_rest)

In [193]:
# read the first batch back in to check that the rows line up with player_columns
# (names= supplies the headers; presumably the file was written without a header row - confirm)
pd.read_csv('mls_player_stats_100.csv', names=player_columns).head()

Unnamed: 0,ID,NAME,TEAM,RD,HOME_AWAY,OPPONENT,PTS,MIN,GF,A,...,CRS,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF
0,231969,E. Barco,Atlanta United FC,1,@,Nashville SC,7,90,1,0,...,0,0,0,0,0,3,0,0,2,3
1,231969,E. Barco,Atlanta United FC,2,vs,FC Cincinnati,13,90,1,1,...,0,0,0,0,0,5,0,0,4,5
2,122342,E. Hyndman,Atlanta United FC,1,@,Nashville SC,8,90,1,0,...,0,0,0,0,1,3,0,0,1,0
3,122342,E. Hyndman,Atlanta United FC,2,vs,FC Cincinnati,9,90,1,0,...,0,1,0,1,2,4,0,0,1,0
4,170797,P. Martinez,Atlanta United FC,1,@,Nashville SC,3,81,0,0,...,0,0,1,0,0,3,0,0,1,4


In [210]:
# stitch the separately scraped batches into one list of rows.
# NOTE(review): the batch lists are only assigned in commented-out cells above, so those
# cells must be restored (or the batches reloaded) before this runs on a fresh kernel
all_players = mls_player_stats_100 + mls_player_stats_300 + mls_player_stats_500 + mls_player_stats_600 + mls_player_stats_rest

In [219]:
# write every scraped row to a single CSV; no header row is written, the column
# names are supplied when the file is read back.
# utf-8 is set explicitly so the file does not depend on the platform's default
# encoding and matches what pd.read_csv expects by default.
with open('mls_player_stats_all.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(all_players)

In [214]:
# number of fields in the first row; should equal len(player_columns)
len(all_players[0])

32

In [220]:
# load the combined file; names= supplies the column headers because the rows
# were written without a header row
pd.read_csv('mls_player_stats_all.csv', names=player_columns)

Unnamed: 0,ID,NAME,TEAM,RD,HOME_AWAY,OPPONENT,PTS,MIN,GF,A,...,CRS,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF
0,231969,E. Barco,Atlanta United FC,1,@,Nashville SC,7,90,1,0,...,0,0,0,0,0,3,0,0,2,3
1,231969,E. Barco,Atlanta United FC,2,vs,FC Cincinnati,13,90,1,1,...,0,0,0,0,0,5,0,0,4,5
2,122342,E. Hyndman,Atlanta United FC,1,@,Nashville SC,8,90,1,0,...,0,0,0,0,1,3,0,0,1,0
3,122342,E. Hyndman,Atlanta United FC,2,vs,FC Cincinnati,9,90,1,0,...,0,1,0,1,2,4,0,0,1,0
4,170797,P. Martinez,Atlanta United FC,1,@,Nashville SC,3,81,0,0,...,0,0,1,0,0,3,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,437697,D. Pecile,Vancouver Whitecaps FC,2,@,LA Galaxy,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1420,450421,R. Veselinovic,Vancouver Whitecaps FC,1,vs,Sporting Kansas City,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1421,450421,R. Veselinovic,Vancouver Whitecaps FC,2,@,LA Galaxy,0,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1422,110596,M. Ibarra,Seattle Sounders FC,1,vs,Chicago Fire FC,3,73,0,0,...,0,0,0,0,1,2,0,0,6,0


In [None]:
# cleaned game-summary row for game 2
pds.clean_data(table_text[2])

In [None]:
# cleaned game-summary row for game 1, passed through pds.update_negative_scores
pds.update_negative_scores(pds.clean_data(table_text[1]))

In [None]:
# the entries at even positions ([0::2]) of a stat row are the stats for each category;
# this example is week 2 (row 39 = 2 + 37)
[int(stat) for stat in pds.clean_data(table_text[39])[0::2]]

In [None]:
# the entries at odd positions ([1::2]) of a stat row are the points awarded for each stat;
# this example is week 2
pds.clean_data(table_text[39])[1::2]

In [None]:
# header row of the game summary table; the opponent column is split
# based on home ('vs') or away ('@')
pds.clean_data(table_text[0])

In [None]:
# rows 1 through 34 of table_text are the per-game summary rows; print a few of them
# game 1
print(f"game 1: {pds.clean_data(table_text[1])}")
# game 2
print(f"game 2: {pds.clean_data(table_text[2])}")
# game 34
print(f"game 34: {pds.clean_data(table_text[34])}")

In [None]:
# row 35 - the summary row at the bottom of the page
print(f"row 35: {pds.clean_data(table_text[35])}"'\n')
# row 36 - the header row with the column names for the in-game stats
print(f"row 36: {pds.clean_data(table_text[36])}"'\n')
# row 37 - the header row with the column abbreviations for the in-game stats
print(f"row 37: {pds.clean_data(table_text[37])}"'\n')
# row 38 - the in-game stats for game 1
print(f"row 38: {pds.clean_data(table_text[38])}")

In [None]:
# week 2 match information followed by the week 2 category stats converted to int
pds.update_negative_scores(pds.clean_data(table_text[2])) + [int(stat) for stat in pds.clean_data(table_text[39])[0::2]]

In [None]:
# creates a list of the week 2 point values as ints: point[1:-1] drops the first and last
# character of each entry (presumably surrounding brackets - confirm), and entries made up
# only of '-' are skipped
[int(point[1:-1]) for point in pds.clean_data(table_text[39])[1::2] if point.strip('-')]

In [None]:
# sum of the week 2 point values, built the same way as the list in the previous cell
sum([int(point[1:-1]) for point in pds.clean_data(table_text[39])[1::2] if point.strip('-')])

In [None]:
# combine the first part of the table, which shows week number, home ('vs') or away ('@'), and the opponent,
# then total points, then the number of minutes followed by all the stats for the week (ex: week 1)
pds.update_negative_scores(pds.clean_data(table_text[1])) + [int(entry) for entry in pds.clean_data(table_text[38])[0::2]]

In [None]:
# combine the first part of the table, which shows week number, home ('vs') or away ('@'), and the opponent,
# then total points, then the number of minutes followed by all the stats for the week (ex: week 2)
pds.update_negative_scores(pds.clean_data(table_text[2])) + [int(entry) for entry in pds.clean_data(table_text[39])[0::2]]