# CapFriendly Miner
Using CapFriendly as our data source, we will mine as many player contracts/stats as possible. CapFriendly team contract data dates back to 2016. That gives us 5 full seasons of rosters to mine.

Additionally, CapFriendly tracks individual player contracts/stats since before 2000. This will give us ample data to experiment on.

In [300]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import json
import pprint

import pandas as pd

# Work around SSL certificate errors (sometimes needed on macOS).
# NOTE: this swaps the default HTTPS context factory for the *unverified* one,
# so HTTPS requests that rely on the default context skip certificate
# verification for the rest of the session. It relies on private `ssl`
# attributes (leading underscore).
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Scrape Team Links
First, we need to gather a list of teams and links to CapFriendly team pages.

In [39]:
# Download the 2022 archive page and convert it to a bs4 object.
url = 'https://www.capfriendly.com/archive/2022'
# The `with` block closes the HTTP response as soon as the body is read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

# find table that contains data
table_rows = soup.find('table', class_='sortablex tblcf tbl index').find('tbody').find_all('tr')

# Map each team name (text of the first link in a row) to that link's href.
team_and_link = {}
for row in table_rows:
    anchor = row.find('a')
    team_name = anchor.get_text()
    # Drop the last four characters of the href (presumably the season year)
    # so a season can be appended when the team pages are requested.
    team_link = anchor.get('href')[:-4]
    team_and_link[team_name] = team_link


In [41]:
# Sanity check: number of distinct team names scraped from the archive table.
len(team_and_link)

32

### Scrape Player Links

In [133]:
# Unique player-page links collected from every team/season page visited.
player_and_link = set()

for team_name in team_and_link:
    # Set the earliest season to search through. It is the (exclusive) stop
    # value of the range below, so the season itself is never requested.
    earliest_season = 2015
    if team_name == 'Seattle Kraken':
        earliest_season = 2021
    elif team_name == 'Vegas Golden Knights':
        earliest_season = 2017

    # A step of -3 visits every third season counting back from 2022.
    # NOTE(review): presumably a deliberate sampling to limit the number of
    # requests -- confirm that skipping the seasons in between is intended.
    for season in range(2022, earliest_season, -3):
        # get html, convert to bs4 object
        url = 'https://www.capfriendly.com' + team_and_link[team_name] + str(season)
        # The `with` block closes each HTTP response once its body is read.
        with urlopen(url) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, 'html.parser')

        # every table row on the page is a candidate
        table_rows = soup.find_all('tr')

        # Keep the href of the first link in each row that has one.
        for row in table_rows:
            anchor = row.find('a')
            if anchor is not None:
                player_link = anchor.get('href')
                player_and_link.add(player_link)

In [134]:
# Sanity check: number of unique links collected from the team pages.
len(player_and_link)

2656

### Scrape Player Contracts/Stats

In [306]:
def get_player_info(soup, player_link):
    """Extract a player's name, position and profile fields from a player page.

    Args:
        soup: Parsed player page, queried through ``find`` / ``find_all``.
        player_link: Link identifying the player; stored under ``'link'``.

    Returns:
        A dict with the keys ``link``, ``name``, ``position``, ``born``,
        ``birthplace``, ``nationality``, ``height``, ``weight``, ``handness``,
        ``draftYear``, ``draftOverall`` and ``draftRound``.

    Raises:
        AttributeError: If the heading or one of the containers is missing.
        IndexError: If fewer ``<h6>`` tags or labelled fields are found than
            the fixed positions below require.
    """
    # Title-case the <h1> text; when it contains a '#', keep only what comes
    # before the character that precedes the '#'.
    heading = soup.find('h1').get_text().title()
    hash_at = heading.find('#')
    name = heading if hash_at == -1 else heading[:hash_at - 1]

    # The first <h6> holds the position unless it is a captaincy label, in
    # which case the second <h6> is used instead.
    subheadings = soup.find_all('h6')
    position = subheadings[0].get_text()
    if position in ('Captain', 'Alternate Captain'):
        position = subheadings[1].get_text()

    # The "label: value" fields are spread over three containers. Read them
    # in this fixed order and keep the text after the first colon of each
    # <div> that has one.
    values = []
    for container_class in ('indx_b l', 'indx_b l rel', 'indx_b rel l'):
        container = soup.find('div', class_=container_class)
        for field in container.find_all('div'):
            _label, colon, value = field.get_text().partition(':')
            if colon:
                values.append(value.strip())

    # Fields are identified purely by their position in the collected list.
    return {
        'link': player_link,
        'name': name,
        'position': position,
        'born': values[0],
        'birthplace': values[1],
        'nationality': values[2],
        'height': values[3],
        'weight': values[4],
        'handness': values[5],
        'draftYear': values[11],
        'draftOverall': values[12],
        'draftRound': values[13],
    }

In [352]:
def get_contract_info(soup, player_link):
    """Extract every contract listed on a player page.

    Args:
        soup: Parsed player page, queried through ``find`` / ``find_all``.
        player_link: Link identifying the player; stored under ``'link'`` in
            every returned record.

    Returns:
        A list with one dict per contract block, in page order. Each dict has
        the keys ``link``, ``type``, ``length``, ``expiraryStatus``,
        ``signingTeam``, ``totalValue``, ``capHitPercentage``, ``signingDate``
        and ``hasMoveClause``. The list is empty when the page has no
        contract blocks.

    Raises:
        AttributeError: If a contract block has no ``<h6>`` heading.
        IndexError: If a contract block yields fewer than six detail fields.
    """
    def after_colon(node):
        # Text following the first ':' (the whole text when there is no
        # colon), with surrounding whitespace removed.
        text = node.get_text()
        return text[text.find(':') + 1:].strip()

    contracts = []
    for block in soup.find_all('div', class_='table_c contract_cont'):
        # heading of the block, e.g. entry-level or standard contract
        contract_type = block.find('h6').get_text()

        # Contract length, expiry status, cap hit percentage, ...: the fields
        # of the first detail group come first, then those of the second.
        details = [after_colon(node)
                   for node in block.find_all('div', class_='l cont_t mt4 mb2')]
        details += [after_colon(node)
                    for node in block.find_all('div', class_='l cont_t mb5')]

        # True when the block contains at least one clause marker (NMC/NTC).
        has_clause = bool(block.find_all('div', class_='clause cntrct'))

        # Detail fields are identified purely by their position.
        contracts.append({
            'link': player_link,
            'type': contract_type,
            'length': details[0],
            'expiraryStatus': details[1],
            'signingTeam': details[2],
            'totalValue': details[3],
            'capHitPercentage': details[4],
            'signingDate': details[5],
            'hasMoveClause': has_clause,
        })

    return contracts


In [324]:
def get_player_stats(soup, player_link):
    """Extract per-season career stats for a player.

    Pages whose first ``<h6>`` reads ``'Goaltender'`` are delegated to
    ``get_goalie_stats``; every other page is read with the skater layout.

    Args:
        soup: Parsed player page, queried through ``find`` / ``find_all``.
        player_link: Link identifying the player; stored under ``'link'`` in
            every returned record.

    Returns:
        A list with one dict per row of the ``career_stats`` table body,
        excluding the last row. Values are the raw cell texts.
    """
    # check if this player is a goalie
    if soup.find_all('h6')[0].get_text() == 'Goaltender':
        return get_goalie_stats(soup, player_link)

    # (output key, column index) pairs, in the order the keys are stored.
    columns = (
        ('team', 1),
        ('league', 2),
        # regular season
        ('gp', 4),
        ('g', 5),
        ('a', 6),
        ('p', 7),
        ('plusMinus', 8),
        ('pim', 9),
        # playoff stats
        ('playoff_gp', 12),
        ('playoff_g', 13),
        ('playoff_a', 14),
        ('playoff_p', 15),
        ('playoff_plusMinus', 16),
        ('playoff_pim', 17),
        # regular season advanced stats
        ('toi', 18),
        ('g_5v5', 19),
        ('ixG', 20),
        ('xG_diff_per60', 21),
        ('xG_diff_per60_rel', 22),
        ('corsi_diff_per60', 23),
        ('corsi_diff_per60_rel', 24),
        # playoff advanced stats
        ('playoff_toi', 26),
        ('playoff_g_5v5', 27),
        ('playoff_ixG', 28),
        ('playoff_xG_diff_per60', 29),
        ('playoff_xG_diff_per60_rel', 30),
        ('playoff_corsi_diff_per60', 31),
        ('playoff_corsi_diff_per60_rel', 32),
    )

    career_rows = soup.find('table', id='career_stats').find('tbody').find_all('tr')

    seasons = []
    # The final row is skipped (presumably a totals row -- confirm).
    for tr in career_rows[:-1]:
        cells = [td.get_text() for td in tr.find_all('td')]

        # A blank season cell means the row belongs to the same season as
        # the previous row, so the last seen label is reused.
        if cells[0] != '':
            season = cells[0]

        record = {'link': player_link, 'season': season}
        record.update((key, cells[index]) for key, index in columns)
        seasons.append(record)

    return seasons


def get_goalie_stats(soup, player_link):
    """Extract per-season career stats for a goaltender.

    Args:
        soup: Parsed player page, queried through ``find`` / ``find_all``.
        player_link: Link identifying the player; stored under ``'link'`` in
            every returned record.

    Returns:
        A list with one dict per row of the ``career_stats`` table body,
        excluding the last row. Values are the raw cell texts.
    """
    # Output key -> column index; insertion order is the stored key order.
    column_of = {
        'team': 1,
        'league': 2,
        # regular season
        'gp': 4,
        'gaa': 5,
        'svPct': 6,
        # playoff stats
        'playoff_gp': 9,
        'playoff_gaa': 10,
        'playoff_svPct': 11,
        # regular season advanced stats
        'ga_per60': 12,
        'xga_per60': 13,
        'gsa_per60': 14,
        # playoff advanced stats
        'playoff_ga_per60': 17,
        'playoff_xga_per60': 18,
        'playoff_gsa_per60': 19,
    }

    stats_table = soup.find('table', id='career_stats')
    body_rows = stats_table.find('tbody').find_all('tr')

    collected = []
    # The final row is skipped (presumably a totals row -- confirm).
    for body_row in body_rows[:-1]:
        texts = [cell.get_text() for cell in body_row.find_all('td')]

        # Rows with a blank season cell inherit the label of the row above.
        if texts[0] != '':
            season = texts[0]

        collected.append({
            'link': player_link,
            'season': season,
            **{key: texts[position] for key, position in column_of.items()},
        })

    return collected
    

In [360]:
# Accumulators for everything mined from the individual player pages.
player_info = []
contract_info = []
player_stats = []
# (player_link, error description) for every player that could not be mined,
# so the cause of a failure can be inspected after the run.
failed_players = []

count = 0
for player_link in player_and_link:
    url = 'https://www.capfriendly.com' + player_link

    try:
        # Fetch inside the try block so that a single bad request is reported
        # like any other failure instead of aborting the whole run. The
        # `with` block closes the HTTP response once its body is read.
        with urlopen(url) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, 'html.parser')

        # Parse everything before storing anything: a player that fails
        # part-way must not leave partial rows in the accumulators.
        info = get_player_info(soup, player_link)
        contracts = get_contract_info(soup, player_link)
        stats = get_player_stats(soup, player_link)
    except Exception as exc:
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit stop the loop.
        failed_players.append((player_link, repr(exc)))
        print('count: ' + str(count) + '; failed at ' + player_link)
        continue

    player_info.append(info)
    contract_info.extend(contracts)
    player_stats.extend(stats)
    count += 1

print('mined ' + str(count) + ' players')

count: 114; failed at /players/grant-hutton
mined 2655 players


In [361]:
# Partition the season records by layout: skater records carry a 'g' key,
# goalie records carry a 'gaa' key.
player_stats_skaters = []
player_stats_goalie = []
for season_record in player_stats:
    if 'g' in season_record:
        player_stats_skaters.append(season_record)
    if 'gaa' in season_record:
        player_stats_goalie.append(season_record)


In [362]:
# Show how many skater and goalie season records were collected, one per line.
print(len(player_stats_skaters), len(player_stats_goalie), sep='\n')

62550
7213


### Save Data to JSON files

In [363]:
# Write each collection to its own JSON file: player info, contract info,
# skater stats and goalie stats, in that order.
json_outputs = [
    ('../data/player_info.json', player_info),
    ('../data/contract_info.json', contract_info),
    ('../data/player_stats_skaters.json', player_stats_skaters),
    ('../data/player_stats_goalie.json', player_stats_goalie),
]
for json_path, records in json_outputs:
    with open(json_path, 'w') as f:
        json.dump(records, f)

In [367]:
# Load the scraped contract records into a DataFrame (one row per contract)
# and preview the first 10 rows.
df = pd.DataFrame(contract_info)
df.head(10)

Unnamed: 0,link,type,length,expiraryStatus,signingTeam,totalValue,capHitPercentage,signingDate,hasMoveClause
0,/players/francois-beauchemin,STANDARD CONTRACT,2 YEARS,RFA,Columbus Blue Jackets,"$1,000,000",1.28,"Aug. 1, 2005",False
1,/players/francois-beauchemin,STANDARD CONTRACT (EXTENSION),2 YEARS,UFA,Anaheim Ducks,"$3,300,000",3.75,"Aug. 16, 2006",False
2,/players/francois-beauchemin,STANDARD CONTRACT,3 YEARS,UFA,Toronto Maple Leafs,"$11,400,000",6.69,"Jul. 6, 2009",True
3,/players/francois-beauchemin,STANDARD CONTRACT (EXTENSION),3 YEARS,UFA,Anaheim Ducks,"$10,500,000",5.83,"Jan. 20, 2012",True
4,/players/francois-beauchemin,35+ CONTRACT,3 YEARS,UFA,Colorado Avalanche,"$13,500,000",6.3,"Jul. 1, 2015",True
5,/players/francois-beauchemin,35+ CONTRACT,1 YEAR,UFA,Anaheim Ducks,"$1,700,000",1.33,"Aug. 21, 2017",True
6,/players/dougie-hamilton,STANDARD CONTRACT,7 YEARS,UFA,New Jersey Devils,"$63,000,000",11.04,"Jul. 28, 2021",True
7,/players/dougie-hamilton,ENTRY-LEVEL CONTRACT,3 YEARS,RFA,Boston Bruins,"$4,575,000",1.44,"Dec. 8, 2011",False
8,/players/dougie-hamilton,STANDARD CONTRACT (EXTENSION),6 YEARS,UFA,Calgary Flames,"$34,500,000",8.05,"Jun. 30, 2015",True
9,/players/brennan-menell,STANDARD CONTRACT,1 YEAR,RFA,Toronto Maple Leafs,"$750,000",0.92,"Jul. 31, 2021",False
