# Mine Player Contracts Per Year

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import json
import pprint
import time

import pandas as pd

# fix ssl certificate (needed for MacOS sometimes)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context



In [2]:
# read in contracts
df_contracts = pd.read_csv('contracts_tilJuly2024.csv')

In [3]:
CAP_FRIENDLY_BASE_URL = 'https://www.capfriendly.com'

def get_soup(url):
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    return soup

In [22]:
def scrape_player_contracts(player_link):
    # get soup
    player_soup = get_soup(CAP_FRIENDLY_BASE_URL + player_link)

    # parse contract data
    contracts = player_soup.find_all('div', {'class': 'cf_playerContract__wrapper'})
    player_contract_per_year = []
    for contract_soup in contracts:
        gm_link = get_signing_gm(contract_soup)
        sign_date = get_signing_date(contract_soup)
        notes = get_signing_notes(contract_soup)

        this_contract = []
        for row in contract_soup.find('tbody').find_all('tr'):
            this_contract_year = [player_link, gm_link, sign_date, notes]
            for cell in row.find_all('td'):
                this_contract_year.append(cell.get_text())
            this_contract.append(this_contract_year)
        player_contract_per_year.extend(this_contract)
    return player_contract_per_year

def get_signing_gm(contract_soup):
    try:
        return contract_soup.find('div', {'class': 'mb4 cb'}).find('a').get('href')
    except:
        return 'does-not-exist'

def get_signing_date(contract_soup):
    try:
        for div in contract_soup.find('div', {'class': 'cf_playerContract__meta'}).find_all('div'):
            if div.find('span').get_text() == 'Signing Date':
                date_str = div.get_text()
                return date_str[len('Signing Date: '):]
    except:
        return 'does-not-exist'
    
def get_signing_notes(contract_soup):
    try:
        return contract_soup.find('div', {'class': 'cf_playerContract__notes'}).get_text()
    except:
        return ''

In [23]:
player_link = '/players/patrik-nemeth'
scrape_player_contracts(player_link)

[['/players/patrik-nemeth',
  '/staff/joe-nieuwendyk',
  'Apr. 21, 2011',
  'Qualifying Offer: $850,500',
  '2011-12',
  '',
  '$793,333',
  '$900,000',
  '–',
  '$90,000',
  'ENTRY-LEVEL SLIDE'],
 ['/players/patrik-nemeth',
  '/staff/joe-nieuwendyk',
  'Apr. 21, 2011',
  'Qualifying Offer: $850,500',
  '2012-13',
  '',
  '$763,333',
  '$870,000',
  '$210,000',
  '$90,000',
  '$600,000',
  '$690,000',
  '$67,500'],
 ['/players/patrik-nemeth',
  '/staff/joe-nieuwendyk',
  'Apr. 21, 2011',
  'Qualifying Offer: $850,500',
  '2013-14',
  '',
  '$763,333',
  '$870,000',
  '$110,000',
  '$90,000',
  '$700,000',
  '$790,000',
  '$67,500'],
 ['/players/patrik-nemeth',
  '/staff/joe-nieuwendyk',
  'Apr. 21, 2011',
  'Qualifying Offer: $850,500',
  '2014-15',
  '',
  '$763,333',
  '$870,000',
  '$0',
  '$0',
  '$810,000',
  '$810,000',
  '$67,500'],
 ['/players/patrik-nemeth',
  '/staff/jim-nill',
  'Jun. 17, 2015',
  'Qualifying Offer: $945,000',
  '2015-16',
  '',
  '$900,000',
  '$900,000',
 

In [35]:
player_link = '/players/patrik-nemeth'
player_soup = get_soup(CAP_FRIENDLY_BASE_URL + player_link)

URLError: <urlopen error [Errno 60] Operation timed out>

In [None]:
def get_buyout_data(player_soup, player_link):
    all_buyout_years = []
    for row in player_soup.find('tbody', {'id': 'cont_x'}).find_all('tr')[1:]:
        buyout_year = [player_link]
        for cell in row.find_all('td'):
            buyout_year.append(cell.get_text())
        all_buyout_years.append(buyout_year)
    return all_buyout_years

In [34]:
get_buyout_data(player_soup, player_link)

[['/players/brad-richards', '2014-15', '', '$3,055,556', '$0'],
 ['/players/brad-richards', '2015-16', '', '$3,055,556', '$0'],
 ['/players/brad-richards', '2016-17', '', '$5,055,556', '$0'],
 ['/players/brad-richards', '2017-18', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2018-19', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2019-20', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2020-21', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2021-22', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2022-23', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2023-24', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2024-25', '', '$1,055,556', '$0'],
 ['/players/brad-richards', '2025-26', '', '$1,055,556', '$0']]

In [8]:
count = 0
all_player_contracts = []
for player_link in df_contracts['playerLink'].unique()[count:]:
    this_player_contracts = []
    try:
        this_player_contracts = scrape_player_contracts(player_link)
    except Exception as e:
        print(f'could not scrape link: {player_link}', e)
    all_player_contracts.extend(this_player_contracts)
    
    count += 1
    if count % 100 == 0:
        percent_remaining = round((1 - count / len(df_contracts['playerLink'].unique())) * 100, 1)
        print(f'through {count} players; {percent_remaining}% remaining')
        

through 100 players; 97.5% remaining
through 200 players; 95.0% remaining
through 300 players; 92.5% remaining
through 400 players; 90.0% remaining
through 500 players; 87.5% remaining
through 600 players; 85.0% remaining
through 700 players; 82.5% remaining
through 800 players; 80.0% remaining
through 900 players; 77.5% remaining
through 1000 players; 75.0% remaining
through 1100 players; 72.5% remaining
through 1200 players; 70.0% remaining
through 1300 players; 67.5% remaining
through 1400 players; 64.9% remaining
through 1500 players; 62.4% remaining
through 1600 players; 59.9% remaining
through 1700 players; 57.4% remaining
through 1800 players; 54.9% remaining
through 1900 players; 52.4% remaining
through 2000 players; 49.9% remaining
through 2100 players; 47.4% remaining
through 2200 players; 44.9% remaining
through 2300 players; 42.4% remaining
through 2400 players; 39.9% remaining
through 2500 players; 37.4% remaining
through 2600 players; 34.9% remaining
through 2700 players;

In [9]:
df = pd.DataFrame(all_player_contracts)
df.columns = ['playerLink', 'generalManagerLink', 'signDate', 'season', 'clause', 'capHit', 'aav',
              'potentialBonuses', 'signingBonuses', 'baseSalary', 'totalSalary', 'minorsSalary']
df.to_csv('playerContractPerYear_tilJuly2024.csv', index=False)
df.head(10)

Unnamed: 0,playerLink,generalManagerLink,signDate,season,clause,capHit,aav,potentialBonuses,signingBonuses,baseSalary,totalSalary,minorsSalary
0,/players/viktor-arvidsson,/staff/jeff-jackson,"Jul. 1, 2024",2024-25,,"$4,000,000","$4,000,000",$0,$0,"$4,000,000","$4,000,000","$4,000,000"
1,/players/viktor-arvidsson,/staff/jeff-jackson,"Jul. 1, 2024",2025-26,,"$4,000,000","$4,000,000",$0,$0,"$4,000,000","$4,000,000","$4,000,000"
2,/players/viktor-arvidsson,/staff/david-poile,"Jul. 15, 2014",2014-15,,"$631,667","$650,000","$35,000","$65,000","$550,000","$615,000","$65,000"
3,/players/viktor-arvidsson,/staff/david-poile,"Jul. 15, 2014",2015-16,,"$631,667","$650,000","$10,000","$65,000","$575,000","$640,000","$65,000"
4,/players/viktor-arvidsson,/staff/david-poile,"Jul. 15, 2014",2016-17,,"$631,667","$650,000","$10,000","$65,000","$575,000","$640,000","$65,000"
5,/players/viktor-arvidsson,/staff/david-poile,"Jul. 22, 2017",2017-18,,"$4,250,000","$4,250,000",$0,$0,"$4,250,000","$4,250,000","$4,250,000"
6,/players/viktor-arvidsson,/staff/david-poile,"Jul. 22, 2017",2018-19,,"$4,250,000","$4,250,000",$0,$0,"$4,250,000","$4,250,000","$4,250,000"
7,/players/viktor-arvidsson,/staff/david-poile,"Jul. 22, 2017",2019-20,,"$4,250,000","$4,250,000",$0,$0,"$4,250,000","$4,250,000","$4,250,000"
8,/players/viktor-arvidsson,/staff/david-poile,"Jul. 22, 2017",2020-21,,"$4,250,000","$4,250,000",$0,$0,"$4,250,000","$4,250,000","$4,250,000"
9,/players/viktor-arvidsson,/staff/david-poile,"Jul. 22, 2017",2021-22,,"$4,250,000","$4,250,000",$0,$0,"$4,250,000","$4,250,000","$4,250,000"


------------------

In [10]:
df[df['playerLink']=='/players/mike-smith'].sort_values('season')

Unnamed: 0,playerLink,generalManagerLink,signDate,season,clause,capHit,aav,potentialBonuses,signingBonuses,baseSalary,totalSalary,minorsSalary
17126,/players/mike-smith,/staff/doug-armstrong,"Feb. 7, 2007",2007-08,,"$950,000","$950,000",$0,$0,"$950,000","$950,000","$950,000"
17127,/players/mike-smith,/staff/doug-armstrong,"Feb. 7, 2007",2008-09,,"$950,000","$950,000",$0,$0,"$950,000","$950,000","$950,000"
17128,/players/mike-smith,does-not-exist,"Jul. 14, 2008",2009-10,,"$2,200,000","$2,200,000",$0,$0,"$2,000,000","$2,000,000","$2,000,000"
17129,/players/mike-smith,does-not-exist,"Jul. 14, 2008",2010-11,,"$2,200,000","$2,200,000",$0,$0,"$2,400,000","$2,400,000","$2,400,000"
17130,/players/mike-smith,/staff/don-maloney,"Jul. 1, 2011",2011-12,,"$2,000,000","$2,000,000",$0,$0,"$2,000,000","$2,000,000","$2,000,000"
17131,/players/mike-smith,/staff/don-maloney,"Jul. 1, 2011",2012-13,,"$2,000,000","$2,000,000",$0,$0,"$2,000,000","$2,000,000","$2,000,000"
17132,/players/mike-smith,/staff/don-maloney,"Jun. 30, 2013",2013-14,NMC,"$5,666,667","$5,666,667",$0,$0,"$4,000,000","$4,000,000","$4,000,000"
17133,/players/mike-smith,/staff/don-maloney,"Jun. 30, 2013",2014-15,NMC,"$5,666,667","$5,666,667",$0,$0,"$6,000,000","$6,000,000","$6,000,000"
17134,/players/mike-smith,/staff/don-maloney,"Jun. 30, 2013",2015-16,NMC,"$5,666,667","$5,666,667",$0,$0,"$6,500,000","$6,500,000","$6,500,000"
17135,/players/mike-smith,/staff/don-maloney,"Jun. 30, 2013",2016-17,M-NTC,"$5,666,667","$5,666,667",$0,$0,"$6,500,000","$6,500,000","$6,500,000"
