# Mine Player Statistics

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import json
import pprint
import time

import pandas as pd

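# work around SSL certificate errors (sometimes needed on macOS)
# NOTE: this disables HTTPS certificate verification for every urlopen call in this process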
import ssl
ssl._create_default_https_context = ssl._create_unverified_context



In [2]:
# Load the contracts table from a local CSV; its 'playerLink' column supplies
# the player pages that are scraped further down.
df_contracts = pd.read_csv('contracts_tilJuly2024.csv')

In [3]:
# Site root; player links from the contracts table are appended to it.
CAP_FRIENDLY_BASE_URL = 'https://www.capfriendly.com'

def get_soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in ``'html.parser'`` backend.

    Raises
    ------
    Whatever ``urlopen`` raises for an unreachable or failing URL, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Close the HTTP response deterministically; this function is called once
    # per player, so an unclosed response would leak a connection every time.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, 'html.parser')

In [4]:
def scrape_player_stats(player_link):
    """Download one player's page and collect the rows of its career-stats table.

    Parameters
    ----------
    player_link : str
        Site-relative link to the player page; it is appended to
        ``CAP_FRIENDLY_BASE_URL``.

    Returns
    -------
    list of list
        One list per ``<tr>`` in the body of the table with id ``career_stats``.
        Each list starts with ``player_link`` and the position text returned by
        ``get_position``, followed by the text of every ``<td>`` in that row
        whose ``class`` attribute does not contain ``'hide'``.
    """
    page = get_soup(CAP_FRIENDLY_BASE_URL + player_link)
    position = get_position(page)

    # Chained lookup: if the table or its tbody is missing, the failure
    # propagates to the caller instead of being handled here.
    table_body = page.find('table', {'id': 'career_stats'}).find('tbody')

    collected = []
    for tr in table_body.find_all('tr'):
        visible_cells = [
            td.get_text()
            for td in tr.find_all('td')
            # a cell without a class attribute counts as visible
            if 'hide' not in (td.get('class') or [])
        ]
        collected.append([player_link, position] + visible_cells)
    return collected

def get_position(player_soup):
    """Return the text of the first ``<h6 class="c">`` element that names a position.

    An element qualifies when its lower-cased text contains any of 'centre',
    'wing', 'defense' or 'goaltender'. The text is returned unmodified (original
    casing and whitespace). Returns ``None`` when no element qualifies.
    """
    position_keywords = ('centre', 'wing', 'defense', 'goaltender')
    for heading in player_soup.find_all('h6', {'class': 'c'}):
        heading_text = heading.get_text()
        lowered = heading_text.lower()
        if any(keyword in lowered for keyword in position_keywords):
            return heading_text
    return None

def fill_in_missing_seasons(this_player_stats):
    """Forward-fill blank season values down a player's stat rows.

    The season is the element at index 2 of each row. A row whose season is the
    empty string receives the most recent season seen above it. The first row is
    never modified; it only seeds the running value. Rows are updated in place
    and the same list object is returned.
    """
    previous = None
    for record in this_player_stats:
        season = record[2]
        if previous is None or season != '':
            # first row, or a row carrying its own season: remember it
            previous = season
        else:
            # blank season: inherit from the closest row above
            record[2] = previous
    return this_player_stats

In [5]:
# Scrape every unique player link from the contracts table.
# `count` is both the offset into the link list at which to start and the
# running number of players processed, used for the progress report.
count = 0
all_player_stats = []

# Compute the unique links once; they do not change while the loop runs.
player_links = df_contracts['playerLink'].unique()
total_players = len(player_links)

for player_link in player_links[count:]:
    this_player_stats = []
    try:
        this_player_stats = scrape_player_stats(player_link)
        this_player_stats = fill_in_missing_seasons(this_player_stats)
    except Exception as e:
        # best-effort scrape: report the failing link and carry on with the next one
        print(f'could not scrape link: {player_link}', e)
    all_player_stats.extend(this_player_stats)

    count += 1
    if count % 100 == 0:
        percent_remaining = round((1 - count / total_players) * 100, 1)
        print(f'through {count} players; {percent_remaining}% remaining')
        

through 100 players; 97.5% remaining
through 200 players; 95.0% remaining
through 300 players; 92.5% remaining
through 400 players; 90.0% remaining
through 500 players; 87.5% remaining
through 600 players; 85.0% remaining
through 700 players; 82.5% remaining
through 800 players; 80.0% remaining
through 900 players; 77.5% remaining
through 1000 players; 75.0% remaining
through 1100 players; 72.5% remaining
through 1200 players; 70.0% remaining
through 1300 players; 67.5% remaining
through 1400 players; 64.9% remaining
through 1500 players; 62.4% remaining
through 1600 players; 59.9% remaining
through 1700 players; 57.4% remaining
through 1800 players; 54.9% remaining
through 1900 players; 52.4% remaining
through 2000 players; 49.9% remaining
through 2100 players; 47.4% remaining
through 2200 players; 44.9% remaining
through 2300 players; 42.4% remaining
through 2400 players; 39.9% remaining
through 2500 players; 37.4% remaining
through 2600 players; 34.9% remaining
through 2700 players;

In [6]:
# Turn the scraped rows into a labelled table. The labels are positional:
# the two leading values added per row (link, position) followed by the table
# cells in page order. The three `nullCol*` names label columns that are not
# statistics (in the output shown below they are blank, hold '|', or hold 'PLAYOFFS').
df = pd.DataFrame(all_player_stats).set_axis(
    [
        'playerLink', 'position', 'season', 'teamName', 'leagueName', 'nullCol1',
        'gamesPlayed', 'goals', 'assists', 'points', 'plusMinus', 'pim',
        'nullCol2', 'nullCol3',
        'gamesPlayedPlayoff', 'goalsPlayoff', 'assistsPlayoff', 'pointsPlayoff',
        'plusMinusPlayoff', 'pimPlayoff',
    ],
    axis=1,
)

# persist the result without the row index, then preview the first rows
df.to_csv('playerStats_tilJuly2024.csv', index=False)
df.head(10)

Unnamed: 0,playerLink,position,season,teamName,leagueName,nullCol1,gamesPlayed,goals,assists,points,plusMinus,pim,nullCol2,nullCol3,gamesPlayedPlayoff,goalsPlayoff,assistsPlayoff,pointsPlayoff,plusMinusPlayoff,pimPlayoff
0,/players/viktor-arvidsson,"Right Wing, Left Wing",2008-09,Sweden U16 (all),International-Jr,,3,2,2,4,-,0,|,,,,,,,
1,/players/viktor-arvidsson,"Right Wing, Left Wing",2008-09,Skellefteå AIK J18,J18 Nationell,,14,7,7,14,-,16,|,PLAYOFFS,5.0,1.0,2.0,3.0,,4.0
2,/players/viktor-arvidsson,"Right Wing, Left Wing",2009-10,Sweden U17 (all),International-Jr,,9,4,5,9,-,6,|,,,,,,,
3,/players/viktor-arvidsson,"Right Wing, Left Wing",2009-10,Sweden U17,WHC-17,,6,2,2,4,-,2,|,,,,,,,
4,/players/viktor-arvidsson,"Right Wing, Left Wing",2009-10,Skellefteå AIK J18,J18 Region,,22,32,30,62,-,34,|,,,,,,,
5,/players/viktor-arvidsson,"Right Wing, Left Wing",2009-10,Skellefteå AIK J20,J20 Nationell,,2,0,1,1,-,2,|,PLAYOFFS,2.0,1.0,0.0,1.0,,0.0
6,/players/viktor-arvidsson,"Right Wing, Left Wing",2009-10,Skellefteå AIK J18,J18 Nationell,,18,20,18,38,-,26,|,PLAYOFFS,3.0,1.0,1.0,2.0,,0.0
7,/players/viktor-arvidsson,"Right Wing, Left Wing",2010-11,Skellefteå AIK,SHL,,3,0,0,0,0,0,|,,,,,,,
8,/players/viktor-arvidsson,"Right Wing, Left Wing",2010-11,Sweden U18 (all),International-Jr,,19,9,6,15,2,18,|,,,,,,,
9,/players/viktor-arvidsson,"Right Wing, Left Wing",2010-11,Sweden U18,WJC-18,,5,0,1,1,-1,8,|,,,,,,,


------------------

In [8]:
# spot-check a single player's rows, ordered by season
df.loc[df['playerLink'].eq('/players/nathan-mackinnon')].sort_values('season')

Unnamed: 0,playerLink,position,season,teamName,leagueName,nullCol1,gamesPlayed,goals,assists,points,plusMinus,pim,nullCol2,nullCol3,gamesPlayedPlayoff,goalsPlayoff,assistsPlayoff,pointsPlayoff,plusMinusPlayoff,pimPlayoff
17807,/players/nathan-mackinnon,Centre,2009-10,Shattuck St. Marys Bantam T1,14U AAA,,58,54,47,101,-,56,|,,,,,,,
17808,/players/nathan-mackinnon,Centre,2010-11,Canada Atlantic U17,WHC-17,,5,5,3,8,-,0,|,,,,,,,
17809,/players/nathan-mackinnon,Centre,2010-11,Team Nova Scotia,CWG,,7,8,3,11,-,8,|,,,,,,,
17810,/players/nathan-mackinnon,Centre,2010-11,Shattuck St. Marys U16,16U AAA,,40,45,48,93,-,72,|,,,,,,,
17811,/players/nathan-mackinnon,Centre,2011-12,Halifax Mooseheads,QMJHL,,58,31,47,78,11,45,|,PLAYOFFS,17.0,13.0,15.0,28.0,12.0,12.0
17812,/players/nathan-mackinnon,Centre,2011-12,Canada Atlantic U17,WHC-17,,5,1,3,4,-,2,|,,,,,,,
17813,/players/nathan-mackinnon,Centre,2011-12,QMJHL All-Stars,Jr Super Series,,2,0,0,0,0,2,|,,,,,,,
17814,/players/nathan-mackinnon,Centre,2012-13,Halifax Mooseheads,QMJHL,,44,32,43,75,40,45,|,PLAYOFFS,17.0,11.0,22.0,33.0,17.0,12.0
17815,/players/nathan-mackinnon,Centre,2012-13,Canada U18,Hlinka Gretzky Cup,,5,5,6,11,8,14,|,,,,,,,
17816,/players/nathan-mackinnon,Centre,2012-13,Canada U20,WJC-20,,6,0,1,1,-1,4,|,,,,,,,
