In [1]:
# # Install a conda package in the current Jupyter kernel
# import sys
# !conda install --yes --prefix {sys.prefix} urllib3

In [1]:
import requests
import pprint
import os
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

### Parse list of NHL team links

In [82]:
# get list of links to NHL team pages
team_and_links = {}

try:
    url = 'https://www.eliteprospects.com/league/nhl'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    
    teams_tags = soup.find('div', {'class': 'leg-home-inner'}).find_all('a')
    for tag in teams_tags:
        team_name = tag.get_text().strip()
        team_link = tag.get('href')
        team_and_links[team_name] = team_link
        
except Exception as e:
    print('failed on page ' + url)
    print(e)

In [83]:
for team_name, team_link in team_and_links.items():
    print(team_link)

https://www.eliteprospects.com/team/1580/anaheim-ducks
https://www.eliteprospects.com/team/72/arizona-coyotes
https://www.eliteprospects.com/team/52/boston-bruins
https://www.eliteprospects.com/team/53/buffalo-sabres
https://www.eliteprospects.com/team/54/calgary-flames
https://www.eliteprospects.com/team/55/carolina-hurricanes
https://www.eliteprospects.com/team/56/chicago-blackhawks
https://www.eliteprospects.com/team/57/colorado-avalanche
https://www.eliteprospects.com/team/58/columbus-blue-jackets
https://www.eliteprospects.com/team/59/dallas-stars
https://www.eliteprospects.com/team/60/detroit-red-wings
https://www.eliteprospects.com/team/61/edmonton-oilers
https://www.eliteprospects.com/team/62/florida-panthers
https://www.eliteprospects.com/team/79/los-angeles-kings
https://www.eliteprospects.com/team/63/minnesota-wild
https://www.eliteprospects.com/team/64/montreal-canadiens
https://www.eliteprospects.com/team/65/nashville-predators
https://www.eliteprospects.com/team/66/new-je

### Parse staff listings of team for a few random years
* 2005-2006
* 2011-2012
* 2015-2016
* 2020-2021

In [84]:
# get list of links to NHL team pages
staff_and_links = {}
seasons = ['2005-2006', '2011-2012', '2015-2016', '2020-2021']


for team_name, team_link in team_and_links.items():
    for i in range(0, len(seasons)):
        try:
            url = team_link + '/' + seasons[i]
            page = urlopen(url)
            html = page.read().decode("utf-8")
            soup = BeautifulSoup(html, 'html.parser')

            staff_tags = soup.find('div', {'class': 'list-as-columns list-as-columns--with-border'}).find_all('a')
            for tag in staff_tags:
                staff_name = tag.get_text().strip()
                staff_link = tag.get('href')
                staff_and_links[staff_name] = staff_link
    
        except Exception as e:
            print('failed on page ' + url)
            print(e)

failed on page https://www.eliteprospects.com/team/27336/seattle-kraken/2005-2006
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/27336/seattle-kraken/2011-2012
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/27336/seattle-kraken/2015-2016
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/22211/vegas-golden-knights/2005-2006
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/22211/vegas-golden-knights/2011-2012
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/22211/vegas-golden-knights/2015-2016
'NoneType' object has no attribute 'find_all'
failed on page https://www.eliteprospects.com/team/9966/winnipeg-jets/2005-2006
'NoneType' object has no attribute 'find_all'


In [85]:
print(str(len(staff_and_links)) + ' staff members found since ' + str(seasons[0]))

2257 staff members found since 2005-2006


In [116]:
# save this list
with open('staff_and_links.json', 'w') as f:
    json.dump(staff_and_links, f)

### Parse position history of each staff member

In [117]:
# read file containing staff and links
with open('staff_and_links.json') as f:
    staff_and_links = json.load(f)
print(str(len(staff_and_links)) + ' staff members found in file')

# convert to list
## since eliteprospects.com has a request limit, we need to mine staff positions in chunks
## therefore, we should store this data as a list so that it is easy to pick up where we left off
staff_and_links_list = []
for name, link in staff_and_links.items():
    staff_and_links_list.append([name, link])
print('num of staff members: ' + str(len(staff_and_links_list)))
print('example of item in list: ' + str(staff_and_links_list[0]))

2257 staff members found in file
num of staff members: 2257
example of item in list: ['Randy Carlyle', 'https://www.eliteprospects.com/staff/184/randy-carlyle']


In [15]:
def parse_position_table(soup, active_staff_all_time_history, name):    
    # get staff member image link
    tag = soup.find('div', {'class': 'ep-entity-header__main-image'}).get('style')
    image = tag[tag.find('files.eliteprospects.com'):tag.find('\');')]

    # get the positions held table
    positions_table = soup.find('div', {'id': 'staff-stats'})

    # create a list of each position held
    positions_table_items = positions_table.find_all('tr')

    # get season, team, league, and role on team
    season = ''
    for p in positions_table_items[1:]:
        try:
            if p.find('td', {'class': 'season sorted'}).get_text().strip() != '':
                season = p.find('td', {'class': 'season sorted'}).get_text().strip()
            team = p.find('td', {'class': 'team'}).find('span').find('a').get_text().strip()
            league = p.find('td', {'class': 'league'}).find('a').get_text().strip()
            role = p.find('td', {'class': 'role'}).get_text().strip()
            notes = p.find('td', {'class': 'notes'}).get_text().strip()

            active_staff_all_time_history.append([name, season, team, league, role, notes, image])
        except Exception:
            print('error in ' + name)
        
    return 

In [96]:
# active_staff_all_time_history = []

# read file containing staff and positions
with open('active_staff_all_time_history.json') as f:
    active_staff_all_time_history = json.load(f)
    
print(len(active_staff_all_time_history))

36558


In [109]:
# calculate starting point
distinct_names = set()
for row in active_staff_all_time_history:
    distinct_names.add(row[0])
distinct_names_len = len(distinct_names)
print('start at: ' + str(distinct_names_len))

try:
#     for i in range(distinct_names_len, len(staff_and_links_list)):
    for i in range(distinct_names_len, distinct_names_len + 14):
        page = urlopen(staff_and_links_list[i][1])
        html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, 'html.parser')
        parse_position_table(soup, active_staff_all_time_history, staff_and_links_list[i][0])
    
except Exception as e:
    print('failed on page ' + str(staff_and_links_list[i][1]) + ' and count= ' + str(i))
    print(e)
    
# save this list
with open('active_staff_all_time_history.json', 'w') as f:
    json.dump(active_staff_all_time_history, f)

start at: 2059


IndexError: list index out of range

In [114]:
# save this list
with open('active_staff_all_time_history.json', 'w') as f:
    json.dump(active_staff_all_time_history, f)

In [110]:
# calculate number of distinct staff members
distinct_names = set()
for row in active_staff_all_time_history:
    distinct_names.add(row[0])
distinct_names_len = len(distinct_names)
print('distinct: ' + str(distinct_names_len))
print('total staff years: ' + str(len(active_staff_all_time_history)))

distinct: 2257
total staff years: 43439


### Convert to df

In [112]:
df = pd.DataFrame(active_staff_all_time_history)
df.columns = ['name', 'season', 'team', 'league', 'position', 'notes', 'image']
df

Unnamed: 0,name,season,team,league,position,notes,image
0,Randy Carlyle,1994-95,Winnipeg Jets,NHL,Dir. of Player Development,,files.eliteprospects.com/layout/staff/hc-carly...
1,Randy Carlyle,1995-96,Winnipeg Jets,NHL,Asst. Coach,,files.eliteprospects.com/layout/staff/hc-carly...
2,Randy Carlyle,1996-97,Manitoba Moose,IHL,Asst. Coach,,files.eliteprospects.com/layout/staff/hc-carly...
3,Randy Carlyle,1996-97,Manitoba Moose,IHL,GM/Head Coach,Replaced Jean Perron mid-season,files.eliteprospects.com/layout/staff/hc-carly...
4,Randy Carlyle,1997-98,Manitoba Moose,IHL,GM/Head Coach,,files.eliteprospects.com/layout/staff/hc-carly...
...,...,...,...,...,...,...,...
43434,Dan Shrader,2017-18,Winnipeg Jets,NHL,Scout,Amateur Scout,
43435,Dan Shrader,2018-19,Winnipeg Jets,NHL,Scout,Amateur Scout,
43436,Dan Shrader,2019-20,Winnipeg Jets,NHL,Scout,Amateur Scout,
43437,Dan Shrader,2020-21,Winnipeg Jets,NHL,Scout,Amateur Scout,


In [113]:
df[df['name']=='Chuck Fletcher']

Unnamed: 0,name,season,team,league,position,notes,image
183,Chuck Fletcher,1993-94,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
184,Chuck Fletcher,1994-95,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
185,Chuck Fletcher,1995-96,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
186,Chuck Fletcher,1996-97,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
187,Chuck Fletcher,1997-98,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
188,Chuck Fletcher,1998-99,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
189,Chuck Fletcher,1999-00,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
190,Chuck Fletcher,1999-00,Louisville Panthers,AHL,General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
191,Chuck Fletcher,2000-01,Florida Panthers,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...
192,Chuck Fletcher,2000-01,Louisville Panthers,AHL,General Manager,,files.eliteprospects.com/layout/staff/chuck_fl...


In [48]:
# pd.set_option('display.max_rows', 100)
df[df['name']=='Chuck Fletcher']['position'].value_counts()

General Manager                18
Asst. General Manager          15
Ex. VP of Hockey Operations     6
Dir. of Hockey Operations       3
Pres. of Hockey Operations      2
Senior Advisor                  1
Name: position, dtype: int64

In [49]:
df[(df['name'] != 'Chuck Fletcher') & (df['team'] == 'Minnesota Wild') & (df['season'] == '2013-14')]

Unnamed: 0,name,season,team,league,position,notes,image
516,Brent Flahr,2013-14,Minnesota Wild,NHL,Asst. General Manager,,files.eliteprospects.com/layout/staff/flahr_br...
587,Pavel Routa,2013-14,Minnesota Wild,NHL,Scout,European Scout,
3541,Donnie Fuller,2013-14,Minnesota Wild,NHL,Athletic Trainer,Head Athletic Trainer,


In [50]:
# use case: find all staff members Chuck Fletcher has worked with in his career
for index, row in df[df['name'] == 'Chuck Fletcher'].iterrows():
    print(row['season'], row['team'])
    print(df[(df['name'] != 'Chuck Fletcher') & (df['team'] == row['team']) & (df['season'] == row['season'])]['name'].values)
    

1993-94 Florida Panthers
['Lindy Ruff' 'Jon Christiano' 'Tom Webster' 'Tom Webster']
1994-95 Florida Panthers
['Lindy Ruff' 'Jon Christiano' 'Tim Murray']
1995-96 Florida Panthers
['Lindy Ruff' 'Tim Murray']
1996-97 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Lindy Ruff' 'Tim Murray']
1997-98 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Tim Murray']
1998-99 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Terry Murray' 'Tim Murray']
1999-00 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Pavel Routa' 'Terry Murray' 'Tim Murray']
1999-00 Louisville Panthers
[]
2000-01 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Pavel Routa' 'Terry Murray' 'Tim Murray']
2000-01 Louisville Panthers
[]
2001-02 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Pavel Routa' 'Tim Murray']
2001-02 Florida Panthers
['Brent Flahr' 'Wayne Meier' 'Pavel Routa' 'Tim Murray']
2002-03 Mighty Ducks of Anaheim
['François Allaire' 'David McNab' 'Sean Skahan' 'Alain Chainey'
 'Jan-Åke Danielson' 'Wayne Meier' 'Paul Mac