In [1]:
import requests
import json
import pprint
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
import datetime
import os

# Scraping Trade Data
We will be scraping trade data by season from the source: https://www.nhltradetracker.com/user/trade_list_by_season/2020-21/1

_Note: the below cells are contain exploratory code...to run data scraper, skip to the bottom of the notebook_

In [2]:
url = 'https://www.nhltradetracker.com/user/trade_list_by_season/2020-21/1'
page = urlopen(url)
html = page.read().decode("utf-8")

pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)
trades = match_results.group()



In [3]:
soup = BeautifulSoup(html, 'html.parser')

In [4]:
# print(soup.prettify())

In [5]:
trade_tables = soup.find('div', id='container').find_all('table', align='center')
print(len(trade_tables))

19


In [6]:
tr = []
for i in range(1):
    print(trade_tables[i])
    tr = trade_tables[i].find_all('tr')
    print('\n\n\n')

<table align="center" style="border:1px solid #666666; margin-top:5px;" width="95%">
<input type="hidden" value="10203"/>
<tr>
<td align="center" class="label" width="40%"><strong>Winnipeg Jets acquire</strong></td>
<td align="center" class="label" width="20%"><strong>Date</strong></td>
<td align="center" class="label" width="40%"><strong>Vegas Golden Knights acquire</strong></td>
</tr>
<tr>
<td valign="top" width="40%">
<table width="100%">
<tr>
<td width="25%">
<img height="40" src="/asset/team_logos/Winnipeg_Jets.gif" width="60"/>
</td>
<td valign="top" width="75%">
<span class="link"><a href="javascript:show('69840||| Paul Stastny')"> Paul Stastny</a><br/></span>
</td>
</tr>
</table>
</td>
<td align="center" valign="top" width="20%">October 9, 2020</td>
<td valign="top" width="40%">
<table width="100%">
<tr>
<td valign="top" width="75%">
<span class="link"><a href="javascript:show('168005||| Carl Dahlstrom')"> Carl Dahlstrom</a><br/></span>
<span class="black">2022 conditional 4th 

In [7]:
trade_row_header_items = [content.getText().strip(' acquire') for content in tr[0].find_all('td', align='center')]
trade_row_header_items

['Winnipeg Jets', 'Dat', 'Vegas Golden Knights']

In [8]:
print('Winnipeg Jets acquire'.strip(' acquire'))

Winnipeg Jets


In [9]:
def return_players_involved(players):
    return [player.getText().strip() for player in players.find_all('span')]

for players in tr[1].find_all('td', width='75%'):
    print(return_players_involved(players))

['Paul Stastny']
['Carl Dahlstrom', '2022 conditional 4th round pick']


In [10]:
tr[1].find('td', width='20%')

<td align="center" valign="top" width="20%">October 9, 2020</td>

In [11]:
def return_teams_involved(teams_html):
    return [team.getText().strip('acquire').strip() for team in teams_html.find_all('td', align='center')]

def return_players_involved(players_html):
    players_by_team = []
    for players in players_html.find_all('td', width='75%'):
        players_by_team.append([player.getText().strip() for player in players.find_all('span')])
    return players_by_team

def return_date(players_html):
    date_str = players_html.find('td', width='20%').getText()
    return datetime.datetime.strptime(date_str, '%B %d, %Y').date().strftime("%d%b%Y")

def return_trades_from_html(html):
    trades = []
    trade_tables = html.find('div', id='container').find_all('table', align='center')
    for i in range(len(trade_tables)):
        # extract html
        tr = trade_tables[i].find_all('tr')

        # extract teams involved in trade
        teams_list = return_teams_involved(tr[0])

        # extract players involved in trade
        players_list = return_players_involved(tr[1])

        # extract date of trade
        date_of_trade = return_date(tr[1])

        # form dict object
        trade = {
            'team1': teams_list[0],
            'team2': teams_list[2],
            'team1_aquires': players_list[0],
            'team2_aquires': players_list[1],
            'date': date_of_trade
        }
        trades.append(trade)
    return trades

## Run data scraping to collect trades since 2000-01 season

In [12]:
trades_by_season = {}

start_szn = 2019

while start_szn < 2020:
    
    # format http link
    szn_str = str(start_szn) + '-' + str(start_szn+1)[2:]
    page_num = 1
    
    trades_for_szn = []
    while True:
        url = 'https://www.nhltradetracker.com/user/trade_list_by_season/' + szn_str + '/' + str(page_num)
        page = urlopen(url)
        html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, 'html.parser')
        
        trades_for_this_page = return_trades_from_html(soup)
        if len(trades_for_this_page) == 0:
            break
        trades_for_szn.extend(trades_for_this_page)
        
        page_num += 1
    
    
    trades_by_season[szn_str] = trades_for_szn
    start_szn += 1
    
    
# save to json file in data directory
path = os.getcwd()
path = os.path.abspath(os.path.join(path, os.pardir, 'data/trades.json'))
with open(path, 'w') as f:
        json.dump(trades_by_season, f, indent=2)

In [13]:
print('number of trades per season:')
for key, value in trades_by_season.items():
    print(key + ': ' + str(len(value)))

number of trades per season:
2019-20: 110


## Find unique team names
We need to figure out all team names included in trades within the past 20 years. Use this data to manually collect GM history

In [14]:
seen_teams = set()
for year, trades in trades_by_season.items():
    for trade in trades:
        seen_teams.add(trade['team1'])
        seen_teams.add(trade['team2'])

In [15]:
seen_teams

{'Anaheim Ducks',
 'Arizona Coyotes',
 'Boston Bruins',
 'Buffalo Sabres',
 'Calgary Flames',
 'Carolina Hurricanes',
 'Chicago Blackhawks',
 'Colorado Avalanche',
 'Columbus Blue Jackets',
 'Dallas Stars',
 'Detroit Red Wings',
 'Edmonton Oilers',
 'Florida Panthers',
 'Los Angeles Kings',
 'Minnesota Wild',
 'Montreal Canadiens',
 'Nashville Predators',
 'New Jersey Devils',
 'New York Islanders',
 'New York Rangers',
 'Ottawa Senators',
 'Philadelphia Flyers',
 'Pittsburgh Penguins',
 'San Jose Sharks',
 'St. Louis Blues',
 'Tampa Bay Lightning',
 'Toronto Maple Leafs',
 'Vancouver Canucks',
 'Vegas Golden Knights',
 'Washington Capitals',
 'Winnipeg Jets'}

In [18]:
for year, trades in trades_by_season.items():
    for trade in trades:
        pprint.pprint(trade)
        break
        print(trade['team1'] + ' and ' + trade['team2'])

{'date': '24Feb2020',
 'team1': 'New York Islanders',
 'team1_aquires': ['Jean-Gabriel Pageau'],
 'team2': 'Ottawa Senators',
 'team2_aquires': ['2020 conditional 1st round pick',
                   '2020 2nd round pick',
                   '2022 conditional 3rd round pick']}
