# Automate Mining of Line Combinations
Source: www.dailyfaceoff.com

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import json

In [2]:
# Fetch the team index page and collect every team name listed on it.
import requests

url = "https://www.dailyfaceoff.com/teams/"

# Headers sent with the request; the User-Agent mimics a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
    'Content-Type': 'text/html',
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(response.text, "html.parser")

# Each team name is the text of an <h5 class="team-list-name"> element.
team_names = [tag.string for tag in soup.find_all("h5", {"class": "team-list-name"})]
print(team_names)
print(len(team_names))

['Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres', 'Calgary Flames', 'Carolina Hurricanes', 'Chicago Blackhawks', 'Colorado Avalanche', 'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings', 'Edmonton Oilers', 'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens', 'Nashville Predators', 'New Jersey Devils', 'New York Islanders', 'New York Rangers', 'Ottawa Senators', 'Philadelphia Flyers', 'Pittsburgh Penguins', 'San Jose Sharks', 'St Louis Blues', 'Tampa Bay Lightning', 'Toronto Maple Leafs', 'Vancouver Canucks', 'Vegas Golden Knights', 'Washington Capitals', 'Winnipeg Jets']
31


In [3]:
# Build an empty placeholder dict for every line slot of every team.
# Keys look like "<Nickname>_<slot>", where the nickname is the last word of
# the team name (e.g. "Anaheim Ducks" -> "Ducks_F1").
line_combinations = {}

# Slot suffixes in the order they are inserted for each team:
# 4 forward lines, 3 defense pairings, 2 powerplay units, 2 goalies, injured list.
slot_suffixes = (
    ['_F' + str(n) for n in range(1, 5)]
    + ['_D' + str(n) for n in range(1, 4)]
    + ['_PP' + str(n) for n in range(1, 3)]
    + ['_G' + str(n) for n in range(1, 3)]
    + ['_IR']
)

for team in team_names:
    nickname = team.rsplit(' ', 1)[1]
    for suffix in slot_suffixes:
        line_combinations[nickname + suffix] = {}
pprint.pprint(line_combinations)

{'Avalanche_D1': {},
 'Avalanche_D2': {},
 'Avalanche_D3': {},
 'Avalanche_F1': {},
 'Avalanche_F2': {},
 'Avalanche_F3': {},
 'Avalanche_F4': {},
 'Avalanche_G1': {},
 'Avalanche_G2': {},
 'Avalanche_IR': {},
 'Avalanche_PP1': {},
 'Avalanche_PP2': {},
 'Blackhawks_D1': {},
 'Blackhawks_D2': {},
 'Blackhawks_D3': {},
 'Blackhawks_F1': {},
 'Blackhawks_F2': {},
 'Blackhawks_F3': {},
 'Blackhawks_F4': {},
 'Blackhawks_G1': {},
 'Blackhawks_G2': {},
 'Blackhawks_IR': {},
 'Blackhawks_PP1': {},
 'Blackhawks_PP2': {},
 'Blues_D1': {},
 'Blues_D2': {},
 'Blues_D3': {},
 'Blues_F1': {},
 'Blues_F2': {},
 'Blues_F3': {},
 'Blues_F4': {},
 'Blues_G1': {},
 'Blues_G2': {},
 'Blues_IR': {},
 'Blues_PP1': {},
 'Blues_PP2': {},
 'Bruins_D1': {},
 'Bruins_D2': {},
 'Bruins_D3': {},
 'Bruins_F1': {},
 'Bruins_F2': {},
 'Bruins_F3': {},
 'Bruins_F4': {},
 'Bruins_G1': {},
 'Bruins_G2': {},
 'Bruins_IR': {},
 'Bruins_PP1': {},
 'Bruins_PP2': {},
 'Canadiens_D1': {},
 'Canadiens_D2': {},
 'Canadiens_D3

In [4]:
# Fill line_combinations with the players listed on each team's
# line-combinations page.
#
# The page is read purely by position: the n-th "player-name" span is placed
# in the n-th slot of the fixed layout below, and everything after the last
# slot is filed under the team's injured list.
# NOTE(review): this relies on the site's ordering of players -- confirm if
# the page layout changes.
forward_positions = ('LW', 'C', 'RW')
defense_positions = ('LD', 'RD')
powerplay_positions = ('LW', 'C', 'RW', 'LD', 'RD')
goalie_positions = ('G',)

# (key suffix, positions in page order) for each group, in page order
line_layout = [
    ('_F1', forward_positions),
    ('_F2', forward_positions),
    ('_F3', forward_positions),
    ('_F4', forward_positions),
    ('_D1', defense_positions),
    ('_D2', defense_positions),
    ('_D3', defense_positions),
    ('_PP1', powerplay_positions),
    ('_PP2', powerplay_positions),
    ('_G1', goalie_positions),
    ('_G2', goalie_positions),
]

# Flatten the layout so that slots[index] gives (key suffix, position) directly
slots = [
    (suffix, position)
    for suffix, positions in line_layout
    for position in positions
]

for team_name in team_names:
    # Build URL Request
    url = "https://www.dailyfaceoff.com/teams/" + team_name.replace(' ', '-') + '/line-combinations'  # build url

    # The timeout avoids hanging on a stalled connection; raise_for_status()
    # stops on an HTTP error rather than silently recording no players.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Explicit parser so results do not depend on which parsers are installed
    soup = BeautifulSoup(response.text, "html.parser")

    # Keys use the last word of the team name, e.g. "Ducks_F1"
    name = team_name.rsplit(' ', 1)[1]

    # loop through all player names from webpage; the running index tells us
    # which line and position a player belongs to
    for index, tag in enumerate(soup.find_all("span", {"class": "player-name"})):
        if index < len(slots):
            suffix, pos = slots[index]
        else:
            # anyone listed after the last goalie slot is treated as injured
            suffix, pos = '_IR', 'IR'

        line_combinations[name + suffix][tag.string] = pos
    

In [5]:
# Print the filled-in dictionary so the scraped lines can be checked by eye
print(pprint.pformat(line_combinations))

{'Avalanche_D1': {'Cale Makar': 'RD', 'Devon Toews': 'LD'},
 'Avalanche_D2': {'Ryan Graves': 'RD', 'Samuel Girard': 'LD'},
 'Avalanche_D3': {'Conor Timmins': 'LD', 'Ian Cole': 'RD'},
 'Avalanche_F1': {'Gabriel Landeskog': 'LW',
                  'Mikko Rantanen': 'RW',
                  'Nathan Mackinnon': 'C'},
 'Avalanche_F2': {'Andre Burakovsky': 'RW',
                  'Brandon Saad': 'LW',
                  'Nazem Kadri': 'C'},
 'Avalanche_F3': {'JT Compher': 'C',
                  'Joonas Donskoi': 'RW',
                  'Valeri Nichushkin': 'LW'},
 'Avalanche_F4': {'Matt Calvert': 'RW',
                  'Pierre-Édouard Bellemare': 'C',
                  'Tyson Jost': 'LW'},
 'Avalanche_G1': {'Philipp Grubauer': 'G'},
 'Avalanche_G2': {'Pavel Francouz': 'G'},
 'Avalanche_IR': {'Erik Johnson': 'IR'},
 'Avalanche_PP1': {'Cale Makar': 'RD',
                   'Gabriel Landeskog': 'LW',
                   'Mikko Rantanen': 'RW',
                   'Nathan Mackinnon': 'LD',
        

               'Joe Thornton': 'C',
               'Mitch Marner': 'RW',
               'Morgan Rielly': 'LD',
               'Wayne Simmonds': 'LW'},
 'Leafs_PP2': {'Jason Spezza': 'RW',
               'John Tavares': 'C',
               'T.J. Brodie': 'RD',
               'William Nylander': 'LD',
               'Zach Hyman': 'LW'},
 'Lightning_D1': {'Jan Rutta': 'RD', 'Victor Hedman': 'LD'},
 'Lightning_D2': {'Erik Cernak': 'RD', 'Ryan McDonagh': 'LD'},
 'Lightning_D3': {'Luke Schenn': 'RD', 'Mikhail Sergachev': 'LD'},
 'Lightning_F1': {'Brayden Point': 'C',
                  'Ondrej Palat': 'LW',
                  'Steven Stamkos': 'RW'},
 'Lightning_F2': {'Alex Killorn': 'LW',
                  'Anthony Cirelli': 'C',
                  'Tyler Johnson': 'RW'},
 'Lightning_F3': {'Barclay Goodrow': 'RW',
                  'Blake Coleman': 'LW',
                  'Yanni Gourde': 'C'},
 'Lightning_F4': {'Mathieu Joseph': 'RW',
                  'Mitchell Stephens': 'C',
               

In [8]:
# Save the scraped line combinations as JSON.
import os

output_path = './input/lineCombinations.json'

# open(..., 'w') does not create missing directories, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as fp:
    json.dump(line_combinations, fp)