# Introduction

This is an analysis of data from the National Hockey League. This was done for Foundations of Data Science in Fall of 2019. This project was done by Group 5:
- Andokie Ibeshi
- Daniel Basilio
- Prabuddh Dixit
- Vineeta Kuckreja


## Data Notes
All of the data in this analysis was fetched from Natural Stat Trick. We used individual player stats for the last five years, regular season data only. Each of the CSVs contains all players who registered any game time throughout that year. We got the data as individual year CSVs for a few reasons:
- The data was a lot easier to download in chunks
- The data does not have a year label, so by importing them separately, we can attach year labels from the filenames

### A note on years
Hockey seasons span two calendar years because the season starts in October each year, and continues until the playoffs begin, typically in April. There have been events which shift the start/end dates (the World Cup of Hockey in 2016, the Olympics in 2014) but October -> April should apply most years. Each row of data would need to be marked with two years (e.g. 2014-2015) to represent the two calendar years the season was played over. To keep the year data simpler to understand, we will mark all the data with the year in which the Stanley Cup was awarded. For example, 2014-2015 will be marked as simply 2015, as that season's Stanley Cup was awarded at the end of the season in 2015.

In [5]:
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [6]:
def map_positions(position):
    """Collapse forward position codes into the single label 'F'.

    Any position string containing 'C', 'R' or 'L' is treated as a forward
    and returned as 'F'; every other value is returned unchanged.
    """
    forward_codes = ("C", "R", "L")
    is_forward = any(code in position for code in forward_codes)
    return "F" if is_forward else position

In [7]:
def read_player_data(filename):
    """Load one season's player CSV, keeping only the columns we analyse.

    Args:
        filename: Path (or file-like object) of the season's player CSV.

    Returns:
        A DataFrame containing just the selected columns.
    """
    identity_columns = ["Player", "Team", "Position", "GP"]
    scoring_columns = [
        "Goals",
        "Total Assists",
        "First Assists",
        "Second Assists",
        "Total Points",
    ]
    shooting_columns = ["iCF", "Shots", "SH%"]
    physical_columns = ["Hits", "Hits Taken"]

    wanted_columns = (
        identity_columns + scoring_columns + shooting_columns + physical_columns
    )
    return pd.read_csv(filename, usecols=wanted_columns)

In [8]:
def clean_player_data(data, year):
    """Filter, relabel and index one season of raw player data.

    Args:
        data: Raw player DataFrame for a single season; it must contain the
            "GP", "SH%", "iCF", "Position" and "Player" columns.
        year: Season label stored in the "Year" index level.

    Returns:
        A new DataFrame indexed by (Year, Player).
    """
    # Index labels of the players who appeared in fewer than 20 games.
    short_seasons = data.loc[data["GP"] < 20].index

    season = (
        data.drop(index=short_seasons)
        # SH% can arrive as object dtype, so force it to a float column.
        .astype({"SH%": "float64"})
        # Tag every row with its season.
        .assign(Year=year)
        # "Shot Attempts" is a clearer name than iCF.
        .rename(columns={"iCF": "Shot Attempts"})
    )

    # Collapse all forward positions into 'F' so the data is nicer to work with.
    season["Position"] = season["Position"].apply(map_positions)

    # Index each row by the combination of season and player.
    return season.set_index(["Year", "Player"])

In [9]:
# Read and clean all of our data
def read_and_clean_player_data(filename, year):
    """Read one season's player CSV and return it cleaned and labelled with `year`."""
    return clean_player_data(read_player_data(filename), year)

In [10]:
# Read every season's player data and combine the DataFrames into one
def fetch_player_data():
    """Load, clean and stack the five seasons of player data.

    Returns:
        A single DataFrame holding the cleaned rows of every season, in
        chronological order, each keeping the index produced by
        read_and_clean_player_data.
    """
    # (season file, year label) pairs; the label is the year in which that
    # season's Stanley Cup was awarded.
    seasons = [
        ('./yearly-player-data/14-15.csv', '2015'),
        ('./yearly-player-data/15-16.csv', '2016'),
        ('./yearly-player-data/16-17.csv', '2017'),
        ('./yearly-player-data/17-18.csv', '2018'),
        ('./yearly-player-data/18-19.csv', '2019'),
    ]
    season_frames = [
        read_and_clean_player_data(filename, year)
        for filename, year in seasons
    ]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent and preserves each frame's index.
    return pd.concat(season_frames, sort=False)

In [11]:
# Build the combined player table and display it as the cell output.
fetch_player_data()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Position,GP,Goals,Total Assists,First Assists,Second Assists,Total Points,Shots,SH%,Shot Attempts,Hits,Hits Taken
Year,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015,Jamie Benn,DAL,F,82,35,52,32,20,87,253,13.83,466,120,101
2015,John Tavares,NYI,F,82,38,48,30,18,86,278,13.67,471,48,103
2015,Sidney Crosby,PIT,F,77,28,56,31,25,84,237,11.81,380,66,117
2015,Alex Ovechkin,WSH,F,81,53,28,21,7,81,395,13.42,825,259,108
2015,Jakub Voracek,PHI,F,82,22,59,31,28,81,221,9.95,424,55,139
2015,Nicklas Backstrom,WSH,F,82,18,60,31,29,78,153,11.76,283,91,100
2015,Tyler Seguin,DAL,F,71,37,40,33,7,77,280,13.21,529,63,34
2015,Daniel Sedin,VAN,F,82,20,56,35,21,76,226,8.85,370,21,87
2015,Jiri Hudler,CGY,F,78,31,45,29,16,76,158,19.62,270,27,39
2015,Henrik Sedin,VAN,F,82,18,55,28,27,73,101,17.82,165,25,67


In [12]:
def read_team_data(filename):
    """Load one season's team CSV, keeping only the team name and record columns.

    Args:
        filename: Path (or file-like object) of the season's team CSV.

    Returns:
        A DataFrame with the "Team", "W", "L", "OTL" and "Points" columns.
    """
    record_columns = ["Team", "W", "L", "OTL", "Points"]
    return pd.read_csv(filename, usecols=record_columns)

In [13]:
def clean_team_data(data, year):
    """Relabel, abbreviate and index one season of raw team data.

    Args:
        data: Raw team DataFrame for a single season; it must contain the
            "Team", "W", "L" and "OTL" columns.
        year: Season label stored in the "Year" index level.

    Returns:
        A new DataFrame indexed by (Year, Team), with full team names
        replaced by their abbreviations.
    """
    # Spelled-out names for the win/loss record columns.
    readable_columns = {"W": "Wins", "L": "Losses", "OTL": "Overtime Losses"}

    # Full team name -> abbreviation.
    abbreviations = {
        'Montreal Canadiens': 'MTL',
        'Toronto Maple Leafs': 'TOR',
        'Philadelphia Flyers': 'PHI',
        'Boston Bruins': 'BOS',
        'Vancouver Canucks': 'VAN',
        'Calgary Flames': 'CGY',
        'San Jose Sharks': 'S.J',
        'Los Angeles Kings': 'L.A',
        'Columbus Blue Jackets': 'CBJ',
        'Buffalo Sabres': 'BUF',
        'New Jersey Devils': 'N.J',
        'Anaheim Ducks': 'ANA',
        'Pittsburgh Penguins': 'PIT',
        'Washington Capitals': 'WSH',
        'Detroit Red Wings': 'DET',
        'Florida Panthers': 'FLA',
        'Tampa Bay Lightning': 'T.B',
        'New York Rangers': 'NYR',
        'St Louis Blues': 'STL',
        'Ottawa Senators': 'OTT',
        'Nashville Predators': 'NSH',
        'Chicago Blackhawks': 'CHI',
        'Dallas Stars': 'DAL',
        'Colorado Avalanche': 'COL',
        'Minnesota Wild': 'MIN',
        'Edmonton Oilers': 'EDM',
        'Winnipeg Jets': 'WPG',
        'Arizona Coyotes': 'ARI',
        'New York Islanders': 'NYI',
        'Carolina Hurricanes': 'CAR',
        'Vegas Golden Knights': 'VGK',
    }

    # Rename the columns, then tag every row with its season.
    labelled = data.rename(columns=readable_columns).assign(Year=year)

    # Swap in the abbreviations and index each row by season and team.
    return labelled.replace(abbreviations).set_index(["Year", "Team"])

In [14]:
def read_and_clean_team_data(filename, year):
    """Read one season's team CSV and return it cleaned and labelled with `year`."""
    return clean_team_data(read_team_data(filename), year)

In [15]:
def fetch_team_data():
    """Load, clean and stack the five seasons of team data.

    Returns:
        A single DataFrame holding the cleaned rows of every season, in
        chronological order, each keeping the index produced by
        read_and_clean_team_data.
    """
    # (season file, year label) pairs; the label is the year in which that
    # season's Stanley Cup was awarded.
    seasons = [
        ('./yearly-team-data/14-15.csv', '2015'),
        ('./yearly-team-data/15-16.csv', '2016'),
        ('./yearly-team-data/16-17.csv', '2017'),
        ('./yearly-team-data/17-18.csv', '2018'),
        ('./yearly-team-data/18-19.csv', '2019'),
    ]
    season_frames = [
        read_and_clean_team_data(filename, year)
        for filename, year in seasons
    ]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent and preserves each frame's index.
    return pd.concat(season_frames, sort=False)

In [16]:
# Build the combined team table and display it as the cell output.
fetch_team_data()

Unnamed: 0_level_0,Unnamed: 1_level_0,Wins,Losses,Overtime Losses,Points
Year,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,ANA,51,24,7,109
2015,ARI,24,50,8,56
2015,BOS,41,27,14,96
2015,BUF,23,51,8,54
2015,CAR,30,41,11,71
2015,CBJ,42,35,5,89
2015,CGY,45,30,7,97
2015,CHI,48,28,6,102
2015,COL,39,31,12,90
2015,DAL,41,31,10,92
