In [1]:
import json
import pandas as pd
import csv
import os


In [2]:
# Module-level state read by getPlayInfo; the per-game loop re-assigns the first two
# names for every game file before extracting plays.
current_periods = list()  # linescore "periods" entries of the game being processed
current_teams = ()        # (home team name, away team name) of the game being processed
nullToken = None          # value substituted by valueOrNull when a key is absent


In [3]:
def valueOrNull(key, obj):
    """Return ``obj[key]`` if ``key`` is contained in ``obj``, else the module-level ``nullToken``."""
    # Conditional expression keeps the nullToken lookup lazy: it is only
    # evaluated when the key is actually missing.
    return obj[key] if key in obj else nullToken


In [4]:
def getPlayInfo(play):
    """
    Flatten one play dictionary into a list of shot features.

    Reads the module-level ``current_teams`` (home, away) and ``current_periods``
    to work out which rink side belongs to the team that produced the event.

    Returns a 12-element list, in this order:
        [eventType, eventTeam, period, periodTime, eventSide, coordinateX,
         coordinateY, shooterName, goalieName, shotType, emptyNet, strength]

    * eventType is 1 when the play's result event is "Goal", otherwise 0.
    * eventSide is the literal "shootout" when the period number is 5; otherwise it
      is the "rinkSide" of the event team for that period (nullToken if absent).
    * shooterName / goalieName are the first / last entries of play["players"].
      NOTE(review): this assumes the goalie is always listed last - confirm against the feed.
    * emptyNet and strength are only read for goals; otherwise both are nullToken.
    """
    timing = play["about"]
    periodTime = timing["periodTime"]
    period = timing["period"]
    eventTeam = play["team"]["name"]

    if period != 5:
        # Index 0 of current_teams is the home team, anything else is treated as away.
        team_idx = current_teams.index(eventTeam)
        period_info = current_periods[period - 1]
        side_key = "home" if team_idx == 0 else "away"
        eventSide = valueOrNull("rinkSide", period_info[side_key])
    else:
        eventSide = "shootout"

    result = play["result"]
    eventType = 1 if result["event"] == "Goal" else 0

    coords = play["coordinates"]
    coordinateX = valueOrNull("x", coords)
    coordinateY = valueOrNull("y", coords)

    shooterName = play["players"][0]["player"]["fullName"]
    goalieName = play["players"][-1]["player"]["fullName"]
    shotType = valueOrNull("secondaryType", result)

    if eventType == 1:
        emptyNet = valueOrNull("emptyNet", result)
        strength = result["strength"]["code"]
    else:
        emptyNet = strength = nullToken

    return [
        eventType, eventTeam, period, periodTime, eventSide,
        coordinateX, coordinateY, shooterName, goalieName,
        shotType, emptyNet, strength,
    ]


In [63]:
import numpy as np
def get_distance(x1, x2, y1, y2):
    """
    Euclidean distance between two 2-D points.

    Mind the parameter naming: the first point is ``(x1, x2)`` and the second
    point is ``(y1, y2)`` - i.e. each *pair* of arguments is one point, not one axis.

    Returns the distance rounded to 4 decimal places. When the coordinates cannot
    be subtracted (for example because some of them are ``None``), 0 is returned.
    """
    point_a = np.array([x1, x2])
    point_b = np.array([y1, y2])

    # Distance is the norm of a-b.
    # Only the failures caused by missing / non-numeric coordinates are turned
    # into 0; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        distance = np.linalg.norm(point_a - point_b)
    except (TypeError, ValueError):
        distance = 0
    return round(distance, 4)



In [64]:
# Get time difference
import datetime as dt
def get_time_diff(t1, t2):
    """
    Number of whole seconds elapsed from ``t1`` to ``t2``.

    Both arguments are 'MM:SS' strings. The result is signed: it is negative
    when ``t2`` is earlier than ``t1``.

    Raises ValueError (from ``strptime``) if either string is not in 'MM:SS' form.
    """
    start_dt = dt.datetime.strptime(t1, '%M:%S')
    end_dt = dt.datetime.strptime(t2, '%M:%S')
    # total_seconds() rather than .seconds: for a negative timedelta, .seconds
    # wraps around (days=-1, seconds=86400-n) and would report a huge positive gap.
    return int((end_dt - start_dt).total_seconds())

In [68]:
# Build tidy_features.csv: one row per "Shot"/"Goal" play, extended with details of
# the play that immediately precedes it in the game's allPlays list.
# Both the output file and each game file are opened with `with`, so they are
# closed even when a game fails to parse. UTF-8 is requested explicitly because
# team names contain non-ASCII characters and pd.read_csv decodes as UTF-8 by default.
with open('tidy_features.csv', 'w', newline='', encoding='utf-8') as f:
    # create the csv writer
    csv_writer = csv.writer(f)

    csv_writer.writerow(["gameId", "season", "teamHome", "teamAway", "eventType", "eventTeam", "period", "periodTime",
                         "eventSide", "coordinateX", "coordinateY", "shooterName", "goalieName", "shotType",
                         "emptyNet", "strength", "last_coordinateX", "last_coordinateY", "lastEventType",
                         "last_periodTime", "last_distanceFromNet", "time_from_lastEvent", "rebound", "speed"])

    # sorted() makes the row order independent of the file system's listing order
    for filename in sorted(os.listdir("raw_data")):
        print(filename)
        # Only .json files hold game data
        if not filename.endswith('.json'):
            continue

        # Open and load file
        with open("raw_data/" + filename, encoding='utf-8') as json_file:
            data = json.load(json_file)

        gameId = data["gamePk"]
        season = data["gameData"]["game"]["season"]
        teamHome = data["gameData"]["teams"]["home"]["name"]
        teamAway = data["gameData"]["teams"]["away"]["name"]
        # getPlayInfo reads these two module-level names, so set them before calling it
        current_teams = (teamHome, teamAway)
        current_periods = data["liveData"]["linescore"]["periods"]
        allplays = data["liveData"]["plays"]["allPlays"]

        for i in range(len(allplays)):
            if allplays[i]["result"]["event"] not in ("Shot", "Goal"):
                continue

            play = getPlayInfo(allplays[i])
            # Positions in the list returned by getPlayInfo:
            # 3 = periodTime, 5 = coordinateX, 6 = coordinateY
            shot_time, shot_x, shot_y = play[3], play[5], play[6]

            # NOTE(review): for i == 0 the index i-1 is -1, i.e. the LAST play of the
            # game is used as "previous" event - confirm a shot can never be first.
            previous = allplays[i - 1]
            last_coordinates = previous["coordinates"]
            last_coordinateX = valueOrNull("x", last_coordinates)
            last_coordinateY = valueOrNull("y", last_coordinates)
            last_event = previous["result"]["event"]
            last_periodTime = previous["about"]["periodTime"]

            # A previous event without coordinates is placed at the shot location
            if last_coordinateX is None:
                last_coordinateX = shot_x
            if last_coordinateY is None:
                last_coordinateY = shot_y

            lastDistance = get_distance(shot_x, shot_y, last_coordinateX, last_coordinateY)
            time_last_event = get_time_diff(last_periodTime, shot_time)
            rebound = last_event == "Shot"

            # Distance covered since the previous event per second; 0 when no time elapsed.
            # (The original divided the undefined name `distance`, which raises NameError
            # on a fresh kernel.)
            speed = lastDistance / time_last_event if time_last_event != 0 else 0

            csv_writer.writerow([gameId, season, teamHome, teamAway] + play +
                                [last_coordinateX, last_coordinateY, last_event, last_periodTime,
                                 lastDistance, time_last_event, rebound, speed])


2015020001.json
2015020002.json
2015020003.json
2015020004.json
2015020005.json
2015020006.json
2015020007.json
2015020008.json
2015020009.json
2015020010.json
2015020011.json
2015020012.json
2015020013.json
2015020014.json
2015020015.json
2015020016.json
2015020017.json
2015020018.json
2015020019.json
2015020020.json
2015020021.json
2015020022.json
2015020023.json
2015020024.json
2015020025.json
2015020026.json
2015020027.json
2015020028.json
2015020029.json
2015020030.json
2015020031.json
2015020032.json
2015020033.json
2015020034.json
2015020035.json
2015020036.json
2015020037.json
2015020038.json
2015020039.json
2015020040.json
2015020041.json
2015020042.json
2015020043.json
2015020044.json
2015020045.json
2015020046.json
2015020047.json
2015020048.json
2015020049.json
2015020050.json
2015020051.json
2015020052.json
2015020053.json
2015020054.json
2015020055.json
2015020056.json
2015020057.json
2015020058.json
2015020059.json
2015020060.json
2015020061.json
2015020062.json
20150200

2015020519.json
2015020520.json
2015020521.json
2015020522.json
2015020523.json
2015020524.json
2015020525.json
2015020526.json
2015020527.json
2015020528.json
2015020529.json
2015020530.json
2015020531.json
2015020532.json
2015020533.json
2015020534.json
2015020535.json
2015020536.json
2015020537.json
2015020538.json
2015020539.json
2015020540.json
2015020541.json
2015020542.json
2015020543.json
2015020544.json
2015020545.json
2015020546.json
2015020547.json
2015020548.json
2015020549.json
2015020550.json
2015020551.json
2015020552.json
2015020553.json
2015020554.json
2015020555.json
2015020556.json
2015020557.json
2015020558.json
2015020559.json
2015020560.json
2015020561.json
2015020562.json
2015020563.json
2015020564.json
2015020565.json
2015020566.json
2015020567.json
2015020568.json
2015020569.json
2015020570.json
2015020571.json
2015020572.json
2015020573.json
2015020574.json
2015020575.json
2015020576.json
2015020577.json
2015020578.json
2015020579.json
2015020580.json
20150205

2015021037.json
2015021038.json
2015021039.json
2015021040.json
2015021041.json
2015021042.json
2015021043.json
2015021044.json
2015021045.json
2015021046.json
2015021047.json
2015021048.json
2015021049.json
2015021050.json
2015021051.json
2015021052.json
2015021053.json
2015021054.json
2015021055.json
2015021056.json
2015021057.json
2015021058.json
2015021059.json
2015021060.json
2015021061.json
2015021062.json
2015021063.json
2015021064.json
2015021065.json
2015021066.json
2015021067.json
2015021068.json
2015021069.json
2015021070.json
2015021071.json
2015021072.json
2015021073.json
2015021074.json
2015021075.json
2015021076.json
2015021077.json
2015021078.json
2015021079.json
2015021080.json
2015021081.json
2015021082.json
2015021083.json
2015021084.json
2015021085.json
2015021086.json
2015021087.json
2015021088.json
2015021089.json
2015021090.json
2015021091.json
2015021092.json
2015021093.json
2015021094.json
2015021095.json
2015021096.json
2015021097.json
2015021098.json
20150210

2016020235.json
2016020236.json
2016020237.json
2016020238.json
2016020239.json
2016020240.json
2016020241.json
2016020242.json
2016020243.json
2016020244.json
2016020245.json
2016020246.json
2016020247.json
2016020248.json
2016020249.json
2016020250.json
2016020251.json
2016020252.json
2016020253.json
2016020254.json
2016020255.json
2016020256.json
2016020257.json
2016020258.json
2016020259.json
2016020260.json
2016020261.json
2016020262.json
2016020263.json
2016020264.json
2016020265.json
2016020266.json
2016020267.json
2016020268.json
2016020269.json
2016020270.json
2016020271.json
2016020272.json
2016020273.json
2016020274.json
2016020275.json
2016020276.json
2016020277.json
2016020278.json
2016020279.json
2016020280.json
2016020281.json
2016020282.json
2016020283.json
2016020284.json
2016020285.json
2016020286.json
2016020287.json
2016020288.json
2016020289.json
2016020290.json
2016020291.json
2016020292.json
2016020293.json
2016020294.json
2016020295.json
2016020296.json
20160202

2016020748.json
2016020749.json
2016020750.json
2016020751.json
2016020752.json
2016020753.json
2016020754.json
2016020755.json
2016020756.json
2016020757.json
2016020758.json
2016020759.json
2016020760.json
2016020761.json
2016020762.json
2016020763.json
2016020764.json
2016020765.json
2016020766.json
2016020767.json
2016020768.json
2016020769.json
2016020770.json
2016020771.json
2016020772.json
2016020773.json
2016020774.json
2016020775.json
2016020776.json
2016020777.json
2016020778.json
2016020779.json
2016020780.json
2016020781.json
2016020782.json
2016020783.json
2016020784.json
2016020785.json
2016020786.json
2016020787.json
2016020788.json
2016020789.json
2016020790.json
2016020791.json
2016020792.json
2016020793.json
2016020794.json
2016020795.json
2016020796.json
2016020797.json
2016020798.json
2016020799.json
2016020800.json
2016020801.json
2016020802.json
2016020803.json
2016020804.json
2016020805.json
2016020806.json
2016020807.json
2016020808.json
2016020809.json
20160208

2016030172.json
2016030173.json
2016030174.json
2016030181.json
2016030182.json
2016030183.json
2016030184.json
2016030185.json
2016030186.json
2016030211.json
2016030212.json
2016030213.json
2016030214.json
2016030215.json
2016030216.json
2016030221.json
2016030222.json
2016030223.json
2016030224.json
2016030225.json
2016030226.json
2016030227.json
2016030231.json
2016030232.json
2016030233.json
2016030234.json
2016030235.json
2016030236.json
2016030241.json
2016030242.json
2016030243.json
2016030244.json
2016030245.json
2016030246.json
2016030247.json
2016030311.json
2016030312.json
2016030313.json
2016030314.json
2016030315.json
2016030316.json
2016030317.json
2016030321.json
2016030322.json
2016030323.json
2016030324.json
2016030325.json
2016030326.json
2016030411.json
2016030412.json
2016030413.json
2016030414.json
2016030415.json
2016030416.json
2017020001.json
2017020002.json
2017020003.json
2017020004.json
2017020005.json
2017020006.json
2017020007.json
2017020008.json
20170200

2017020462.json
2017020463.json
2017020464.json
2017020465.json
2017020466.json
2017020467.json
2017020468.json
2017020469.json
2017020470.json
2017020471.json
2017020472.json
2017020473.json
2017020474.json
2017020475.json
2017020476.json
2017020477.json
2017020478.json
2017020479.json
2017020480.json
2017020481.json
2017020482.json
2017020483.json
2017020484.json
2017020485.json
2017020486.json
2017020487.json
2017020488.json
2017020489.json
2017020490.json
2017020491.json
2017020492.json
2017020493.json
2017020494.json
2017020495.json
2017020496.json
2017020497.json
2017020498.json
2017020499.json
2017020500.json
2017020501.json
2017020502.json
2017020503.json
2017020504.json
2017020505.json
2017020506.json
2017020507.json
2017020508.json
2017020509.json
2017020510.json
2017020511.json
2017020512.json
2017020513.json
2017020514.json
2017020515.json
2017020516.json
2017020517.json
2017020518.json
2017020519.json
2017020520.json
2017020521.json
2017020522.json
2017020523.json
20170205

2017020979.json
2017020980.json
2017020981.json
2017020982.json
2017020983.json
2017020984.json
2017020985.json
2017020986.json
2017020987.json
2017020988.json
2017020989.json
2017020990.json
2017020991.json
2017020992.json
2017020993.json
2017020994.json
2017020995.json
2017020996.json
2017020997.json
2017020998.json
2017020999.json
2017021000.json
2017021001.json
2017021002.json
2017021003.json
2017021004.json
2017021005.json
2017021006.json
2017021007.json
2017021008.json
2017021009.json
2017021010.json
2017021011.json
2017021012.json
2017021013.json
2017021014.json
2017021015.json
2017021016.json
2017021017.json
2017021018.json
2017021019.json
2017021020.json
2017021021.json
2017021022.json
2017021023.json
2017021024.json
2017021025.json
2017021026.json
2017021027.json
2017021028.json
2017021029.json
2017021030.json
2017021031.json
2017021032.json
2017021033.json
2017021034.json
2017021035.json
2017021036.json
2017021037.json
2017021038.json
2017021039.json
2017021040.json
20170210

2018020140.json
2018020141.json
2018020142.json
2018020143.json
2018020144.json
2018020145.json
2018020146.json
2018020147.json
2018020148.json
2018020149.json
2018020150.json
2018020151.json
2018020152.json
2018020153.json
2018020154.json
2018020155.json
2018020156.json
2018020157.json
2018020158.json
2018020159.json
2018020160.json
2018020161.json
2018020162.json
2018020163.json
2018020164.json
2018020165.json
2018020166.json
2018020167.json
2018020168.json
2018020169.json
2018020170.json
2018020171.json
2018020172.json
2018020173.json
2018020174.json
2018020175.json
2018020176.json
2018020177.json
2018020178.json
2018020179.json
2018020180.json
2018020181.json
2018020182.json
2018020183.json
2018020184.json
2018020185.json
2018020186.json
2018020187.json
2018020188.json
2018020189.json
2018020190.json
2018020191.json
2018020192.json
2018020193.json
2018020194.json
2018020195.json
2018020196.json
2018020197.json
2018020198.json
2018020199.json
2018020200.json
2018020201.json
20180202

2018020659.json
2018020660.json
2018020661.json
2018020662.json
2018020663.json
2018020664.json
2018020665.json
2018020666.json
2018020667.json
2018020668.json
2018020669.json
2018020670.json
2018020671.json
2018020672.json
2018020673.json
2018020674.json
2018020675.json
2018020676.json
2018020677.json
2018020678.json
2018020679.json
2018020680.json
2018020681.json
2018020682.json
2018020683.json
2018020684.json
2018020685.json
2018020686.json
2018020687.json
2018020688.json
2018020689.json
2018020690.json
2018020691.json
2018020692.json
2018020693.json
2018020694.json
2018020695.json
2018020696.json
2018020697.json
2018020698.json
2018020699.json
2018020700.json
2018020701.json
2018020702.json
2018020703.json
2018020704.json
2018020705.json
2018020706.json
2018020707.json
2018020708.json
2018020709.json
2018020710.json
2018020711.json
2018020712.json
2018020713.json
2018020714.json
2018020715.json
2018020716.json
2018020717.json
2018020718.json
2018020719.json
2018020720.json
20180207

2018021183.json
2018021184.json
2018021185.json
2018021186.json
2018021187.json
2018021188.json
2018021189.json
2018021190.json
2018021191.json
2018021192.json
2018021193.json
2018021194.json
2018021195.json
2018021196.json
2018021197.json
2018021198.json
2018021199.json
2018021200.json
2018021201.json
2018021202.json
2018021203.json
2018021204.json
2018021205.json
2018021206.json
2018021207.json
2018021208.json
2018021209.json
2018021210.json
2018021211.json
2018021212.json
2018021213.json
2018021214.json
2018021215.json
2018021216.json
2018021217.json
2018021218.json
2018021219.json
2018021220.json
2018021221.json
2018021222.json
2018021223.json
2018021224.json
2018021225.json
2018021226.json
2018021227.json
2018021228.json
2018021229.json
2018021230.json
2018021231.json
2018021232.json
2018021233.json
2018021234.json
2018021235.json
2018021236.json
2018021237.json
2018021238.json
2018021239.json
2018021240.json
2018021241.json
2018021242.json
2018021243.json
2018021244.json
20180212

2019020339.json
2019020340.json
2019020341.json
2019020342.json
2019020343.json
2019020344.json
2019020345.json
2019020346.json
2019020347.json
2019020348.json
2019020349.json
2019020350.json
2019020351.json
2019020352.json
2019020353.json
2019020354.json
2019020355.json
2019020356.json
2019020357.json
2019020358.json
2019020359.json
2019020360.json
2019020361.json
2019020362.json
2019020363.json
2019020364.json
2019020365.json
2019020366.json
2019020367.json
2019020368.json
2019020369.json
2019020370.json
2019020371.json
2019020372.json
2019020373.json
2019020374.json
2019020375.json
2019020376.json
2019020377.json
2019020378.json
2019020379.json
2019020380.json
2019020381.json
2019020382.json
2019020383.json
2019020384.json
2019020385.json
2019020386.json
2019020387.json
2019020388.json
2019020389.json
2019020390.json
2019020391.json
2019020392.json
2019020393.json
2019020394.json
2019020395.json
2019020396.json
2019020397.json
2019020398.json
2019020399.json
2019020400.json
20190204

2019020852.json
2019020853.json
2019020854.json
2019020855.json
2019020856.json
2019020857.json
2019020858.json
2019020859.json
2019020860.json
2019020861.json
2019020862.json
2019020863.json
2019020864.json
2019020865.json
2019020866.json
2019020867.json
2019020868.json
2019020869.json
2019020870.json
2019020871.json
2019020872.json
2019020873.json
2019020874.json
2019020875.json
2019020876.json
2019020877.json
2019020878.json
2019020879.json
2019020880.json
2019020881.json
2019020882.json
2019020883.json
2019020884.json
2019020885.json
2019020886.json
2019020887.json
2019020888.json
2019020889.json
2019020890.json
2019020891.json
2019020892.json
2019020893.json
2019020894.json
2019020895.json
2019020896.json
2019020897.json
2019020898.json
2019020899.json
2019020900.json
2019020901.json
2019020902.json
2019020903.json
2019020904.json
2019020905.json
2019020906.json
2019020907.json
2019020908.json
2019020909.json
2019020910.json
2019020911.json
2019020912.json
2019020913.json
20190209

In [69]:
# Read the tidy features CSV back into a DataFrame
df = pd.read_csv('tidy_features.csv')


In [70]:
# Preview the first rows of the loaded feature table
df.head()

Unnamed: 0,gameId,season,teamHome,teamAway,eventType,eventTeam,period,periodTime,eventSide,coordinateX,...,emptyNet,strength,last_coordinateX,last_coordinateY,last_event,last_periodTime,distance,time_last_event,rebound,speed
0,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,00:51,right,-55.0,...,,,94.0,-34.0,Hit,00:40,154.2757,11,False,14.025064
1,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:05,right,-79.0,...,,,-37.0,-28.0,Giveaway,00:58,50.4777,7,False,7.2111
2,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:06,right,-75.0,...,,,-79.0,0.0,Shot,01:05,4.1231,1,True,4.1231
3,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:53,right,-55.0,...,,,-91.0,35.0,Hit,01:35,79.6053,18,False,4.422517
4,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,02:37,right,-61.0,...,,,-69.0,-22.0,Faceoff,02:32,22.4722,5,False,4.49444


In [72]:
def get_distance_from_post(eventSide,x,y):
    """
    Euclidean distance between the event location (x, y) and the targeted goal.

    ``eventSide`` is the rink side of the team taking the shot, so the goal is on
    the opposite side: (-89, 0) when eventSide is 'right', (89, 0) for any other value.
    Returns the distance rounded to 4 decimal places.
    """
    # The goal always sits on the centre line (y = 0); only its x position flips.
    goal_x = -89.0 if eventSide == 'right' else 89.0
    offset = np.array([x, y]) - np.array([goal_x, 0.0])

    # Length of the offset vector is the distance to the goal
    return round(np.linalg.norm(offset), 4)

def get_angle(eventSide,x,y):
    """
    Angle of the event location (x, y) as seen from the goal, with the goal as origin.

    The x coordinate is shifted by 89 so the goal sits at (0, 0). For any eventSide
    other than 'right' the x coordinate is mirrored first, so both sides are
    measured in the same orientation relative to the net; y is never changed.
    Returns the angle in degrees, rounded to 4 decimal places.
    """
    if eventSide == 'right':
        shifted_x = x + 89.0
    else:
        # Mirror onto the 'right' layout before shifting
        shifted_x = (-x) + 89.0
    relative = np.array([shifted_x, y])

    # arctan2(dy, dx) gives the angle from the origin to the point, in radians
    theta = np.arctan2(relative[1], relative[0])
    return round(np.rad2deg(theta), 4)

def bool_to_digit(x):
    """
    Map a value that compares equal to True onto 1; every other value becomes 0.
    """
    return 1 if x == True else 0

In [78]:
# Derive per-shot features by applying the geometry helpers row by row.
# The columns are zipped together instead of using df.iterrows() with df[col][i]
# lookups: iterrows() builds a Series for every row only to obtain its label, and
# label-based df[col][i] breaks when the index is not unique.
event_sides = df['eventSide']
df["distanceFromNet"] = [
    get_distance_from_post(side, x, y)
    for side, x, y in zip(event_sides, df['coordinateX'], df['coordinateY'])
]
df["angleFromNet"] = [
    get_angle(side, x, y)
    for side, x, y in zip(event_sides, df['coordinateX'], df['coordinateY'])
]
# Angle of the *previous* event, measured against the same goal as the shot
df["angleFromNetLastEvent"] = [
    get_angle(side, x, y)
    for side, x, y in zip(event_sides, df['last_coordinateX'], df['last_coordinateY'])
]
df["isGoal"] = df['eventType']
# emptyNet is missing for non-goals; bool_to_digit maps anything but True to 0
df["emptyNet"] = [bool_to_digit(flag) for flag in df['emptyNet']]


In [94]:
# Store the change in angle relative to the previous event, for rebounds only
# (0 for every other row).
# The previous event's angle is recomputed from its coordinates here instead of being
# read back from df['angleFromNetLastEvent']: the original expression read and overwrote
# the same column, so running the cell twice subtracted angleFromNet a second time.
angle_change = [
    get_angle(side, last_x, last_y) - shot_angle
    for side, last_x, last_y, shot_angle in zip(
        df['eventSide'], df['last_coordinateX'], df['last_coordinateY'], df['angleFromNet']
    )
]
df["angleFromNetLastEvent"] = np.where(df["rebound"] == True, angle_change, 0)

In [100]:
# Preview the first rows, now including the derived feature columns
df.head()

Unnamed: 0,gameId,season,teamHome,teamAway,eventType,eventTeam,period,periodTime,eventSide,coordinateX,...,last_event,last_periodTime,distance,time_last_event,rebound,speed,distanceFromNet,angleFromNet,isGoal,angleFromNetLastEvent
0,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,00:51,right,-55.0,...,Hit,00:40,154.2757,11,False,14.025064,34.5254,10.008,0,0.0
1,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:05,right,-79.0,...,Giveaway,00:58,50.4777,7,False,7.2111,10.0,0.0,0,0.0
2,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:06,right,-75.0,...,Shot,01:05,4.1231,1,True,4.1231,14.0357,-4.0856,0,0.0
3,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,01:53,right,-55.0,...,Hit,01:35,79.6053,18,False,4.422517,49.5177,-46.6366,0,0.0
4,2015020001,20152016,Toronto Maple Leafs,Montréal Canadiens,0,Toronto Maple Leafs,1,02:37,right,-61.0,...,Faceoff,02:32,22.4722,5,False,4.49444,28.0179,-2.0454,0,0.0
