In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display


# TODO: host this dataset online so the notebook does not depend on a local file.
# Until then, set the SHOTS_CSV environment variable to the location of your
# copy of shots_2023.csv; the original local path is kept as the fallback.
SHOTS_CSV = os.environ.get(
    "SHOTS_CSV", "C:/Users/ssegg/OneDrive/Desktop/shots_2023.csv"
)
df_full = pd.read_csv(SHOTS_CSV)

# Keep only shots taken during regular season play by the Tampa Bay Lightning.
# .copy() makes the filtered frame explicitly independent, which avoids
# SettingWithCopyWarning if later cells add or modify columns on df.
is_tbl_regular_season = (df_full['teamCode'] == 'TBL') & (df_full['isPlayoffGame'] == 0)
df = df_full[is_tbl_regular_season].copy()

In [4]:
    # Exploratory overview of `df`: column names, shape, sample rows, dtypes,
    # summary statistics, missing values, and categorical value counts.

    # 1. Basic Information
    # Print the full column list as plain text so every name can be read,
    # independent of how the rich table displays below abbreviate wide frames.
    column_list = df.columns.tolist()
    print("Column names as a list:")
    print(column_list)
    print("\n")  # blank spacer; "/n" would print the two literal characters

    print("Dataset Shape:", df.shape)
    print("\nFirst 5 rows:")
    display(df.head())

    print("\nBasic Information:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None".
    df.info()

    print("\nSummary Statistics:")
    # include='all' covers non-numeric columns too; transpose so each
    # column of df becomes one row of the summary.
    display(df.describe(include='all').T)

    print("\nMissing Values:")
    missing = df.isnull().sum()
    display(pd.DataFrame({'Missing Values': missing, 'Percentage': (missing / len(df)) * 100}))

    # 2. Data Types and Unique Values
    print("\nData Types:")
    display(pd.DataFrame(df.dtypes, columns=['Data Type']))

    # 3. Categorical Data Exploration
    # For each object/category column, show its cardinality and its ten most
    # frequent values.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        print("\nCategorical Columns:")
        for col in categorical_cols:
            print(f"\n{col} - Unique Values: {df[col].nunique()}")
            display(df[col].value_counts().head(10))
    


Column names as a list:
['shotID', 'arenaAdjustedShotDistance', 'arenaAdjustedXCord', 'arenaAdjustedXCordABS', 'arenaAdjustedYCord', 'arenaAdjustedYCordAbs', 'averageRestDifference', 'awayEmptyNet', 'awayPenalty1Length', 'awayPenalty1TimeLeft', 'awaySkatersOnIce', 'awayTeamCode', 'awayTeamGoals', 'defendingTeamAverageTimeOnIce', 'defendingTeamAverageTimeOnIceOfDefencemen', 'defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff', 'defendingTeamAverageTimeOnIceOfForwards', 'defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff', 'defendingTeamAverageTimeOnIceSinceFaceoff', 'defendingTeamDefencemenOnIce', 'defendingTeamForwardsOnIce', 'defendingTeamMaxTimeOnIce', 'defendingTeamMaxTimeOnIceOfDefencemen', 'defendingTeamMaxTimeOnIceOfDefencemenSinceFaceoff', 'defendingTeamMaxTimeOnIceOfForwards', 'defendingTeamMaxTimeOnIceOfForwardsSinceFaceoff', 'defendingTeamMaxTimeOnIceSinceFaceoff', 'defendingTeamMinTimeOnIce', 'defendingTeamMinTimeOnIceOfDefencemen', 'defendingTeamMinTimeOnIceOfDefencemen

Unnamed: 0,shotID,arenaAdjustedShotDistance,arenaAdjustedXCord,arenaAdjustedXCordABS,arenaAdjustedYCord,arenaAdjustedYCordAbs,averageRestDifference,awayEmptyNet,awayPenalty1Length,awayPenalty1TimeLeft,...,xCordAdjusted,xFroze,xGoal,xPlayContinuedInZone,xPlayContinuedOutsideZone,xPlayStopped,xRebound,xShotWasOnGoal,yCord,yCordAdjusted
0,0,39.698866,59.0,59.0,-26.0,26.0,-3.4,0,0,0,...,59,0.249174,0.035813,0.37931,0.277143,0.022744,0.035816,0.75132,-26,-26
1,1,11.313708,81.0,81.0,8.0,8.0,-3.4,0,0,0,...,81,0.107945,0.057705,0.464358,0.298119,0.020588,0.051284,0.589712,8,8
2,2,45.343136,55.0,55.0,30.0,30.0,-3.8,0,0,0,...,55,0.236193,0.016126,0.390417,0.301375,0.025621,0.030269,0.785765,30,30
3,3,43.139309,58.0,58.0,-30.0,30.0,-3.4,0,0,0,...,58,0.198635,0.016797,0.438052,0.291724,0.021107,0.033686,0.724471,-30,-30
5,5,19.924859,83.0,83.0,19.0,19.0,-5.6,0,0,0,...,83,0.109021,0.055857,0.447813,0.322623,0.022224,0.042463,0.702835,19,19



Basic Information:
<class 'pandas.core.frame.DataFrame'>
Index: 3516 entries, 0 to 114131
Columns: 124 entries, shotID to yCordAdjusted
dtypes: float64(38), int64(73), object(13)
memory usage: 3.4+ MB


None


Summary Statistics:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
shotID,3516.0,,,,54951.933732,33556.250803,0.0,26157.25,52145.0,81583.25,114131.0
arenaAdjustedShotDistance,3516.0,,,,33.688434,19.185413,1.0,17.720045,32.0,47.0,98.0
arenaAdjustedXCord,3516.0,,,,14.242321,63.07753,-99.0,-56.0,42.0,70.0,99.0
arenaAdjustedXCordABS,3516.0,,,,61.836746,18.889288,0.0,49.0,66.0,77.0,99.0
arenaAdjustedYCord,3516.0,,,,-0.228669,20.205601,-46.0,-15.0,-0.0,15.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...
xPlayStopped,3516.0,,,,0.024188,0.008041,0.001956,0.01984,0.022258,0.026076,0.148661
xRebound,3516.0,,,,0.050465,0.023613,0.003536,0.036129,0.044541,0.060423,0.215391
xShotWasOnGoal,3516.0,,,,0.728046,0.11308,0.3,0.677109,0.729499,0.786683,0.983755
yCord,3516.0,,,,-0.209044,19.880893,-42.0,-15.0,0.0,15.0,42.0



Missing Values:


Unnamed: 0,Missing Values,Percentage
shotID,0,0.0
arenaAdjustedShotDistance,0,0.0
arenaAdjustedXCord,0,0.0
arenaAdjustedXCordABS,0,0.0
arenaAdjustedYCord,0,0.0
...,...,...
xPlayStopped,0,0.0
xRebound,0,0.0
xShotWasOnGoal,0,0.0
yCord,0,0.0



Data Types:


Unnamed: 0,Data Type
shotID,int64
arenaAdjustedShotDistance,float64
arenaAdjustedXCord,float64
arenaAdjustedXCordABS,float64
arenaAdjustedYCord,float64
...,...
xPlayStopped,float64
xRebound,float64
xShotWasOnGoal,float64
yCord,int64



Categorical Columns:

awayTeamCode - Unique Values: 32


awayTeamCode
TBL    1681
PIT     105
FLA      95
BOS      94
TOR      91
NJD      90
NYR      88
BUF      83
MTL      77
OTT      75
Name: count, dtype: int64


event - Unique Values: 3


event
SHOT    2111
MISS    1118
GOAL     287
Name: count, dtype: int64


goalieNameForShot - Unique Values: 54


goalieNameForShot
Charlie Lindgren     136
Linus Ullmark        124
Sergei Bobrovsky     121
Tristan Jarry        105
Joseph Woll          100
Connor Ingram         99
Anton Forsberg        97
Petr Mrazek           97
Connor Hellebuyck     96
Alex Lyon             96
Name: count, dtype: int64


homeTeamCode - Unique Values: 32


homeTeamCode
TBL    1835
CBJ      95
WSH      89
DET      86
MTL      85
BOS      85
OTT      81
BUF      78
TOR      74
NYI      72
Name: count, dtype: int64


lastEventCategory - Unique Values: 8


lastEventCategory
FAC       845
SHOT      681
HIT       592
BLOCK     545
MISS      397
GIVE      223
TAKE      210
DELPEN     23
Name: count, dtype: int64


lastEventTeam - Unique Values: 2


lastEventTeam
HOME    1779
AWAY    1737
Name: count, dtype: int64


location - Unique Values: 3


location
AWAYZONE     1799
HOMEZONE     1637
Neu. Zone      80
Name: count, dtype: int64


playerPositionThatDidEvent - Unique Values: 4


playerPositionThatDidEvent
C    1390
D     918
L     713
R     495
Name: count, dtype: int64


shooterLeftRight - Unique Values: 2


shooterLeftRight
L    2220
R    1230
Name: count, dtype: int64


shooterName - Unique Values: 34


shooterName
Nikita Kucherov      457
Steven Stamkos       370
Brayden Point        322
Victor Hedman        269
Brandon Hagel        259
Nicholas Paul        222
Michael Eyssimont    214
Anthony Cirelli      189
Darren Raddysh       158
Tyler Motte          122
Name: count, dtype: int64


shotType - Unique Values: 7


shotType
WRIST    1980
SLAP      524
SNAP      375
TIP       280
BACK      245
DEFL       56
WRAP       28
Name: count, dtype: int64


team - Unique Values: 2


team
HOME    1835
AWAY    1681
Name: count, dtype: int64


teamCode - Unique Values: 1


teamCode
TBL    3516
Name: count, dtype: int64

## Explore the physical location data provided for each shot.

`xCord` and `yCord` are the North/South and East/West coordinates of the shot on the ice, respectively. The North/South coordinate is measured from the center red line (the goal lines are at -89 and 89). The middle of the ice has an East/West coordinate of 0.

In [None]:
# Histogram with a KDE overlay of the xCord column of the filtered shots.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 12))

# Label the series explicitly instead of reading a loop variable left behind
# by another cell, so this cell gives the same legend on a fresh kernel.
sns.histplot(df['xCord'], kde=True, label='xCord', alpha=0.6, ax=ax)
ax.set_title('Distribution of X Coordinates of shots')
ax.set_xlabel('xCord')
ax.set_ylabel('Count')
ax.legend()
plt.show()