## All Imports

In [3]:
import warnings; warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np; seed = 2023; np.random.seed(seed) # BE CAREFUL TO ENSURE SEED IS CORRECT!!!!
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression, LassoCV, SGDClassifier, Ridge, Lasso, LogisticRegression, LogisticRegressionCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer, RocCurveDisplay, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_curve, r2_score, mean_absolute_error, roc_auc_score, auc, roc_curve, mean_absolute_percentage_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from scipy.stats import norm
%matplotlib inline

## Loading Dataset & Basic Information

#### Loading & Head

In [23]:
# Read the CSV into `df`. The path is kept in `file_name` so the data
# source is named in exactly one place.
file_name = 'hockey.csv'
df = pd.read_csv(filepath_or_buffer=file_name)

# Label the output, then let the notebook render the first rows as a table
# (`head()` returns five rows by default).
print("Head of DataFrame:")
df.head()

Head of DataFrame:


Unnamed: 0,opposingTeam,home_or_away,icetime,gameScore,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotAttempts,I_F_goals,I_F_rebounds,I_F_reboundGoals,...,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_unblockedShotAttempts,I_F_dZoneGiveaways,penalityMinutesDrawn,penaltiesDrawn
0,STL,AWAY,118.0,-0.07,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,STL,AWAY,1087.0,-0.07,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2,STL,AWAY,861.0,-0.07,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,STL,AWAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,STL,AWAY,108.0,-0.07,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Shape

In [24]:
# Report the DataFrame's dimensions as a (rows, columns) tuple
print("Shape of DataFrame:")
n_rows, n_cols = df.shape
(n_rows, n_cols)

Shape of DataFrame:


(2725, 27)

#### Unique Items

In [25]:
# Count the distinct values in the 'opposingTeam' column.
# `dropna=True` is pandas' default and is spelled out here so it is clear
# that missing values are not counted as a separate category.
print("Unique number of items in a column:")
df['opposingTeam'].nunique(dropna=True)

Unique number of items in a column:


35

#### Average

In [26]:
# Show the arithmetic mean of the 'icetime' column
# (pandas skips missing values when averaging by default).
print("Average of a column:")
icetime_values = df['icetime']
icetime_values.mean()

Average of a column:


522.1974311926606

#### Specific Value Count In Column

In [27]:
# Count the rows whose 'home_or_away' value is exactly 'HOME'.
# Summing the boolean mask counts the True entries directly, without
# building a filtered copy of the DataFrame first.
print("Number of specific items in a column:")
is_home_game = df['home_or_away'] == 'HOME'
print(is_home_game.sum())

Number of specific items in a column:
1365


#### Descriptive Statistics

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# `DataFrame.describe()` with its default settings.
print("Descriptive statistics:")
summary_stats = df.describe()
summary_stats

Descriptive statistics:


Unnamed: 0,icetime,gameScore,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotAttempts,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playContinuedInZone,...,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_unblockedShotAttempts,I_F_dZoneGiveaways,penalityMinutesDrawn,penaltiesDrawn
count,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,...,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0
mean,522.197431,1.280536,0.242936,0.136147,2.147523,0.206239,0.112294,0.016881,0.243303,0.553028,...,0.253578,0.394495,0.349358,0.051376,0.084037,0.070826,1.737982,0.08367,0.399266,0.199633
std,542.475717,1.131206,0.537567,0.378618,2.595227,0.487017,0.389679,0.13713,0.563739,0.93071,...,0.638285,0.784607,0.738448,0.235292,0.289155,0.274551,2.193379,0.309494,0.94907,0.469088
min,0.0,-0.905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,51.0,0.345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,200.0,1.125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1067.0,2.01,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
max,1763.0,4.995,4.0,3.0,14.0,4.0,4.0,2.0,4.0,6.0,...,7.0,7.0,6.0,3.0,2.0,2.0,13.0,3.0,6.0,3.0


#### Datatypes & Categorical Types

In [29]:
# List the dtype pandas assigned to every column
print("Type of each column:")
column_dtypes = df.dtypes
column_dtypes

Type of each column:


opposingTeam                       object
home_or_away                       object
icetime                           float64
gameScore                         float64
I_F_primaryAssists                float64
I_F_secondaryAssists              float64
I_F_shotAttempts                  float64
I_F_goals                         float64
I_F_rebounds                      float64
I_F_reboundGoals                  float64
I_F_freeze                        float64
I_F_playContinuedInZone           float64
I_F_playContinuedOutsideZone      float64
I_F_savedShotsOnGoal              float64
I_F_savedUnblockedShotAttempts    float64
I_F_penalityMinutes               float64
I_F_faceOffsWon                   float64
I_F_hits                          float64
I_F_takeaways                     float64
I_F_giveaways                     float64
I_F_lowDangerGoals                float64
I_F_mediumDangerGoals             float64
I_F_highDangerGoals               float64
I_F_unblockedShotAttempts         

In [33]:
# Treat every column stored with the generic `object` dtype as categorical
# and show the names of those columns.
print("Categorical variables:")
object_columns = df.select_dtypes(include='object').columns
object_columns

Categorical variables:


Index(['opposingTeam', 'home_or_away'], dtype='object')

#### Null Values & Duplicate Rows

In [39]:
# Per-column overview: non-null counts and dtypes
df.info()

# Grand total of missing cells: the first `sum()` gives a count per column,
# the second adds those counts together. (`isna` is an alias of `isnull`.)
total_missing = df.isna().sum().sum()
print(f"Total number of nan values: {total_missing!s}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2725 entries, 0 to 2724
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   opposingTeam                    2725 non-null   object 
 1   home_or_away                    2725 non-null   object 
 2   icetime                         2725 non-null   float64
 3   gameScore                       2725 non-null   float64
 4   I_F_primaryAssists              2725 non-null   float64
 5   I_F_secondaryAssists            2725 non-null   float64
 6   I_F_shotAttempts                2725 non-null   float64
 7   I_F_goals                       2725 non-null   float64
 8   I_F_rebounds                    2725 non-null   float64
 9   I_F_reboundGoals                2725 non-null   float64
 10  I_F_freeze                      2725 non-null   float64
 11  I_F_playContinuedInZone         2725 non-null   float64
 12  I_F_playContinuedOutsideZone    27

In [36]:
# Flag each row that repeats an earlier row exactly (the first occurrence is
# not flagged). The mask is computed once and reused for both answers below.
duplicate_mask = df.duplicated()

# Is at least one row a duplicate?
has_duplicates = duplicate_mask.any()
print(f"Are there any duplicate rows in the dataframe? {has_duplicates!s}")

# How many rows are duplicates? (displayed as the cell's output)
print("Number of duplicate rows in the dataframe:")
duplicate_mask.sum()

Are there any duplicate rows in the dataframe? True
Number of duplicate rows in the dataframe:


231

## Processing Dataset

## Plotting Data