## Predict player performance metrics such as points, goals, assists, or other relevant statistics for the upcoming season based on historical data.

In [2]:
import pandas as pd

In [3]:
#read the data into the dataframe
players_df = pd.read_csv("../hockey_starting_data/Scoring.csv")

In [4]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45967 entries, 0 to 45966
Data columns (total 31 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   playerID  45967 non-null  object 
 1   year      45967 non-null  int64  
 2   stint     45967 non-null  int64  
 3   tmID      45967 non-null  object 
 4   lgID      45967 non-null  object 
 5   pos       45385 non-null  object 
 6   GP        45699 non-null  float64
 7   G         45699 non-null  float64
 8   A         45699 non-null  float64
 9   Pts       45699 non-null  float64
 10  PIM       45699 non-null  float64
 11  +/-       36265 non-null  float64
 12  PPG       37748 non-null  float64
 13  PPA       23040 non-null  float64
 14  SHG       37744 non-null  float64
 15  SHA       23214 non-null  float64
 16  GWG       36567 non-null  float64
 17  GTG       28106 non-null  float64
 18  SOG       36364 non-null  float64
 19  PostGP    19153 non-null  float64
 20  PostG     19153 non-null  fl

In [5]:
#parse the numerical and categorical values
numeric_columns = players_df.select_dtypes(include=['int64', 'float64']).columns
numeric_columns = numeric_columns.drop('year')

categorical_columns = players_df.select_dtypes(include=['object']).columns

In [6]:
#groupby playerID and sum
player_grouped_df = players_df.groupby('playerID')[numeric_columns].sum()

In [7]:
#groupby playerID and get the categorical columns
categorical_data = players_df.groupby('playerID')[categorical_columns].first()

In [8]:
#rejoin the categorical data with the numerical data
merged_df = player_grouped_df.join(categorical_data, on='playerID',how='left')

In [9]:
merged_df

Unnamed: 0_level_0,stint,GP,G,A,Pts,PIM,+/-,PPG,PPA,SHG,...,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG,playerID,tmID,lgID,pos
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaltoan01,4,151.0,11.0,17.0,28.0,52.0,-25.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,aaltoan01,ANA,NHL,C
abbeybr01,1,17.0,1.0,0.0,1.0,12.0,-3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,abbeybr01,CIN,WHA,D
abbotge01,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,abbotge01,BOS,NHL,G
abbotre01,1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,abbotre01,MTL,NHL,C
abdelju01,5,209.0,18.0,29.0,47.0,160.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,47.0,abdelju01,DET,NHL,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuckeja01,1,6.0,0.0,2.0,2.0,2.0,-2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,zuckeja01,MIN,NHL,L
zukemi01,10,541.0,112.0,234.0,346.0,269.0,-41.0,23.0,0.0,6.0,...,3.0,0.0,0.0,0.0,1.0,0.0,zukemi01,IND,WHA,C
zukwa01,1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,zukwa01,EDO,WHA,C
zunicru01,1,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,zunicru01,DET,NHL,D


In [10]:
#drop the playerID column and reset the index of the datafram
merged_df.drop('playerID', axis=1, inplace=True)
merged_df.reset_index(inplace=True)

In [11]:
merged_df.head()

Unnamed: 0,playerID,stint,GP,G,A,Pts,PIM,+/-,PPG,PPA,...,Post+/-,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG,tmID,lgID,pos
0,aaltoan01,4,151.0,11.0,17.0,28.0,52.0,-25.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ANA,NHL,C
1,abbeybr01,1,17.0,1.0,0.0,1.0,12.0,-3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CIN,WHA,D
2,abbotge01,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BOS,NHL,G
3,abbotre01,1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MTL,NHL,C
4,abdelju01,5,209.0,18.0,29.0,47.0,160.0,8.0,0.0,0.0,...,-6.0,0.0,0.0,0.0,0.0,0.0,47.0,DET,NHL,L


In [12]:
#find any column headers with "post"
post_columns = merged_df.filter(like='Post').columns

#create a new column for with 1 for has postseason stats and 0 for post season stats
merged_df['postseason'] = (merged_df[post_columns].sum(axis=1) != 0).astype(int)

In [13]:
#sample the data set
merged_df.head()

Unnamed: 0,playerID,stint,GP,G,A,Pts,PIM,+/-,PPG,PPA,...,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG,tmID,lgID,pos,postseason
0,aaltoan01,4,151.0,11.0,17.0,28.0,52.0,-25.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,ANA,NHL,C,1
1,abbeybr01,1,17.0,1.0,0.0,1.0,12.0,-3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,CIN,WHA,D,0
2,abbotge01,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,BOS,NHL,G,0
3,abbotre01,1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,MTL,NHL,C,0
4,abdelju01,5,209.0,18.0,29.0,47.0,160.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,47.0,DET,NHL,L,1


In [14]:
#inspect the data
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7520 entries, 0 to 7519
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   playerID    7520 non-null   object 
 1   stint       7520 non-null   int64  
 2   GP          7520 non-null   float64
 3   G           7520 non-null   float64
 4   A           7520 non-null   float64
 5   Pts         7520 non-null   float64
 6   PIM         7520 non-null   float64
 7   +/-         7520 non-null   float64
 8   PPG         7520 non-null   float64
 9   PPA         7520 non-null   float64
 10  SHG         7520 non-null   float64
 11  SHA         7520 non-null   float64
 12  GWG         7520 non-null   float64
 13  GTG         7520 non-null   float64
 14  SOG         7520 non-null   float64
 15  PostGP      7520 non-null   float64
 16  PostG       7520 non-null   float64
 17  PostA       7520 non-null   float64
 18  PostPts     7520 non-null   float64
 19  PostPIM     7520 non-null  

In [15]:
#remove null values and reinspect the dataset
no_null_df = merged_df.dropna()
no_null_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7359 entries, 0 to 7519
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   playerID    7359 non-null   object 
 1   stint       7359 non-null   int64  
 2   GP          7359 non-null   float64
 3   G           7359 non-null   float64
 4   A           7359 non-null   float64
 5   Pts         7359 non-null   float64
 6   PIM         7359 non-null   float64
 7   +/-         7359 non-null   float64
 8   PPG         7359 non-null   float64
 9   PPA         7359 non-null   float64
 10  SHG         7359 non-null   float64
 11  SHA         7359 non-null   float64
 12  GWG         7359 non-null   float64
 13  GTG         7359 non-null   float64
 14  SOG         7359 non-null   float64
 15  PostGP      7359 non-null   float64
 16  PostG       7359 non-null   float64
 17  PostA       7359 non-null   float64
 18  PostPts     7359 non-null   float64
 19  PostPIM     7359 non-null   floa

In [17]:
from sklearn.preprocessing import StandardScaler

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
#encode the categorical values
data_encoded = pd.get_dummies( no_null_df, columns=['playerID', 'tmID', 'lgID', 'pos'])

In [29]:
#ensure that there are no object values in the data
data_encoded.info()
data_encoded.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7359 entries, 0 to 7519
Columns: 7524 entries, stint to pos_W
dtypes: bool(7497), float64(25), int64(2)
memory usage: 54.2 MB


Unnamed: 0,stint,GP,G,A,Pts,PIM,+/-,PPG,PPA,SHG,...,pos_F/D,pos_G,pos_L,pos_L/C,pos_L/D,pos_R,pos_R/C,pos_R/D,pos_R/L,pos_W
0,4,151.0,11.0,17.0,28.0,52.0,-25.0,3.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,1,17.0,1.0,0.0,1.0,12.0,-3.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,False,False
3,1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,5,209.0,18.0,29.0,47.0,160.0,8.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False


In [31]:
features = data_encoded[['Pts', 'G', 'A']]

In [32]:
#Scale the features using StandardScaler
scaler = StandardScaler()

In [33]:
#fit and transform the features
scaled_features = scaler.fit_transform(features)

In [35]:
scaled_data = pd.DataFrame(scaled_features, columns=['Pts', 'G','A'])
scaled_data.head()

Unnamed: 0,Pts,G,A
0,-0.409471,-0.384074,-0.41
1,-0.534778,-0.497404,-0.538268
2,-0.539419,-0.508737,-0.538268
3,-0.539419,-0.508737,-0.538268
4,-0.321293,-0.304743,-0.319458


In [36]:
# Define features and target variables
X = data_encoded.drop(columns=['Pts', 'G', 'A'])  # Features
y_pts = scaled_data['Pts']  # Target variable for points
y_g = scaled_data['G']  # Target variable for goals
y_a = scaled_data['A']  # Target variable for assists

In [37]:
# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(X, y_pts, test_size=0.2, random_state=42)
X_train, X_test, y_g_train, y_g_test = train_test_split(X, y_g, test_size=0.2, random_state=42)
X_train, X_test, y_a_train, y_a_test = train_test_split(X, y_a, test_size=0.2, random_state=42)

In [38]:
# Train linear regression models
lr_pts = LinearRegression()
lr_pts.fit(X_train, y_pts_train)

lr_g = LinearRegression()
lr_g.fit(X_train, y_g_train)

lr_a = LinearRegression()
lr_a.fit(X_train, y_a_train)


In [39]:
# Make predictions
y_pts_pred = lr_pts.predict(X_test)
y_g_pred = lr_g.predict(X_test)
y_a_pred = lr_a.predict(X_test)

In [40]:
# Evaluate the models
mse_pts = mean_squared_error(y_pts_test, y_pts_pred)
r2_pts = r2_score(y_pts_test, y_pts_pred)

mse_g = mean_squared_error(y_g_test, y_g_pred)
r2_g = r2_score(y_g_test, y_g_pred)

mse_a = mean_squared_error(y_a_test, y_a_pred)
r2_a = r2_score(y_a_test, y_a_pred)

In [41]:
#print the mean squared error
print(f"Points - MSE: {mse_pts}, R^2: {r2_pts}")
print(f"Goals - MSE: {mse_g}, R^2: {r2_g}")
print(f"Assists - MSE: {mse_a}, R^2: {r2_a}")

Points - MSE: 0.030251470616897112, R^2: 0.9723857153251324
Goals - MSE: 0.04619429727149426, R^2: 0.9562362379662341
Assists - MSE: 0.0366607669265998, R^2: 0.9672078622024614


## Conclusion
### Based on the current evaluation of Mean Squared Error (MSE) and R-squared (R²) values, the linear regression model shows promising performance in predicting player performance metrics such as points, goals, and assists. The MSE values for points, goals, and assists are relatively low, indicating that the model's predictions are close to the actual values. Additionally, the R² scores are high, suggesting that the model explains a large portion of the variance in player performance metrics based on historical data. Overall, the model demonstrates strong predictive capabilities for player performance metrics, but further validation and consideration of recent trends and factors are advised when using the model for predictions in the present day.