In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix


In [22]:
# Load the data
file_path = r"../DataCleaning/Instat_DataCleaning/GamesCleaned.xlsx"
data = pd.read_excel(file_path)

# Display the first few rows of the data
data.head()

Unnamed: 0,Date,Opponent,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,...,OZ possession (Minutes),OZ possession (Seconds),NZ possession (Minutes),NZ possession (Seconds),DZ possession (Minutes),DZ possession (Seconds),isOpponent,isAway,Outcome,OpponentScore
0,2025-02-22,Neumann Knights,3,5,4,40,22,0.55,3,13,...,11,3,4,28,5,54,False,True,Win,2
1,2025-02-22,Hood,0,4,5,40,18,0.45,4,15,...,5,31,3,45,7,0,True,False,Loss,4
2,2025-02-21,Wilkes Colonels,1,1,2,56,23,0.41,0,30,...,2,29,3,7,9,50,False,False,Loss,7
3,2025-02-21,Hood,6,2,1,56,33,0.59,1,8,...,19,54,5,12,7,4,True,True,Win,1
4,2025-02-14,Arcadia University Knights,1,3,3,40,17,0.43,0,24,...,3,42,3,50,10,1,False,False,Loss,4


In [23]:
data = data[(data['Type'] == 'Total') & (~data['isOpponent'])]
data.head()

Unnamed: 0,Date,Opponent,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,...,OZ possession (Minutes),OZ possession (Seconds),NZ possession (Minutes),NZ possession (Seconds),DZ possession (Minutes),DZ possession (Seconds),isOpponent,isAway,Outcome,OpponentScore
144,2025-02-22,Neumann Knights,4,6,4,61,32,0.52,3,29,...,14,41,5,17,7,16,False,True,Win,2
146,2025-02-21,Wilkes Colonels,1,3,2,64,26,0.41,0,32,...,3,24,3,24,10,31,False,False,Loss,7
148,2025-02-14,Arcadia University Knights,1,4,4,52,22,0.42,0,31,...,5,25,4,40,11,48,False,False,Loss,4
150,2025-02-08,Alvernia Wolves,0,7,3,46,21,0.46,2,23,...,5,14,4,3,7,18,False,True,Loss,4
152,2025-02-07,Stevenson Mustangs,4,2,4,51,26,0.51,1,16,...,12,3,4,24,8,52,False,False,Loss,7


In [24]:
data['Outcome'] = data['Outcome'].replace({'Win': 1, 'Loss': 0})
data.head()


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Unnamed: 0,Date,Opponent,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,...,OZ possession (Minutes),OZ possession (Seconds),NZ possession (Minutes),NZ possession (Seconds),DZ possession (Minutes),DZ possession (Seconds),isOpponent,isAway,Outcome,OpponentScore
144,2025-02-22,Neumann Knights,4,6,4,61,32,0.52,3,29,...,14,41,5,17,7,16,False,True,1,2
146,2025-02-21,Wilkes Colonels,1,3,2,64,26,0.41,0,32,...,3,24,3,24,10,31,False,False,0,7
148,2025-02-14,Arcadia University Knights,1,4,4,52,22,0.42,0,31,...,5,25,4,40,11,48,False,False,0,4
150,2025-02-08,Alvernia Wolves,0,7,3,46,21,0.46,2,23,...,5,14,4,3,7,18,False,True,0,4
152,2025-02-07,Stevenson Mustangs,4,2,4,51,26,0.51,1,16,...,12,3,4,24,8,52,False,False,0,7


In [25]:
data = data.drop(columns=['Goals', 'Date', 'isOpponent', 'OpponentScore'], errors='ignore')

In [26]:
# Prepare features and target
X = data.select_dtypes(include=[np.number]).drop('Outcome', axis=1)
y = data['Outcome']

# Set up cross‐validation and model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000)

# Evaluate with cross‐validation
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("CV accuracy scores:", scores)
print("Mean CV accuracy:", scores.mean())

# (Optional) Train on full data and inspect performance
model.fit(X, y)
y_pred = model.predict(X)
print("Train accuracy:", accuracy_score(y, y_pred))
print("Confusion matrix:\n", confusion_matrix(y, y_pred))

CV accuracy scores: [0.8  1.   1.   1.   0.75]
Mean CV accuracy: 0.9099999999999999
Train accuracy: 1.0
Confusion matrix:
 [[19  0]
 [ 0  5]]


In [27]:
# Determine the most important features from the trained logistic regression
coef_series = pd.Series(model.coef_[0], index=X.columns)
# Rank by absolute value and show top 10
top_feats = coef_series.abs().sort_values(ascending=False).head(10)
print("Top 10 features by importance (absolute coefficient):")
print(top_feats)
# If you want the signed coefficients too:
print("\nSigned coefficients:")
print(coef_series[top_feats.index])

Top 10 features by importance (absolute coefficient):
Shots                       0.110006
Shots on goal               0.079670
Passes total                0.071386
Puck battles in DZ          0.064423
Offensive play (Seconds)    0.060463
Scoring chances             0.057169
Puck losses                 0.054571
Slapshot                    0.053529
OZ possession (Seconds)     0.052906
DZ possession (Seconds)     0.050816
dtype: float64

Signed coefficients:
Shots                       0.110006
Shots on goal               0.079670
Passes total               -0.071386
Puck battles in DZ         -0.064423
Offensive play (Seconds)    0.060463
Scoring chances             0.057169
Puck losses                -0.054571
Slapshot                    0.053529
OZ possession (Seconds)     0.052906
DZ possession (Seconds)    -0.050816
dtype: float64


In [28]:
import plotly.express as px

# 1) Bar chart of the top feature coefficients (signed)
signed_coefs = coef_series[top_feats.index]
coef_df = signed_coefs.reset_index()
coef_df.columns = ['feature', 'coefficient']

fig_coefs = px.bar(
    coef_df,
    x='feature',
    y='coefficient',
    title='Top 10 Feature Coefficients in Logistic Regression',
    labels={'feature':'Feature', 'coefficient':'Coefficient'}
)
fig_coefs.update_layout(xaxis_tickangle=45)
fig_coefs.show()

# 2) Violin plots of each top feature by Outcome
melt_df = data[top_feats.index.tolist() + ['Outcome']] \
    .melt(id_vars='Outcome', var_name='feature', value_name='value')

fig_violin = px.violin(
    melt_df,
    x='feature',
    y='value',
    color='Outcome',
    box=True,
    points='all',
    title='Distribution of Top Features by Outcome'
)
fig_violin.update_xaxes(tickangle=45)
fig_violin.show()

In [29]:
import joblib

# Save the trained logistic regression model to disk
model_filename = 'logistic_regression_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

Model saved to logistic_regression_model.joblib


In [30]:
# Create a synthetic game with the same feature set as X
# Start with zeros for all numeric features
synthetic_game = pd.DataFrame(
    np.zeros((1, X.shape[1])),
    columns=X.columns
)

# Assign some example values to key features
synthetic_game.loc[0, 'Shots'] = 45
synthetic_game.loc[0, 'Shots on goal'] = 35
synthetic_game.loc[0, 'Passes total'] = 300
synthetic_game.loc[0, 'Puck battles in DZ'] = 75
synthetic_game.loc[0, 'Offensive play (Seconds)'] = 40
synthetic_game.loc[0, 'Scoring chances'] = 30
synthetic_game.loc[0, 'Puck losses'] = 20
synthetic_game.loc[0, 'Slapshot'] = 20
synthetic_game.loc[0, 'OZ possession (Seconds)'] = 300
synthetic_game.loc[0, 'DZ possession (Seconds)'] = 20

# Predict outcome
pred = model.predict(synthetic_game)[0]
pred_proba = model.predict_proba(synthetic_game)[0, 1]

print(f"Predicted outcome: {'Win' if pred==1 else 'Loss'}")
print(f"Probability of win: {pred_proba:.2f}")

Predicted outcome: Win
Probability of win: 0.61
