In [245]:
# Load libraries
import pandas as pd
import glob
import os
import numpy as np

In [246]:
# Folder holding the InStat Excel exports, and the pattern for the "Games*" workbooks.
folder_path = "../../data/instat/"
file_pattern = os.path.join(folder_path, "Games*.xlsx")

# glob returns matches in an OS-dependent order; sorting keeps the row order of
# the combined frame reproducible from run to run.
files = sorted(glob.glob(file_pattern))

print("Found files:")
for f in files:
    print(f)

# Filename fragment -> label written to the 'Type' column.
type_mapping = {
    "TotalOpp": "Total",
    "EVOpp": "Even Strength",
    "PPOpp": "Power Play",
    "PKOpp": "Penalty Kill"
}

# Columns treated as merged cells: only the first row of a game carries a value,
# the following row holds the placeholder '-'.
merged_cols = ['Date', 'Opponent', 'Score', 'Penalty time']

# Columns that together identify one game.
game_key_cols = ['Date', 'Opponent', 'Score']

# Rows whose merged columns mention "avg"/"average" (case-insensitive) are dropped.
avg_pattern = r'(?i)avg|average'


def infer_game_type(file_name, mapping=None):
    """Return the label of the first mapping key contained in ``file_name``.

    Parameters
    ----------
    file_name : str
        Base name of the workbook, e.g. ``"GamesPPOpp.xlsx"``.
    mapping : dict, optional
        Fragment -> label lookup; defaults to the module-level ``type_mapping``.

    Returns
    -------
    str
        The matching label, or ``"Unknown"`` when no key occurs in the name.
    """
    lookup = type_mapping if mapping is None else mapping
    return next((label for key, label in lookup.items() if key in file_name), "Unknown")


def clean_games_frame(raw_df):
    """Return a cleaned copy of one "Games*" sheet; ``raw_df`` is not modified.

    Steps, in order:
      1. Replace the '-' placeholder with NA in ``merged_cols``.
      2. Forward-fill Date / Opponent / Score so every row of a game carries them.
      3. Copy 'Penalty time' down from the previous row whenever both rows share
         the same Date / Opponent / Score.
      4. Drop rows whose merged columns match ``avg_pattern``.
    """
    cleaned = raw_df.copy()

    cleaned[merged_cols] = cleaned[merged_cols].replace('-', pd.NA)

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    cleaned[game_key_cols] = cleaned[game_key_cols].ffill()

    # NOTE(review): this overwrites a row's own 'Penalty time' with the value of
    # the row above it whenever both belong to the same game -- confirm that
    # discarding the second row's original value is intended.
    keys = cleaned[game_key_cols]
    penalty_pos = cleaned.columns.get_loc('Penalty time')
    for pos in range(1, len(cleaned)):
        if keys.iloc[pos].equals(keys.iloc[pos - 1]):
            cleaned.iloc[pos, penalty_pos] = cleaned.iloc[pos - 1, penalty_pos]

    is_avg_cell = cleaned[merged_cols].apply(
        lambda col: col.astype(str).str.contains(avg_pattern, na=False)
    )
    # .copy() so that adding columns to the result does not write into a slice.
    return cleaned.loc[~is_avg_cell.any(axis=1)].copy()


# One cleaned DataFrame per workbook, each tagged with its game type.
dfs = []

for file in files:
    # Read the file using the openpyxl engine
    df = pd.read_excel(file, engine='openpyxl')
    print(f"\nProcessing file: {file}")
    print("Original DataFrame:")
    print(df.head())

    df = clean_games_frame(df)

    # Add a new column indicating what the file represents
    df['Type'] = infer_game_type(os.path.basename(file))

    dfs.append(df)

Found files:
../../data/instat\GamesEVOpp.xlsx
../../data/instat\GamesPKOpp.xlsx
../../data/instat\GamesPPOpp.xlsx
../../data/instat\GamesTotalOpp.xlsx

Processing file: ../../data/instat\GamesEVOpp.xlsx
Original DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    3.0        5.0   
1      -                              -     -         NK    0.0        4.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    1.0        1.0   
3      -                              -     -         WC    6.0        2.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    1.0        3.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              4.0        10:00        40            22  ...   
1              5.0        08:00        40            18  ...   
2              2.0        02:00        56            23  ...   
3              1.0        04:00        56          

  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')
  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')
  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')



Processing file: ../../data/instat\GamesPPOpp.xlsx
Original DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    1.0        0.0   
1      -                              -     -         NK    2.0        0.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    0.0        1.0   
3      -                              -     -         WC    1.0        0.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    0.0        1.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              0.0        00:00       4.0           4.0  ...   
1              1.0        00:00      17.0          11.0  ...   
2              0.0        02:00       6.0           2.0  ...   
3              1.0        00:00       2.0           1.0  ...   
4              1.0        02:00       5.0           4.0  ...   

  Loose puck recovery  Opponent’s dump-in retrievals  Entries  \
0        

  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')


### Concatenate all processed DataFrames into a single DataFrame.

In [247]:
# Stack the per-file frames vertically and renumber the rows from 0.
games_opps_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

concat_preview = games_opps_df.head()
print("\nConcatenated DataFrame:")
print(concat_preview)


Concatenated DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    3.0        5.0   
1  22/02              @ Neumann Knights   4:2         NK    0.0        4.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    1.0        1.0   
3  21/02             vs Wilkes Colonels   1:7         WC    6.0        2.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    1.0        3.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              4.0        10:00      40.0          22.0  ...   
1              5.0        10:00      40.0          18.0  ...   
2              2.0        02:00      56.0          23.0  ...   
3              1.0        02:00      56.0          33.0  ...   
4              3.0        06:00      40.0          17.0  ...   

  Opponent’s dump-in retrievals  Entries  Entries via pass  \
0                          15.0     35.0              13.0 

Save the concatenated DataFrame to an Excel file (the next cell reloads it from disk); a CSV export is left commented out as an alternative:

In [248]:

# Write the combined frame to disk without the index column.
combined_xlsx_path = "GamesOppCombined.xlsx"
games_opps_df.to_excel(combined_xlsx_path, index=False)
# CSV alternative (disabled):
# games_opps_df.to_csv("GamesOppCombined.csv", index=False)


In [249]:
# Reload the combined workbook from disk; from here on games_opps_df is the re-read copy.
games_opps_df = pd.read_excel(io="GamesOppCombined.xlsx")

## Data Cleaning

### NAs

In [250]:
# Any cell equal to '-' becomes NaN (modified in place) so that isnull() counts it as missing.
games_opps_df.replace(to_replace='-', value=np.nan, inplace=True)

  games_opps_df.replace('-', np.nan, inplace=True)


In [251]:
# Per-column count of missing values
null_counts = games_opps_df.isna().sum()

# Keep only the columns that actually have at least one missing value
null_info = null_counts.loc[null_counts.gt(0)]

print("Columns with null values and their counts:")
print(null_info)

Columns with null values and their counts:
Faceoffs won, %                   7
Faceoffs won in DZ, %            44
Faceoffs won in NZ, %            67
Faceoffs won in OZ, %            47
CORSI%                           96
Power play                       96
Successful power play            96
Power play time                  96
Power play, %                   154
Short-handed                     96
Penalty killing                  96
Short-handed time                96
Short-handed, %                  98
% shots on goal                  26
Power play shots                 96
Short-handed shots               96
Puck battles won, %               2
Dekes successful, %              62
Accurate passes, %                8
OZ play with shots, %            47
Counter-attack with shots, %     84
EV OZ retrievals                 96
EV DZ retrievals                 96
dtype: int64


In [252]:
# Columns with "time" in their name (case-insensitive): missing -> "0:00"
time_columns = [name for name in games_opps_df.columns if 'time' in name.lower()]
games_opps_df[time_columns] = games_opps_df[time_columns].fillna(value="0:00")

# Columns with '%' in their name: missing -> "0%"
percentage_columns = [name for name in games_opps_df.columns if '%' in name]
games_opps_df[percentage_columns] = games_opps_df[percentage_columns].fillna(value="0%")

# Whatever is still missing after the two passes above becomes 0 (in place)
games_opps_df.fillna(value=0, inplace=True)

In [253]:
from IPython.display import display, HTML

# dtype of every column
column_dtypes = games_opps_df.dtypes

# Two-column table: column name / dtype
column_dtypes_df = column_dtypes.reset_index()
column_dtypes_df.columns = ['Column', 'Data Type']

# Render only the rows whose dtype is 'object' as an HTML table
is_object_dtype = column_dtypes_df['Data Type'] == 'object'
object_dtypes_html = column_dtypes_df[is_object_dtype].to_html(index=False, max_rows=None, max_cols=None)
display(HTML(object_dtypes_html))

Column,Data Type
Date,object
Opponent,object
Score,object
Unnamed: 3,object
Penalty time,object
"Faceoffs won, %",object
"Faceoffs won in DZ, %",object
"Faceoffs won in NZ, %",object
"Faceoffs won in OZ, %",object
CORSI%,object


In [254]:
# Strip the trailing '%' from each percentage column, parse as float and scale by 0.01
for pct_col in percentage_columns:
    without_sign = games_opps_df[pct_col].str.rstrip('%')
    games_opps_df[pct_col] = without_sign.astype(float) * 0.01

print("Percentage columns converted to numeric:")
print(games_opps_df[percentage_columns].dtypes)

Percentage columns converted to numeric:
Faceoffs won, %                 float64
Faceoffs won in DZ, %           float64
Faceoffs won in NZ, %           float64
Faceoffs won in OZ, %           float64
CORSI%                          float64
Power play, %                   float64
Short-handed, %                 float64
% shots on goal                 float64
Puck battles won, %             float64
Dekes successful, %             float64
Accurate passes, %              float64
OZ play with shots, %           float64
Counter-attack with shots, %    float64
dtype: object


In [255]:
# First rows of the converted percentage columns
pct_preview = games_opps_df.loc[:, percentage_columns].head()
print(pct_preview)

   Faceoffs won, %  Faceoffs won in DZ, %  Faceoffs won in NZ, %  \
0             0.55                   0.38                   0.75   
1             0.45                   0.47                   0.25   
2             0.41                   0.43                   0.28   
3             0.59                   0.38                   0.72   
4             0.43                   0.42                   0.33   

   Faceoffs won in OZ, %  CORSI%  Power play, %  Short-handed, %  \
0                   0.53    0.62            0.0              0.0   
1                   0.62    0.38            0.0              0.0   
2                   0.63    0.14            0.0              0.0   
3                   0.57    0.86            0.0              0.0   
4                   0.75    0.17            0.0              0.0   

   % shots on goal  Puck battles won, %  Dekes successful, %  \
0             0.58                 0.57                 0.69   
1             0.54                 0.43               

In [256]:
# Sub-frame containing only the object-dtype columns
non_numeric_columns = games_opps_df.select_dtypes(include=['object'])

# Render it as an HTML table, truncated to 5 rows
non_numeric_html = non_numeric_columns.to_html(index=False, max_rows=5, max_cols=None)
display(HTML(non_numeric_html))

Date,Opponent,Score,Unnamed: 3,Penalty time,Power play time,Short-handed time,Offensive play,Defensive play,OZ possession,NZ possession,DZ possession,Type
22/02,@ Neumann Knights,4:2,HCB,10:00,0:00,0:00,21:25,16:16,11:03,04:28,05:54,Even Strength
22/02,@ Neumann Knights,4:2,NK,10:00,0:00,0:00,16:16,21:25,05:31,03:45,07:00,Even Strength
...,...,...,...,...,...,...,...,...,...,...,...,...
01/11/24,vs Hilbert College,3:2,HCB,08:00,04:00,08:00,29:48,22:25,14:03,05:40,10:04,Total
01/11/24,vs Hilbert College,3:2,HC,08:00,08:00,04:00,22:25,29:48,09:21,04:08,08:56,Total


In [None]:
# "MM:SS" text columns that get split into separate integer columns.
# This rebinds time_columns to an explicit list.
time_columns = [
    'Penalty time',
    'Power play time',
    'Short-handed time',
    'Offensive play',
    'Defensive play',
    'OZ possession',
    'NZ possession',
    'DZ possession',
]

# For each column: text before the first ':' -> "<col> (Minutes)", text after it -> "<col> (Seconds)"
for time_col in time_columns:
    parts = games_opps_df[time_col].str.split(':', expand=True)
    games_opps_df[f'{time_col} (Minutes)'] = parts[0].astype(int)
    games_opps_df[f'{time_col} (Seconds)'] = parts[1].astype(int)

# Preview the last column processed next to its two derived columns
last_time_col = time_columns[-1]
preview_cols = [last_time_col, f'{last_time_col} (Minutes)', f'{last_time_col} (Seconds)']
print(games_opps_df[preview_cols].head())

  DZ possession  DZ possession_minutes  DZ possession_seconds
0         05:54                      5                     54
1         07:00                      7                      0
2         09:50                      9                     50
3         07:04                      7                      4
4         10:01                     10                      1


In [258]:
# Remove the original "MM:SS" text columns in place
games_opps_df.drop(labels=time_columns, axis='columns', inplace=True)

# Preview the result
print(games_opps_df.head())

    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB      3          5   
1  22/02              @ Neumann Knights   4:2         NK      0          4   
2  21/02             vs Wilkes Colonels   1:7        HCB      1          1   
3  21/02             vs Wilkes Colonels   1:7         WC      6          2   
4  14/02  vs Arcadia University Knights   1:4        HCB      1          3   

   Penalties drawn  Faceoffs  Faceoffs won  Faceoffs won, %  ...  \
0                4        40            22             0.55  ...   
1                5        40            18             0.45  ...   
2                2        56            23             0.41  ...   
3                1        56            33             0.59  ...   
4                3        40            17             0.43  ...   

   Offensive play_minutes  Offensive play_seconds  Defensive play_minutes  \
0                      21                    

In [259]:
# Render the frame as an HTML table: no index, truncated to 5 rows, all columns
table_html = games_opps_df.to_html(index=False, max_rows=5, max_cols=None, notebook=True)
display(HTML(table_html))

Date,Opponent,Score,Unnamed: 3,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,Faceoffs won in DZ,"Faceoffs won in DZ, %",Faceoffs in NZ,Faceoffs won in NZ,"Faceoffs won in NZ, %",Faceoffs in OZ,Faceoffs won in OZ,"Faceoffs won in OZ, %",Blocked shots,Faceoffs lost,Scoring chances,CORSI%,Hits against,Power play,Successful power play,"Power play, %",Short-handed,Penalty killing,"Short-handed, %",xG per shot,Opponent's xG per shot,Net xG (xG - Opponent's xG),xG conversion,xG (Expected goals),Opponent's xG,xG per goal,Opponent's xG per goal,Shots,Shots on goal,Shots blocking,Missed shots,% shots on goal,Slapshot,Wrist shot,Power play shots,Short-handed shots,Shootouts scored,Puck battles,Puck battles won,"Puck battles won, %",Puck battles in OZ,Puck battles in NZ,Puck battles in DZ,Dekes,Dekes successful,Dekes unsuccessful,"Dekes successful, %",Passes total,Accurate passes,"Accurate passes, %",Pre-shots passes,Dump ins,Dump outs,Passes to the slot,OZ play,OZ play with shots,"OZ play with shots, %",Counterattacks,Counter-attack with shots,"Counter-attack with shots, %",Takeaways,Takeaways in NZ,Takeaways in DZ,Puck losses,Puck losses in OZ,Puck losses in NZ,Puck losses in DZ,Retrievals,Power play retrievals,Penalty kill retrievals,EV OZ retrievals,EV DZ retrievals,Takeaways in OZ,Loose puck recovery,Opponent’s dump-in retrievals,Entries,Entries via pass,Entries via dump in,Entries via stickhandling,Breakouts,Breakouts via pass,Breakouts via dump out,Breakouts via stickhandling,Type,Penalty time_minutes,Penalty time_seconds,Power play time_minutes,Power play time_seconds,Short-handed time_minutes,Short-handed time_seconds,Offensive play_minutes,Offensive play_seconds,Defensive play_minutes,Defensive play_seconds,OZ possession_minutes,OZ possession_seconds,NZ possession_minutes,NZ possession_seconds,DZ possession_minutes,DZ possession_seconds
22/02,@ Neumann Knights,4:2,HCB,3,5,4,40,22,0.55,3,13,5,0.38,12,9,0.75,15,8,0.53,11,18,13,0.62,4,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.75,0.00,1.90,1.15,0.00,0.00,45,26,8,8,0.58,36,3,0.0,0.0,0,105,60,0.57,0,0,0,16,11,5,0.69,121,90,0.74,7,26,3,1,9,6,0.67,9,8,0.89,93,0,0,66,34,25,25,22,22,22,9.0,13.0,0,35,15,35,13,4,18,35,19,0,16,Even Strength,10,0,0,0,0,0,21,25,16,16,11,3,4,28,5,54
22/02,@ Neumann Knights,4:2,NK,0,4,5,40,18,0.45,4,15,7,0.47,12,3,0.25,13,8,0.62,8,22,10,0.38,3,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,-0.75,0.00,1.15,1.90,0.00,0.00,28,15,11,5,0.54,20,3,0.0,0.0,0,105,45,0.43,0,0,0,7,5,2,0.71,108,68,0.63,0,19,13,6,3,3,1.00,9,4,0.44,84,0,0,60,31,18,18,19,19,19,3.0,16.0,0,26,15,33,10,1,22,26,16,1,9,Even Strength,10,0,0,0,0,0,16,16,21,25,5,31,3,45,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
01/11/24,vs Hilbert College,3:2,HCB,3,4,2,71,42,0.59,0,26,14,0.54,19,12,0.63,26,16,0.62,13,29,26,0.63,1,2.0,0.0,0.0,3.0,3.0,1.0,0.06,0.04,2.20,0.68,4.40,2.20,1.48,1.12,75,51,12,11,0.68,7,47,4.0,0.0,0,264,127,0.48,112,57,95,16,13,3,0.81,248,203,0.82,8,23,27,8,16,13,0.81,5,4,0.80,129,38,59,82,43,26,13,44,1,5,25.0,13.0,32,49,8,49,14,4,31,41,21,2,18,Total,8,0,4,0,8,0,29,48,22,25,14,3,5,40,10,4
01/11/24,vs Hilbert College,3:2,HC,2,2,4,71,29,0.41,1,26,10,0.38,19,7,0.37,26,12,0.46,12,42,12,0.37,0,3.0,0.0,0.0,2.0,2.0,1.0,0.04,0.06,-2.20,0.89,2.20,4.40,1.12,1.48,56,39,13,5,0.70,9,34,14.0,0.0,0,264,137,0.52,95,57,112,16,15,1,0.94,221,157,0.71,6,14,29,3,7,7,1.00,8,4,0.50,115,31,57,87,40,29,18,26,2,2,7.0,15.0,27,43,14,36,8,3,25,32,17,0,15,Total,8,0,8,0,4,0,22,25,29,48,9,21,4,8,8,56


## Data Wrangling

In [260]:
# First five rows, shown as the cell's output
games_opps_df.head(n=5)

Unnamed: 0,Date,Opponent,Score,Unnamed: 3,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",...,Offensive play_minutes,Offensive play_seconds,Defensive play_minutes,Defensive play_seconds,OZ possession_minutes,OZ possession_seconds,NZ possession_minutes,NZ possession_seconds,DZ possession_minutes,DZ possession_seconds
0,22/02,@ Neumann Knights,4:2,HCB,3,5,4,40,22,0.55,...,21,25,16,16,11,3,4,28,5,54
1,22/02,@ Neumann Knights,4:2,NK,0,4,5,40,18,0.45,...,16,16,21,25,5,31,3,45,7,0
2,21/02,vs Wilkes Colonels,1:7,HCB,1,1,2,56,23,0.41,...,15,25,32,11,2,29,3,7,9,50
3,21/02,vs Wilkes Colonels,1:7,WC,6,2,1,56,33,0.59,...,32,11,15,25,19,54,5,12,7,4
4,14/02,vs Arcadia University Knights,1:4,HCB,1,3,3,40,17,0.43,...,17,32,23,24,3,42,3,50,10,1


In [261]:
# A row is flagged as an opponent row when its 'Unnamed: 3' value is anything other than 'HCB'
games_opps_df['isOpponent'] = games_opps_df['Unnamed: 3'].ne('HCB')
print(games_opps_df.loc[:, ['Unnamed: 3', 'isOpponent']].head())

  Unnamed: 3  isOpponent
0        HCB       False
1         NK        True
2        HCB       False
3         WC        True
4        HCB       False


In [262]:
# Work on a string version of 'Opponent'
games_opps_df['Opponent'] = games_opps_df['Opponent'].astype(str)

# '@' anywhere in the text marks the fixture as an away game for the non-opponent row
contains_at = games_opps_df['Opponent'].str.contains('@')
is_opponent = games_opps_df['isOpponent']

# Non-opponent rows take the '@' flag as-is; opponent rows take its negation
games_opps_df['isAway'] = np.where(~is_opponent, contains_at, ~contains_at)

# Text following the first '@' or 'vs' (and any whitespace after it); NaN when neither occurs
extracted_name = games_opps_df['Opponent'].str.extract(r'(?:@|vs)\s*(.*)', expand=False)

# Opponent rows get 'Hood'; all other rows get the extracted team name
games_opps_df['Opponent'] = np.where(is_opponent, 'Hood', extracted_name)

# Preview the derived columns
print(games_opps_df.loc[:, ['Opponent', 'isAway', 'isOpponent']].head())

                     Opponent  isAway  isOpponent
0             Neumann Knights    True       False
1                        Hood   False        True
2             Wilkes Colonels   False       False
3                        Hood    True        True
4  Arcadia University Knights   False       False


In [263]:
# Render the frame as an HTML table: no index, truncated to 5 rows, all columns
table_html = games_opps_df.to_html(index=False, max_rows=5, max_cols=None, notebook=True)
display(HTML(table_html))

Date,Opponent,Score,Unnamed: 3,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,Faceoffs won in DZ,"Faceoffs won in DZ, %",Faceoffs in NZ,Faceoffs won in NZ,"Faceoffs won in NZ, %",Faceoffs in OZ,Faceoffs won in OZ,"Faceoffs won in OZ, %",Blocked shots,Faceoffs lost,Scoring chances,CORSI%,Hits against,Power play,Successful power play,"Power play, %",Short-handed,Penalty killing,"Short-handed, %",xG per shot,Opponent's xG per shot,Net xG (xG - Opponent's xG),xG conversion,xG (Expected goals),Opponent's xG,xG per goal,Opponent's xG per goal,Shots,Shots on goal,Shots blocking,Missed shots,% shots on goal,Slapshot,Wrist shot,Power play shots,Short-handed shots,Shootouts scored,Puck battles,Puck battles won,"Puck battles won, %",Puck battles in OZ,Puck battles in NZ,Puck battles in DZ,Dekes,Dekes successful,Dekes unsuccessful,"Dekes successful, %",Passes total,Accurate passes,"Accurate passes, %",Pre-shots passes,Dump ins,Dump outs,Passes to the slot,OZ play,OZ play with shots,"OZ play with shots, %",Counterattacks,Counter-attack with shots,"Counter-attack with shots, %",Takeaways,Takeaways in NZ,Takeaways in DZ,Puck losses,Puck losses in OZ,Puck losses in NZ,Puck losses in DZ,Retrievals,Power play retrievals,Penalty kill retrievals,EV OZ retrievals,EV DZ retrievals,Takeaways in OZ,Loose puck recovery,Opponent’s dump-in retrievals,Entries,Entries via pass,Entries via dump in,Entries via stickhandling,Breakouts,Breakouts via pass,Breakouts via dump out,Breakouts via stickhandling,Type,Penalty time_minutes,Penalty time_seconds,Power play time_minutes,Power play time_seconds,Short-handed time_minutes,Short-handed time_seconds,Offensive play_minutes,Offensive play_seconds,Defensive play_minutes,Defensive play_seconds,OZ possession_minutes,OZ possession_seconds,NZ possession_minutes,NZ possession_seconds,DZ possession_minutes,DZ possession_seconds,isOpponent,isAway
22/02,Neumann Knights,4:2,HCB,3,5,4,40,22,0.55,3,13,5,0.38,12,9,0.75,15,8,0.53,11,18,13,0.62,4,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.75,0.00,1.90,1.15,0.00,0.00,45,26,8,8,0.58,36,3,0.0,0.0,0,105,60,0.57,0,0,0,16,11,5,0.69,121,90,0.74,7,26,3,1,9,6,0.67,9,8,0.89,93,0,0,66,34,25,25,22,22,22,9.0,13.0,0,35,15,35,13,4,18,35,19,0,16,Even Strength,10,0,0,0,0,0,21,25,16,16,11,3,4,28,5,54,False,True
22/02,Hood,4:2,NK,0,4,5,40,18,0.45,4,15,7,0.47,12,3,0.25,13,8,0.62,8,22,10,0.38,3,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,-0.75,0.00,1.15,1.90,0.00,0.00,28,15,11,5,0.54,20,3,0.0,0.0,0,105,45,0.43,0,0,0,7,5,2,0.71,108,68,0.63,0,19,13,6,3,3,1.00,9,4,0.44,84,0,0,60,31,18,18,19,19,19,3.0,16.0,0,26,15,33,10,1,22,26,16,1,9,Even Strength,10,0,0,0,0,0,16,16,21,25,5,31,3,45,7,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
01/11/24,Hilbert College,3:2,HCB,3,4,2,71,42,0.59,0,26,14,0.54,19,12,0.63,26,16,0.62,13,29,26,0.63,1,2.0,0.0,0.0,3.0,3.0,1.0,0.06,0.04,2.20,0.68,4.40,2.20,1.48,1.12,75,51,12,11,0.68,7,47,4.0,0.0,0,264,127,0.48,112,57,95,16,13,3,0.81,248,203,0.82,8,23,27,8,16,13,0.81,5,4,0.80,129,38,59,82,43,26,13,44,1,5,25.0,13.0,32,49,8,49,14,4,31,41,21,2,18,Total,8,0,4,0,8,0,29,48,22,25,14,3,5,40,10,4,False,False
01/11/24,Hood,3:2,HC,2,2,4,71,29,0.41,1,26,10,0.38,19,7,0.37,26,12,0.46,12,42,12,0.37,0,3.0,0.0,0.0,2.0,2.0,1.0,0.04,0.06,-2.20,0.89,2.20,4.40,1.12,1.48,56,39,13,5,0.70,9,34,14.0,0.0,0,264,137,0.52,95,57,112,16,15,1,0.94,221,157,0.71,6,14,29,3,7,7,1.00,8,4,0.50,115,31,57,87,40,29,18,26,2,2,7.0,15.0,27,43,14,36,8,3,25,32,17,0,15,Total,8,0,8,0,4,0,22,25,29,48,9,21,4,8,8,56,True,True


In [264]:
# 'Score' is "<a>:<b>"; split it into two integer columns (0 and 1)
score_split = games_opps_df['Score'].str.split(':', expand=True).astype(int)
left_score, right_score = score_split[0], score_split[1]
row_is_opponent = games_opps_df['isOpponent']

# Equal scores -> 'Draw'. Otherwise an opponent row wins when the right-hand
# number is larger, a non-opponent row wins when the left-hand number is larger.
is_draw = left_score == right_score
row_team_won = (
    (row_is_opponent & (right_score > left_score)) |
    (~row_is_opponent & (left_score > right_score))
)
games_opps_df['Outcome'] = np.select([is_draw, row_team_won], ['Draw', 'Win'], default='Loss')

# Preview the result
print(games_opps_df.loc[:, ['Score', 'isOpponent', 'Outcome']].head())

  Score  isOpponent Outcome
0   4:2       False     Win
1   4:2        True    Loss
2   1:7       False    Loss
3   1:7        True     Win
4   1:4       False    Loss


In [265]:
# 'Score' is "<a>:<b>"; split it into two integer columns (0 and 1)
score_split = games_opps_df['Score'].str.split(':', expand=True).astype(int)

# Non-opponent rows take the right-hand number, opponent rows the left-hand number
row_is_opponent = games_opps_df['isOpponent']
games_opps_df['OpponentScore'] = np.where(~row_is_opponent, score_split[1], score_split[0])

# Preview the result
print(games_opps_df.loc[:, ['Score', 'isOpponent', 'OpponentScore']].head())

  Score  isOpponent  OpponentScore
0   4:2       False              2
1   4:2        True              4
2   1:7       False              7
3   1:7        True              1
4   1:4       False              4


In [266]:
# 'Score' and 'Unnamed: 3' are removed in place
games_opps_df.drop(labels=['Score', 'Unnamed: 3'], axis='columns', inplace=True)

# Preview the result
print(games_opps_df.head())

    Date                    Opponent  Goals  Penalties  Penalties drawn  \
0  22/02             Neumann Knights      3          5                4   
1  22/02                        Hood      0          4                5   
2  21/02             Wilkes Colonels      1          1                2   
3  21/02                        Hood      6          2                1   
4  14/02  Arcadia University Knights      1          3                3   

   Faceoffs  Faceoffs won  Faceoffs won, %  Hits  Faceoffs in DZ  ...  \
0        40            22             0.55     3              13  ...   
1        40            18             0.45     4              15  ...   
2        56            23             0.41     0              30  ...   
3        56            33             0.59     1               8  ...   
4        40            17             0.43     0              24  ...   

   OZ possession_minutes  OZ possession_seconds  NZ possession_minutes  \
0                     11            

In [267]:
from datetime import datetime

# Function to reformat dates
def reformat_date(date, default_year=None):
    """Convert a ``dd/mm`` or ``dd/mm/yy`` string to ``mm/dd/yy``.

    Parameters
    ----------
    date : str
        Date text in day/month order, with or without a two-digit year.
    default_year : int, optional
        Year to assume when ``date`` has no year component. Only its last two
        digits are used. Defaults to the current calendar year, which makes the
        result depend on when the code is run; pass an explicit year for
        reproducible output.

    Returns
    -------
    str or None
        The date formatted as ``mm/dd/yy``, or ``None`` when ``date`` is not a
        string or cannot be parsed as ``%d/%m/%y``.
    """
    # Missing values arrive as None/NaN/0 rather than text; they have no
    # .split(), so treat them like any other unparseable date.
    if not isinstance(date, str):
        return None

    if default_year is None:
        default_year = datetime.now().year

    try:
        # If the date is missing the year, append the default one
        if len(date.split('/')) == 2:
            date += f"/{default_year % 100:02d}"
        # Parse the date in dd/mm/yy format and reformat to mm/dd/yy
        return datetime.strptime(date, '%d/%m/%y').strftime('%m/%d/%y')
    except ValueError:
        return None  # Handle invalid dates

# Run every 'Date' value through reformat_date and store the result back
reformatted_dates = games_opps_df['Date'].apply(reformat_date)
games_opps_df['Date'] = reformatted_dates

# Preview the result
print(games_opps_df.loc[:, ['Date']].head())

       Date
0  02/22/25
1  02/22/25
2  02/21/25
3  02/21/25
4  02/14/25


In [268]:
# Render the frame as an HTML table: no index, truncated to 5 rows, all columns
table_html = games_opps_df.to_html(index=False, max_rows=5, max_cols=None, notebook=True)
display(HTML(table_html))

Date,Opponent,Goals,Penalties,Penalties drawn,Faceoffs,Faceoffs won,"Faceoffs won, %",Hits,Faceoffs in DZ,Faceoffs won in DZ,"Faceoffs won in DZ, %",Faceoffs in NZ,Faceoffs won in NZ,"Faceoffs won in NZ, %",Faceoffs in OZ,Faceoffs won in OZ,"Faceoffs won in OZ, %",Blocked shots,Faceoffs lost,Scoring chances,CORSI%,Hits against,Power play,Successful power play,"Power play, %",Short-handed,Penalty killing,"Short-handed, %",xG per shot,Opponent's xG per shot,Net xG (xG - Opponent's xG),xG conversion,xG (Expected goals),Opponent's xG,xG per goal,Opponent's xG per goal,Shots,Shots on goal,Shots blocking,Missed shots,% shots on goal,Slapshot,Wrist shot,Power play shots,Short-handed shots,Shootouts scored,Puck battles,Puck battles won,"Puck battles won, %",Puck battles in OZ,Puck battles in NZ,Puck battles in DZ,Dekes,Dekes successful,Dekes unsuccessful,"Dekes successful, %",Passes total,Accurate passes,"Accurate passes, %",Pre-shots passes,Dump ins,Dump outs,Passes to the slot,OZ play,OZ play with shots,"OZ play with shots, %",Counterattacks,Counter-attack with shots,"Counter-attack with shots, %",Takeaways,Takeaways in NZ,Takeaways in DZ,Puck losses,Puck losses in OZ,Puck losses in NZ,Puck losses in DZ,Retrievals,Power play retrievals,Penalty kill retrievals,EV OZ retrievals,EV DZ retrievals,Takeaways in OZ,Loose puck recovery,Opponent’s dump-in retrievals,Entries,Entries via pass,Entries via dump in,Entries via stickhandling,Breakouts,Breakouts via pass,Breakouts via dump out,Breakouts via stickhandling,Type,Penalty time_minutes,Penalty time_seconds,Power play time_minutes,Power play time_seconds,Short-handed time_minutes,Short-handed time_seconds,Offensive play_minutes,Offensive play_seconds,Defensive play_minutes,Defensive play_seconds,OZ possession_minutes,OZ possession_seconds,NZ possession_minutes,NZ possession_seconds,DZ possession_minutes,DZ possession_seconds,isOpponent,isAway,Outcome,OpponentScore
02/22/25,Neumann Knights,3,5,4,40,22,0.55,3,13,5,0.38,12,9,0.75,15,8,0.53,11,18,13,0.62,4,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.75,0.00,1.90,1.15,0.00,0.00,45,26,8,8,0.58,36,3,0.0,0.0,0,105,60,0.57,0,0,0,16,11,5,0.69,121,90,0.74,7,26,3,1,9,6,0.67,9,8,0.89,93,0,0,66,34,25,25,22,22,22,9.0,13.0,0,35,15,35,13,4,18,35,19,0,16,Even Strength,10,0,0,0,0,0,21,25,16,16,11,3,4,28,5,54,False,True,Win,2
02/22/25,Hood,0,4,5,40,18,0.45,4,15,7,0.47,12,3,0.25,13,8,0.62,8,22,10,0.38,3,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,-0.75,0.00,1.15,1.90,0.00,0.00,28,15,11,5,0.54,20,3,0.0,0.0,0,105,45,0.43,0,0,0,7,5,2,0.71,108,68,0.63,0,19,13,6,3,3,1.00,9,4,0.44,84,0,0,60,31,18,18,19,19,19,3.0,16.0,0,26,15,33,10,1,22,26,16,1,9,Even Strength,10,0,0,0,0,0,16,16,21,25,5,31,3,45,7,0,True,False,Loss,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11/01/24,Hilbert College,3,4,2,71,42,0.59,0,26,14,0.54,19,12,0.63,26,16,0.62,13,29,26,0.63,1,2.0,0.0,0.0,3.0,3.0,1.0,0.06,0.04,2.20,0.68,4.40,2.20,1.48,1.12,75,51,12,11,0.68,7,47,4.0,0.0,0,264,127,0.48,112,57,95,16,13,3,0.81,248,203,0.82,8,23,27,8,16,13,0.81,5,4,0.80,129,38,59,82,43,26,13,44,1,5,25.0,13.0,32,49,8,49,14,4,31,41,21,2,18,Total,8,0,4,0,8,0,29,48,22,25,14,3,5,40,10,4,False,False,Win,2
11/01/24,Hood,2,2,4,71,29,0.41,1,26,10,0.38,19,7,0.37,26,12,0.46,12,42,12,0.37,0,3.0,0.0,0.0,2.0,2.0,1.0,0.04,0.06,-2.20,0.89,2.20,4.40,1.12,1.48,56,39,13,5,0.70,9,34,14.0,0.0,0,264,137,0.52,95,57,112,16,15,1,0.94,221,157,0.71,6,14,29,3,7,7,1.00,8,4,0.50,115,31,57,87,40,29,18,26,2,2,7.0,15.0,27,43,14,36,8,3,25,32,17,0,15,Total,8,0,8,0,4,0,22,25,29,48,9,21,4,8,8,56,True,True,Loss,3


In [269]:
# Parse the mm/dd/yy strings into datetime values; anything unparseable becomes NaT
parsed_dates = pd.to_datetime(games_opps_df['Date'], format='%m/%d/%y', errors='coerce')
games_opps_df['Date'] = parsed_dates

# Preview the result
print(games_opps_df.loc[:, ['Date']].head())

        Date
0 2025-02-22
1 2025-02-22
2 2025-02-21
3 2025-02-21
4 2025-02-14


In [270]:
# Two-column table: column name / dtype, for every column of the frame
column_dtypes_df = games_opps_df.dtypes.reset_index()
column_dtypes_df.columns = ['Column', 'Data Type']

# Render the whole table as HTML (no row or column truncation)
dtypes_html = column_dtypes_df.to_html(index=False, max_rows=None, max_cols=None)
display(HTML(dtypes_html))

Column,Data Type
Date,datetime64[ns]
Opponent,object
Goals,int64
Penalties,int64
Penalties drawn,int64
Faceoffs,int64
Faceoffs won,int64
"Faceoffs won, %",float64
Hits,int64
Faceoffs in DZ,int64


In [271]:
# Export the cleaned frame as CSV and as Excel, both without the index column
cleaned_csv_path = "GamesCleaned.csv"
cleaned_xlsx_path = "GamesCleaned.xlsx"
games_opps_df.to_csv(cleaned_csv_path, index=False)
games_opps_df.to_excel(cleaned_xlsx_path, index=False)