In [None]:
import pandas as pd
import glob
import os

# Define the folder path containing the Excel files
folder_path = "../../data/instat/"

# Pattern matching Excel files starting with "Games" and ending with ".xlsx"
file_pattern = os.path.join(folder_path, "Games*.xlsx")
files = glob.glob(file_pattern)

print("Found files:")
for f in files:
    print(f)

# Define the mapping from part of the filename to game type label
type_mapping = {
    "TotalOpp": "Total",
    "EVOpp": "Even Strength",
    "PPOpp": "Power Play",
    "PKOpp": "Penalty Kill"
}

# List to collect DataFrames after processing each file
dfs = []

# Specify the columns with merged cells.
merged_cols = ['Date', 'Opponent', 'Score', 'Penalty time']

# Process each file
for file in files:
    # Read the file using the openpyxl engine
    df = pd.read_excel(file, engine='openpyxl')
    print(f"\nProcessing file: {file}")
    print("Original DataFrame:")
    print(df.head())

    # Replace placeholder dashes '-' with NA in the merged columns
    df[merged_cols] = df[merged_cols].replace('-', pd.NA)

    # Forward-fill the Date, Opponent, and Score columns so that each game group has the same info
    df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')

    # For the "Penalty time" column, if the current row belongs to the same game (i.e. Date, Opponent, Score match),
    # then override its value with the previous row's value.
    for i in range(1, len(df)):
        if df.loc[i, ['Date', 'Opponent', 'Score']].equals(df.loc[i-1, ['Date', 'Opponent', 'Score']]):
            df.loc[i, 'Penalty time'] = df.loc[i-1, 'Penalty time']

    # Optionally remove rows that might represent "average per game" entries.
    avg_pattern = r'(?i)avg|average'
    mask = df[merged_cols].apply(lambda col: col.astype(str).str.contains(avg_pattern, na=False))
    df = df[~mask.any(axis=1)]

    # Infer the game type from the filename.
    base_name = os.path.basename(file)
    game_type = None
    for key, label in type_mapping.items():
        if key in base_name:
            game_type = label
            break
    # Fallback if no matching key is found.
    if game_type is None:
        game_type = "Unknown"

    # Add a new column indicating what the file represents
    df['Type'] = game_type

    # Append the processed DataFrame to our list
    dfs.append(df)

# Concatenate all processed DataFrames into a single DataFrame.
games_df = pd.concat(dfs, ignore_index=True)

print("\nConcatenated DataFrame:")
print(games_df.head())

# Optionally, save the concatenated DataFrame to a new Excel or CSV file:
# games_df.to_excel("ConcatenatedGamesData.xlsx", index=False)
# games_df.to_csv("ConcatenatedGamesData.csv", index=False)


Found files:
../../data/instat\GamesEVOpp.xlsx
../../data/instat\GamesPKOpp.xlsx
../../data/instat\GamesPPOpp.xlsx
../../data/instat\GamesTotalOpp.xlsx

Processing file: ../../data/instat\GamesEVOpp.xlsx
Original DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    3.0        5.0   
1      -                              -     -         NK    0.0        4.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    1.0        1.0   
3      -                              -     -         WC    6.0        2.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    1.0        3.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              4.0        10:00        40            22  ...   
1              5.0        08:00        40            18  ...   
2              2.0        02:00        56            23  ...   
3              1.0        04:00        56          

  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')



Processing file: ../../data/instat\GamesPKOpp.xlsx
Original DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    0.0        1.0   
1      -                              -     -         NK    0.0        0.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    0.0        1.0   
3      -                              -     -         WC    0.0        0.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    0.0        0.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              0.0        02:00      17.0           6.0  ...   
1              0.0        00:00       4.0           0.0  ...   
2              0.0        02:00       2.0           1.0  ...   
3              1.0        00:00       6.0           4.0  ...   
4              0.0        00:00       7.0           1.0  ...   

  Loose puck recovery  Opponent’s dump-in retrievals  Entries  \
0        

  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')
  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')



Processing file: ../../data/instat\GamesPPOpp.xlsx
Original DataFrame:
    Date                       Opponent Score Unnamed: 3  Goals  Penalties  \
0  22/02              @ Neumann Knights   4:2        HCB    1.0        0.0   
1      -                              -     -         NK    2.0        0.0   
2  21/02             vs Wilkes Colonels   1:7        HCB    0.0        1.0   
3      -                              -     -         WC    1.0        0.0   
4  14/02  vs Arcadia University Knights   1:4        HCB    0.0        1.0   

   Penalties drawn Penalty time  Faceoffs  Faceoffs won  ...  \
0              0.0        00:00       4.0           4.0  ...   
1              1.0        00:00      17.0          11.0  ...   
2              0.0        02:00       6.0           2.0  ...   
3              1.0        00:00       2.0           1.0  ...   
4              1.0        02:00       5.0           4.0  ...   

  Loose puck recovery  Opponent’s dump-in retrievals  Entries  \
0        

  df[['Date', 'Opponent', 'Score']] = df[['Date', 'Opponent', 'Score']].fillna(method='ffill')


In [30]:
df.head(50)


Unnamed: 0,Date,Opponent,Score,Unnamed: 3,Goals,Penalties,Penalties drawn,Penalty time,Faceoffs,Faceoffs won,...,Opponent’s dump-in retrievals,Entries,Entries via pass,Entries via dump in,Entries via stickhandling,Breakouts,Breakouts via pass,Breakouts via dump out,Breakouts via stickhandling,Game Type
0,22/02,@ Neumann Knights,4:2,HCB,4.0,6.0,4.0,12:00,61,32,...,18,39,13,4.0,22,41,21,0.0,20,Total
1,22/02,@ Neumann Knights,4:2,NK,2.0,4.0,6.0,12:00,61,29,...,16,43,11,2.0,30,34,19,1.0,14,Total
2,21/02,vs Wilkes Colonels,1:7,HCB,1.0,3.0,2.0,06:00,64,26,...,11,23,5,1.0,17,35,12,3.0,20,Total
3,21/02,vs Wilkes Colonels,1:7,WC,7.0,2.0,3.0,06:00,64,38,...,12,55,15,9.0,31,31,18,1.0,12,Total
4,14/02,vs Arcadia University Knights,1:4,HCB,1.0,4.0,4.0,08:00,52,22,...,16,25,7,4.0,14,43,23,1.0,19,Total
5,14/02,vs Arcadia University Knights,1:4,AUK,4.0,4.0,4.0,08:00,52,30,...,18,61,13,5.0,43,41,30,0.0,11,Total
6,08/02,@ Alvernia Wolves,0:4,HCB,0.0,7.0,3.0,37:00,46,21,...,10,22,3,4.0,15,25,15,1.0,9,Total
7,08/02,@ Alvernia Wolves,0:4,AW,4.0,3.0,7.0,37:00,46,25,...,15,45,10,5.0,30,36,22,0.0,14,Total
8,07/02,vs Stevenson Mustangs,4:7,HCB,4.0,2.0,4.0,04:00,51,26,...,11,40,12,7.0,21,37,22,1.0,14,Total
9,07/02,vs Stevenson Mustangs,4:7,SM,7.0,4.0,2.0,04:00,51,25,...,20,57,15,1.0,41,50,31,3.0,16,Total
