# Import Required Libraries
Import the necessary libraries, including pandas.

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set style for seaborn
sns.set(style="whitegrid")

In [7]:
lines_files = {
    "../data/instat/LinesBasic.xlsx": "Basic",
    "../data/instat/LinesD.xlsx": "DF",
    "../data/instat/LinesF.xlsx": "FW",
    "../data/instat/LinesPK.xlsx": "PK",
    "../data/instat/LinesPP.xlsx": "PP"
}

lines_list = []

for file, line_type in lines_files.items():
    df = pd.read_excel(file)
    df["Type"] = line_type
    lines_list.append(df)

lines_df = pd.concat(lines_list, ignore_index=True)

print(lines_df.head())

lines_df.to_excel("../data/instat/Combined_Lines.xlsx", index=False)

                                                Line Plus/Minus  \
0  4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...          -   
1  15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...          -   
2  26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...          -   
3  15 D. Ormiston, 16 L. Scardicchio, 8 A. Bazyle...          -   
4  15 D. Ormiston, 11 S. Potratz, 42 A. Reider, 6...          -   

   Numbers of shifts Time on ice Goals Opponent's goals Shots Shots on goal  \
0                4.0       01:45     -                -     4             3   
1                1.0       00:43     -                -     4             4   
2                2.0       00:29     -                -     4             1   
3                2.0       00:47     -                -     3             3   
4                2.0       01:16     -                -     3             2   

  Opponent shots total Shots on goal against CORSI CORSI+ CORSI-  \
0                    -                     -     4    

# Load the Excel File
Load the Excel file using an absolute path and display the first few rows of the dataframe.

In [8]:
# Load the Excel File
lines_df = pd.read_excel('../data/instat/Combined_Lines.xlsx')

# Display the first few rows of the dataframe
lines_df.head()

Unnamed: 0,Line,Plus/Minus,Numbers of shifts,Time on ice,Goals,Opponent's goals,Shots,Shots on goal,Opponent shots total,Shots on goal against,CORSI,CORSI+,CORSI-,Short-handed play,Power-play played,Power play time,Successful power play,Short-handed time,Type
0,"4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...",-,4.0,01:45,-,-,4,3,-,-,4,4,-,-,-,-,-,-,Basic
1,"15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...",-,1.0,00:43,-,-,4,4,-,-,4,4,-,-,-,-,-,-,Basic
2,"26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...",-,2.0,00:29,-,-,4,1,-,-,4,4,-,-,-,-,-,-,Basic
3,"15 D. Ormiston, 16 L. Scardicchio, 8 A. Bazyle...",-,2.0,00:47,-,-,3,3,-,-,3,3,-,-,-,-,-,-,Basic
4,"15 D. Ormiston, 11 S. Potratz, 42 A. Reider, 6...",-,2.0,01:16,-,-,3,2,-,-,3,3,-,-,-,-,-,-,Basic


# Replace '-' with 0
Replace '-' values with 0 in the dataframe.

In [9]:
# Replace '-' with 0
lines_df.replace('-', 0, inplace=True)

# Display the first few rows of the dataframe to verify the changes
lines_df.head()

  lines_df.replace('-', 0, inplace=True)


Unnamed: 0,Line,Plus/Minus,Numbers of shifts,Time on ice,Goals,Opponent's goals,Shots,Shots on goal,Opponent shots total,Shots on goal against,CORSI,CORSI+,CORSI-,Short-handed play,Power-play played,Power play time,Successful power play,Short-handed time,Type
0,"4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...",0.0,4.0,01:45,0.0,0.0,4.0,3.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic
1,"15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...",0.0,1.0,00:43,0.0,0.0,4.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic
2,"26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...",0.0,2.0,00:29,0.0,0.0,4.0,1.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic
3,"15 D. Ormiston, 16 L. Scardicchio, 8 A. Bazyle...",0.0,2.0,00:47,0.0,0.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic
4,"15 D. Ormiston, 11 S. Potratz, 42 A. Reider, 6...",0.0,2.0,01:16,0.0,0.0,3.0,2.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic


# Convert '0' Values to '00:00'
Convert '0' values to '00:00' in specified time columns if they exist.

In [10]:
# Convert '0' values to '00:00' in specified time columns if they exist
time_columns = ['Time on ice', 'Power play time']
for col in time_columns:
    if col in lines_df.columns:
        lines_df[col] = lines_df[col].replace('0', '00:00')
        # Fill NaN values with '00:00' before splitting
        lines_df[col] = lines_df[col].fillna('00:00')
        # Split the time into minutes and seconds
        lines_df[[f'{col} (mins)', f'{col} (secs)']] = lines_df[col].str.split(':', expand=True)
        # Fill NaN values in the new columns with 0 and convert to integers
        lines_df[f'{col} (mins)'] = lines_df[f'{col} (mins)'].fillna(0).astype(int)
        lines_df[f'{col} (secs)'] = lines_df[f'{col} (secs)'].fillna(0).astype(int)
    else:
        print(f"Column '{col}' not found in DataFrame")

# Display the first few rows of the dataframe to verify the changes
lines_df.head()

Unnamed: 0,Line,Plus/Minus,Numbers of shifts,Time on ice,Goals,Opponent's goals,Shots,Shots on goal,Opponent shots total,Shots on goal against,CORSI,CORSI+,CORSI-,Short-handed play,Power-play played,Power play time,Successful power play,Short-handed time,Type,Time on ice (mins),Time on ice (secs),Power play time (mins),Power play time (secs)
0,"4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...",0.0,4.0,01:45,0.0,0.0,4.0,3.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,1,45,0,0
1,"15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...",0.0,1.0,00:43,0.0,0.0,4.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,0,43,0,0
2,"26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...",0.0,2.0,00:29,0.0,0.0,4.0,1.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,0,29,0,0
3,"15 D. Ormiston, 16 L. Scardicchio, 8 A. Bazyle...",0.0,2.0,00:47,0.0,0.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic,0,47,0,0
4,"15 D. Ormiston, 11 S. Potratz, 42 A. Reider, 6...",0.0,2.0,01:16,0.0,0.0,3.0,2.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic,1,16,0,0


# Split Time Columns into Minutes and Seconds
Split the time columns into minutes and seconds, fill NaN values with 0, and convert to integers.

In [11]:
# Split the time columns into minutes and seconds, fill NaN values with 0, and convert to integers
time_columns = ['Time on ice', 'Power play time']
for col in time_columns:
    if col in lines_df.columns:
        lines_df[col] = lines_df[col].replace('0', '00:00')
        # Fill NaN values with '00:00' before splitting
        lines_df[col] = lines_df[col].fillna('00:00')
        # Split the time into minutes and seconds
        lines_df[[f'{col} (mins)', f'{col} (secs)']] = lines_df[col].str.split(':', expand=True)
        # Fill NaN values in the new columns with 0 and convert to integers
        lines_df[f'{col} (mins)'] = lines_df[f'{col} (mins)'].fillna(0).astype(int)
        lines_df[f'{col} (secs)'] = lines_df[f'{col} (secs)'].fillna(0).astype(int)
    else:
        print(f"Column '{col}' not found in DataFrame")

# Display the first few rows of the dataframe to verify the changes
lines_df.head()

Unnamed: 0,Line,Plus/Minus,Numbers of shifts,Time on ice,Goals,Opponent's goals,Shots,Shots on goal,Opponent shots total,Shots on goal against,CORSI,CORSI+,CORSI-,Short-handed play,Power-play played,Power play time,Successful power play,Short-handed time,Type,Time on ice (mins),Time on ice (secs),Power play time (mins),Power play time (secs)
0,"4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...",0.0,4.0,01:45,0.0,0.0,4.0,3.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,1,45,0,0
1,"15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...",0.0,1.0,00:43,0.0,0.0,4.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,0,43,0,0
2,"26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...",0.0,2.0,00:29,0.0,0.0,4.0,1.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0,0.0,0,Basic,0,29,0,0
3,"15 D. Ormiston, 16 L. Scardicchio, 8 A. Bazyle...",0.0,2.0,00:47,0.0,0.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic,0,47,0,0
4,"15 D. Ormiston, 11 S. Potratz, 42 A. Reider, 6...",0.0,2.0,01:16,0.0,0.0,3.0,2.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0,0.0,0,Basic,1,16,0,0


In [12]:
import pandas as pd

# Optimal 5v5 line formations based on CORSI
optimal_5v5_lines = lines_df.sort_values(by='CORSI', ascending=False).head(5)
optimal_5v5_lines['Rank'] = range(1, len(optimal_5v5_lines) + 1)
print("Optimal 5v5 Line Formations:")
print(optimal_5v5_lines[['Rank', 'Line', 'CORSI']])

# Optimal power play line formations based on goals
optimal_power_play_lines = lines_df.sort_values(by='Goals', ascending=False).head(5)
optimal_power_play_lines['Rank'] = range(1, len(optimal_power_play_lines) + 1)
print("\nOptimal Power Play Line Formations:")
print(optimal_power_play_lines[['Rank', 'Line', 'Goals']])

# Optimal penalty kill line formations based on shots on goal against
optimal_penalty_kill_lines = lines_df.sort_values(by="Opponent's goals", ascending=True).head(5)
optimal_penalty_kill_lines['Rank'] = range(1, len(optimal_penalty_kill_lines) + 1)
print("\nOptimal Penalty Kill Line Formations:")
print(optimal_penalty_kill_lines[['Rank', 'Line', "Opponent's goals"]])

# Calculate the best offensive lines based on goals scored
best_offensive_lines = lines_df.sort_values(by='Goals', ascending=False).head(5).reset_index().rename(columns={'index': 'Original Index'})
best_offensive_lines['Rank'] = range(1, len(best_offensive_lines) + 1)
print("Best Offensive Lines:")
print(best_offensive_lines[['Rank', 'Original Index', 'Line', 'Goals']])

# Calculate the best defensive lines based on goals against
best_defensive_lines = lines_df.sort_values(by="Opponent's goals", ascending=True).head(5).reset_index().rename(columns={'index': 'Original Index'})
best_defensive_lines['Rank'] = range(1, len(best_defensive_lines) + 1)
print("\nBest Defensive Lines:")
print(best_defensive_lines[['Rank', 'Original Index', 'Line', "Opponent's goals"]])


Optimal 5v5 Line Formations:
     Rank                                               Line  CORSI
2       1  26 J. Marano, 4 K. MacKinnon, 77 M. Cheesebrou...    4.0
854     2  47 K. Kratzer, 16 L. Scardicchio, 19 L. Szczub...    4.0
0       3  4 K. MacKinnon, 18 A. Wilbur, 6 A. Wilson, 10 ...    4.0
1       4  15 D. Ormiston, 11 S. Potratz, 6 A. Wilson, 12...    4.0
945     5             11 S. Potratz, 12 E. Bynan, 88 R. Gall    4.0

Optimal Power Play Line Formations:
      Rank                                               Line  Goals
52       1  26 J. Marano, 19 L. Szczubiala, 18 A. Wilbur, ...    1.0
1476     2  15 D. Ormiston, 47 K. Kratzer, 12 E. Bynan, 77...    1.0
861      3       47 K. Kratzer, 4 K. MacKinnon, 11 S. Potratz    1.0
80       4  47 K. Kratzer, 4 K. MacKinnon, 11 S. Potratz, ...    1.0
955      5          47 K. Kratzer, 18 A. Wilbur, 13 S. Ancona    1.0

Optimal Penalty Kill Line Formations:
      Rank                                               Line  \
1456    

In [14]:
lines_df.to_excel('../DataCleaning/lines_cleaned.xlsx', index=False)