In [17]:
import pandas as pd
import numpy as np

In [18]:
def load_data(filepath):
    """Read the CSV located at ``filepath`` and return it as a pandas DataFrame.

    Args:
        filepath: Anything ``pd.read_csv`` accepts as its first argument
            (a path string, path object, or file-like object).

    Returns:
        The parsed DataFrame, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

def get_summary_stats(data, group_by, columns):
    """Return ``describe()`` statistics of ``columns`` for each ``group_by`` group.

    Args:
        data: DataFrame holding the grouping column and the columns to summarise.
        group_by: Column name (or names) to group on.
        columns: Column selection passed to the grouped frame before describing.

    Returns:
        The result of ``describe()`` on the grouped selection, one row per group.
    """
    grouped = data.groupby(group_by)
    selected = grouped[columns]
    return selected.describe()

def filter_missed_shots(data, summary_stats, columns, group_by='Club Type', threshold='25%'):
    """Drop shots that fall below a per-group threshold statistic.

    A row counts as a missed shot when, within its own group, the value of at
    least one of ``columns`` is strictly below that group's ``threshold``
    statistic (the 25th percentile by default).

    Args:
        data: DataFrame of shots containing ``group_by`` and every name in ``columns``.
        summary_stats: Per-group statistics as produced by
            ``data.groupby(group_by)[columns].describe()``: one row per group,
            with (column, statistic) MultiIndex columns.
        columns: Names of the columns to compare against their thresholds.
        group_by: Column that identifies the group of each row.
        threshold: Label of the statistic in ``summary_stats`` used as the cut-off.

    Returns:
        A new DataFrame without the missed shots; ``data`` itself is not modified.
    """
    # The mask must carry data's own index. A default RangeIndex only lines up
    # with frames indexed 0..n-1; for any other index the boolean OR would align
    # on labels and silently stop matching the rows it is meant to flag.
    missed = pd.Series(False, index=data.index, dtype=bool)

    for group_name, stats in summary_stats.iterrows():
        # Group membership does not depend on the column, so compute it once per group.
        in_group = data[group_by] == group_name
        for col in columns:
            below_threshold = data[col] < stats[col][threshold]
            missed = missed | (in_group & below_threshold)

    # Filter once at the end; .copy() keeps the result independent of the input frame.
    return data.loc[~missed].copy()

def calculate_group_means(data, group_by):
    """Return the mean of every numeric column for each ``group_by`` group.

    Args:
        data: DataFrame containing the grouping column and the columns to average.
        group_by: Name of the column to group on; it becomes the index of the result.

    Returns:
        DataFrame indexed by the distinct ``group_by`` values, with one column per
        numeric column of ``data`` (the grouping column itself is never averaged).
    """
    numeric_data = data.select_dtypes(include=[np.number])
    # A numeric grouping column would otherwise be concatenated twice below, and
    # groupby rejects a duplicated key column as "not 1-dimensional".
    numeric_data = numeric_data.drop(columns=[group_by], errors='ignore')
    group_column = data[[group_by]]  # keep the key as a one-column DataFrame for concat
    combined_data = pd.concat([group_column, numeric_data], axis=1)
    return combined_data.groupby(group_by).mean()

In [19]:
# Load the merged metrics CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not run
# elsewhere until it is pointed at a local copy of MergedMetrics.csv.
file_path = "/Users/rokbohinc/Documents/Work/Golf_AI/Golfshot_Categoriser/data/extracted/metrics/MergedMetrics.csv"
golf_data = load_data(file_path)

# Columns whose low values are used to flag a shot as missed
columns_of_interest = ['Apex Height', 'Carry Distance']

# Per-club describe() statistics; their 25% entries serve as the filtering thresholds
summary = get_summary_stats(golf_data, 'Club Type', columns_of_interest)

# Drop every shot that is below its club's 25th percentile in any column of interest
clean_data = filter_missed_shots(golf_data, summary, columns_of_interest)

# Per-club mean of every numeric column, computed on the filtered shots only
group_means = calculate_group_means(clean_data, 'Club Type')

## Mean of each numeric parameter for each club type (missed shots removed)

In [20]:
# Display the per-club means table (rows are indexed by Club Type).
# NOTE(review): the saved output includes an averaged "Unnamed: 0" column and an inf
# Smash Factor for 9 Iron — presumably a stray CSV index column and a zero
# Club Speed row; confirm against the raw data.
group_means

Unnamed: 0_level_0,Unnamed: 0,Club Speed,Attack Angle,Club Path,Club Face,Face to Path,Ball Speed,Smash Factor,Launch Angle,Launch Direction,...,Carry Distance,Carry Deviation Angle,Carry Deviation Distance,Total Distance,Total Deviation Angle,Total Deviation Distance,Air Density,Temperature,Air Pressure,Relative Humidity
Club Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3 Wood,41.666667,148.109996,-0.225,-0.601667,0.541667,1.143333,191.945995,1.298915,16.516667,0.368333,...,161.014999,5.561667,15.03,170.581662,5.756667,16.255,1.12378,28.33,97.98,50.0
5 Hybrid,37.930233,126.167439,0.781163,1.639302,1.976512,0.337209,165.432553,1.308281,19.970714,2.000465,...,130.022858,2.412857,5.420952,137.03619,2.422143,5.789048,1.167993,17.00186,97.458372,36.976744
5 Iron,22.947368,134.768839,-0.604211,0.212632,1.548947,1.336316,169.895364,1.265068,14.607895,1.300526,...,131.168421,2.514211,5.203684,144.73421,2.532105,5.521579,1.136474,24.939474,97.656842,30.210526
5 Wood,16.857143,132.330854,1.555714,2.18,4.505714,2.325714,182.499424,1.379245,16.168571,3.974286,...,151.1,6.108571,15.714285,159.152858,6.137143,16.6,1.136194,24.601429,97.637143,39.428571
6 Iron,38.916667,124.621997,2.388889,5.9975,0.752778,-5.244722,161.369497,1.295191,18.458611,1.368472,...,122.783334,-1.566111,-3.9025,128.911112,-1.7525,-4.700694,1.138852,25.647639,98.000556,25.208333
7 Iron,25.864608,129.096272,-0.010843,0.918076,0.877657,-0.040227,163.703213,1.26956,18.463286,0.862475,...,127.774763,0.597604,1.006935,137.927147,0.562473,0.97983,1.1799,14.858671,97.808397,54.130641
8 Iron,53.326087,120.323736,2.355652,9.212391,-0.251087,-9.463478,157.458516,1.31113,19.917826,0.946087,...,120.124564,-3.388261,-7.386304,127.843913,-3.648043,-8.426304,1.136112,26.436739,98.187609,41.26087
9 Iron,30.804651,114.838657,-2.379395,-2.224651,-0.27,1.954651,138.32221,inf,23.896372,-0.473814,...,97.060791,-0.052698,-0.875209,104.692279,-0.019814,-0.853907,1.160812,20.298651,97.78614,4.4
Driver,3.25,147.536996,4.0375,-0.6775,0.805,1.4825,200.555995,1.360567,17.34,0.575,...,180.705002,-2.7525,-8.6425,199.942501,-3.1275,-11.144999,1.12378,28.33,97.98,50.0
Pitching Wedge,14.38961,114.038413,0.092895,4.039474,-1.144539,-5.184013,133.885867,1.180563,26.537386,-0.461039,...,97.094902,-2.978366,-5.443595,104.461765,-3.124837,-6.137255,1.149963,23.26513,98.260714,49.207792


### Number of golf shots per category

#### 7 Iron

In [21]:
# Restrict to 7 Iron shots, then count how many fall under each shot_type label.
# This uses the unfiltered golf_data, so shots removed from clean_data are still counted.
df_I7 = golf_data[golf_data["Club Type"] == "7 Iron"]
df_I7["shot_type"].value_counts()

shot_type
STRAIGHT          136
STRAIGHT_FADE      88
STRAIGHT_HOOK      75
STRAIGHT_DRAW      72
STRAIGHT_SLICE     49
PUSH               41
PUSH_SLICE         37
PUSH_FADE          36
PULL_HOOK          35
PULL_DRAW          26
PULL               21
PUSH_DRAW           8
PUSH_HOOK           7
PULL_FADE           2
PULL_SLICE          1
Name: count, dtype: int64

#### All clubs

In [22]:
# Shot count per shot_type label across every club, taken from the unfiltered golf_data
golf_data["shot_type"].value_counts()

shot_type
STRAIGHT          446
STRAIGHT_DRAW     190
STRAIGHT_FADE     184
STRAIGHT_HOOK     184
STRAIGHT_SLICE    112
PULL_HOOK         108
PUSH               84
PULL               77
PUSH_SLICE         76
PUSH_FADE          61
PULL_DRAW          60
PUSH_DRAW          29
PUSH_HOOK          21
PULL_FADE          13
PULL_SLICE          3
Name: count, dtype: int64