## Clustering

First, I am going to run clustering on batter position in the box. My initial idea was to just use euclidean distance, but that doesn't fully capture the essence of where the batter stands. There is more we would like to know:
- Does the batter crowd the plate?
- Do they like to stand closer to the pitcher to beat the break of the pitch?
- How do they angle in the box? Do they face the pitcher more? (Think Manny Machado)
- How far apart are their feet? Does width and stability really matter?
- Are they righty or lefty?

Applying all of these to clustering should give more accurate clusters.

In [73]:
## Importing libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import os
os.environ["OMP_NUM_THREADS"] = "1"
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px


In [62]:
## Loading in the data
data = pd.read_csv("bat_stance_full_data.csv")

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 191 non-null    int64  
 1   id                         191 non-null    int64  
 2   name                       191 non-null    object 
 3   bat_side                   191 non-null    object 
 4   side                       191 non-null    object 
 5   avg_batter_y_position      191 non-null    float64
 6   avg_batter_x_position      191 non-null    float64
 7   avg_foot_sep               191 non-null    float64
 8   avg_stance_angle           191 non-null    float64
 9   avg_intercept_y_vs_batter  191 non-null    float64
 10  avg_intercept_y_vs_plate   191 non-null    float64
 11  player_id                  191 non-null    float64
 12  attempts                   191 non-null    float64
 13  avg_hit_angle              191 non-null    float64

In [64]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,bat_side,side,avg_batter_y_position,avg_batter_x_position,avg_foot_sep,avg_stance_angle,avg_intercept_y_vs_batter,...,home_run,k_percent,bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,babip,rbi,total_distance
0,0,646240,"Devers, Rafael",L,L,22.279378,26.748551,30.539244,-62.2128,24.976058,...,2.0,28.7,16.7,0.213,0.36,0.343,0.703,0.298,12.0,34.811717
1,1,671218,"Ramos, Heliot",R,R,25.126909,26.331432,27.614108,-3.089953,27.658252,...,3.0,25.5,5.9,0.221,0.389,0.275,0.664,0.273,9.0,36.396509
2,2,596115,"Story, Trevor",R,R,25.374647,29.54474,35.944251,-23.651709,34.736567,...,5.0,26.9,3.2,0.337,0.528,0.366,0.894,0.424,14.0,38.945659
3,3,694192,"Chourio, Jackson",R,R,24.504448,29.431268,40.658062,1.125542,28.706127,...,5.0,24.3,1.9,0.26,0.51,0.272,0.782,0.296,19.0,38.297095
4,4,666182,"Bichette, Bo",R,R,26.98413,25.910743,36.238398,2.111014,24.651267,...,0.0,14.8,4.6,0.31,0.39,0.352,0.742,0.365,13.0,37.410024


In [65]:
## Choosing only the variables we want specifically for clustering
cluster_data = data[['player_id', 'name','avg_batter_y_position', 'avg_batter_x_position', 'bat_side', 'avg_foot_sep', 'avg_stance_angle', 'total_distance']]

In [66]:
## One-hot encoding the bat_side variable
cluster_data = pd.get_dummies(cluster_data, columns=['bat_side'], drop_first=False)


In [67]:
## Scaling the data
cols_to_scale = cluster_data.columns.drop(['player_id', 'name'])

scaler = StandardScaler()
cluster_data[cols_to_scale] = scaler.fit_transform(cluster_data[cols_to_scale])
cluster_data.head()

Unnamed: 0,player_id,name,avg_batter_y_position,avg_batter_x_position,avg_foot_sep,avg_stance_angle,total_distance,bat_side_L,bat_side_R
0,646240.0,"Devers, Rafael",-1.367402,-0.360209,0.083382,-4.220017,-1.438881,1.104957,-1.104957
1,671218.0,"Ramos, Heliot",-0.633509,-0.518228,-0.443707,0.716261,-0.925279,-0.905012,0.905012
2,596115.0,"Story, Trevor",-0.56966,0.699085,1.057326,-1.000478,-0.099145,-0.905012,0.905012
3,694192.0,"Chourio, Jackson",-0.793936,0.656098,1.906722,1.068221,-0.309333,-0.905012,0.905012
4,666182.0,"Bichette, Bo",-0.154849,-0.6776,1.110329,1.1505,-0.596817,-0.905012,0.905012


In [68]:
## Converting the data to arrays for KMeans cluster
cluster_array = cluster_data.drop(columns=['player_id', 'name']).values


In [69]:
## Running the Clustering
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_data['cluster'] = kmeans.fit_predict(cluster_array)

In [70]:
## Introducing the cluster back into the full data
data['cluster'] = cluster_data['cluster']

In [71]:
data.sample(5)

Unnamed: 0.1,Unnamed: 0,id,name,bat_side,side,avg_batter_y_position,avg_batter_x_position,avg_foot_sep,avg_stance_angle,avg_intercept_y_vs_batter,...,k_percent,bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,babip,rbi,total_distance,cluster
135,142,805779,"Wilson, Jacob",R,R,28.167075,26.662485,29.220876,-47.155834,29.237548,...,3.7,0.0,0.354,0.5,0.354,0.854,0.351,10.0,38.784948,2
106,108,700932,"Manzardo, Kyle",L,L,22.570532,31.045514,35.357626,-4.349129,33.861267,...,24.7,14.1,0.217,0.58,0.329,0.909,0.182,16.0,38.382976,1
66,66,607208,"Turner, Trea",R,R,27.365713,28.246507,39.405683,0.309842,29.044935,...,19.4,11.8,0.263,0.35,0.366,0.716,0.328,6.0,39.32871,4
143,152,502054,"Pham, Tommy",R,R,31.949628,28.273065,35.996453,-5.590494,31.775066,...,27.3,10.4,0.149,0.194,0.247,0.441,0.213,5.0,42.663157,0
183,267,669242,"Edman, Tommy",R,R,26.069742,28.867889,23.475877,-5.745683,29.9126,...,14.9,3.2,0.261,0.545,0.298,0.843,0.235,15.0,38.897126,0


In [78]:
print(cluster_data['cluster'].unique())
print(cluster_data['cluster'].value_counts())


[3 4 0 2 1]
cluster
1    75
4    38
2    34
0    28
3    16
Name: count, dtype: int64


In [None]:

# Ensure 'cluster' is string type for discrete coloring
cluster_data['cluster'] = cluster_data['cluster'].astype(str)

# Select numeric features (exclude player_id, 'cluster' is already non-numeric)
features = cluster_data.select_dtypes(include=['number']).drop(columns=['player_id'])

# Run t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)
cluster_2d = tsne.fit_transform(features)

# Step 4: Add t-SNE coordinates to DataFrame
cluster_data['PC1'] = cluster_2d[:, 0]
cluster_data['PC2'] = cluster_2d[:, 1]

# Step 5: Create interactive scatter plot
fig = px.scatter(
    cluster_data,
    x='PC1',
    y='PC2',
    color='cluster',
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['name', 'player_id'],
    title='t-SNE Visualization of K-Means Clusters',
    opacity=0.75
)

# Optional styling tweaks
fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='black')))
fig.update_layout(template='plotly_white')

# Show plot
fig.show()




## What Now?

Now that our clusters have been returned, let's look at them a bit deeper, before even looking at names and what my opinions and views of each player are based on history, 2025 stats aside, I am going to see how their data really compares to one another.

In [33]:
## Learning about the statistics of each cluster
numeric_cols = data.select_dtypes(include='number').columns.drop(['id', 'player_id', 'Unnamed: 0'])
grouped_data = data[numeric_cols].groupby(data['cluster']).mean()
grouped_data.head()

Unnamed: 0_level_0,avg_batter_y_position,avg_batter_x_position,avg_foot_sep,avg_stance_angle,avg_intercept_y_vs_batter,avg_intercept_y_vs_plate,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,...,k_percent,bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,babip,rbi,total_distance,cluster
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,29.237199,29.704848,29.306866,-3.798799,30.954862,1.775532,60.357143,13.185714,34.875,110.621429,...,21.503571,9.096429,0.255036,0.40825,0.328393,0.736643,0.301321,10.714286,41.753624,0.0
1,28.749892,27.737155,30.870425,-10.813841,30.832984,2.101165,59.653333,13.184,34.513333,110.172,...,21.353333,10.445333,0.245947,0.41948,0.33004,0.74952,0.284493,10.32,40.087507,1.0
2,29.681284,27.362187,27.428817,-23.878509,31.174546,1.530036,59.058824,13.7,33.141176,110.094118,...,22.073529,8.902941,0.228059,0.383588,0.302324,0.685912,0.263971,9.588235,40.478143,2.0
3,19.949308,25.131449,28.120419,-17.918583,29.122314,9.173006,63.625,12.41875,33.3375,107.3,...,20.4625,10.1625,0.249812,0.399938,0.330062,0.73,0.290563,10.875,32.31864,3.0
4,25.407622,27.530065,32.269279,-5.599034,30.502221,5.142136,62.0,13.834211,34.357895,109.852632,...,20.505263,8.265789,0.237237,0.403737,0.305868,0.709605,0.270368,9.842105,37.579815,4.0


In [37]:
## Specifically interested first in the basic statistics
grouped_data[['batting_avg', 'slg_percent', 'on_base_percent', 'on_base_plus_slg', 'k_percent', 'bb_percent', 'avg_hr_distance', 'ev50', 'total_distance']]

Unnamed: 0_level_0,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,k_percent,bb_percent,avg_hr_distance,ev50,total_distance
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.255036,0.40825,0.328393,0.736643,21.503571,9.096429,364.892857,100.792857,41.753624
1,0.245947,0.41948,0.33004,0.74952,21.353333,10.445333,366.813333,100.669333,40.087507
2,0.228059,0.383588,0.302324,0.685912,22.073529,8.902941,371.058824,100.735294,40.478143
3,0.249812,0.399938,0.330062,0.73,20.4625,10.1625,344.1875,98.5875,32.31864
4,0.237237,0.403737,0.305868,0.709605,20.505263,8.265789,331.973684,100.418421,37.579815


In [40]:
pd.DataFrame(data[data['cluster'] == 0]['name'])


Unnamed: 0,name
5,"Riley, Austin"
9,"Castellanos, Nick"
16,"Adames, Willy"
25,"Judge, Aaron"
45,"Vargas, Miguel"
47,"Vaughn, Andrew"
54,"Chapman, Matt"
60,"Volpe, Anthony"
64,"Contreras, William"
65,"Guerrero Jr., Vladimir"


In [41]:
pd.DataFrame(data[data['cluster'] == 1]['name'])


Unnamed: 0,name
8,"Tucker, Kyle"
10,"Turang, Brice"
12,"Greene, Riley"
15,"Crow-Armstrong, Pete"
17,"Giménez, Andrés"
...,...
165,"De La Cruz, Elly"
166,"Toglia, Michael"
167,"Edman, Tommy"
168,"Rutschman, Adley"


In [42]:
pd.DataFrame(data[data['cluster'] == 2]['name'])


Unnamed: 0,name
7,"Machado, Manny"
11,"Rooker, Brent"
13,"Torkelson, Spencer"
14,"Perez, Salvador"
26,"Swanson, Dansby"
28,"Tatis Jr., Fernando"
29,"Walker, Jordan"
31,"Walker, Christian"
32,"Paredes, Isaac"
37,"Trout, Mike"


In [43]:
pd.DataFrame(data[data['cluster'] == 3]['name'])


Unnamed: 0,name
0,"Devers, Rafael"
27,"Arozarena, Randy"
35,"Suárez, Eugenio"
42,"Lee, Jung Hoo"
52,"Santander, Anthony"
53,"Schwarber, Kyle"
58,"Altuve, Jose"
112,"Scott II, Victor"
123,"Rengifo, Luis"
147,"Frazier, Adam"


In [44]:
pd.DataFrame(data[data['cluster'] == 4]['name'])


Unnamed: 0,name
1,"Ramos, Heliot"
2,"Story, Trevor"
3,"Chourio, Jackson"
4,"Bichette, Bo"
6,"Rodríguez, Julio"
18,"Witt Jr., Bobby"
24,"Contreras, Willson"
33,"Alonso, Pete"
36,"García, Adolis"
51,"Bregman, Alex"
