In [1]:
import pandas as pd
import altair as alt
import statsmodels.formula.api as smf


In [2]:
# Player Attributes Sheet from https://docs.google.com/spreadsheets/d/1CnPsdnZc4lFObYpu6R5lEYzRLmIBh-I03JFsTX6AaXE/edit#gid=108546025
# fielding data from SIBR discord
# players from Geography Google Drive (SIBR Discord)

# Load the per-play fielding log and normalise the spelling of fielder names.
# The "fielder" column is later renamed to "Name" and used as a join key, so
# presumably these fixes make the names match the other sheets -- confirm.
fielding_raw = pd.read_csv("../data/fielding-thru-game-72.csv").assign(
    fielder=lambda d: d["fielder"]
    # "&#x27;" is the HTML entity for an apostrophe.
    .str.replace("&#x27;", "'", regex=False)
    # Only append the period to a standalone "Jr" that does not already have
    # one, so the clean-up is idempotent ("Jr." never becomes "Jr..").
    .str.replace(r"\bJr\b(?!\.)", "Jr.", regex=True)
    # Literal replacements; `regex` is spelled out because its default differs
    # between pandas versions.
    .str.replace("al-", "Al-", regex=False)
    .str.replace("O'c", "O'C", regex=False)
)

# Player attribute sheet. `header=1` takes the column names from the second
# line of the file.
attributes_raw = (
    pd.read_csv("../data/Player Attributes Sheet - All Players.csv", header=1)
    # last 13 rows are aggregates
    .iloc[:-13]
    # pitchers don't field
    .query("Position != 'Pitcher'")
)

# Player sheet; its "Player" column is renamed to "Name", the key used for the
# joins further down.
players = pd.read_csv("../data/players.csv")
players = players.rename(columns={"Player": "Name"})


In [3]:
# Column -> dtype mapping for the attribute sheet: every listed column is
# cast to float.
dtypes = {
    column: float
    for column in (
        "Rating",
        "sight",
        "thwack",
        "fero",
        "Batting",
        "control",
        "stuff",
        "guile",
        "Pitching",
        "reach",
        "magnet",
        "reflex",
        "Defense",
        "hustle",
        "stealth",
        "dodge",
        "Running",
        "thrive",
        "survive",
        "drama",
        "Vibes",
    )
}

# Cast the columns listed in `dtypes` to float; columns not named there keep
# their current dtype.
attributes_raw = attributes_raw.astype(dtypes)


In [4]:
# Sum of the `hit` column for every (fielder, hit location) pair, kept as a
# one-column DataFrame.
hit_against_fielder_loc = (
    fielding_raw.groupby(by=["fielder", "hit_loc"])["hit"].agg("sum").to_frame()
)


In [5]:
# Representative (x, y) point for each hit location, compared against a
# fielder's X/Y when play distances are computed.
# "into play" has no location (None); rows with that hit_loc are filtered out
# before any distance is calculated.
# NOTE(review): "The Wall" and "Deep Center Field" share (5, 5) -- confirm
# that this is intentional.
centroids = dict(
    [
        ("The Wall", (5, 5)),
        ("Right Field", (4, 1)),
        ("Left Field", (1, 4)),
        ("Deep Center Field", (5, 5)),
        ("Center Field", (4, 4)),
        ("Infield", (1, 1)),
        ("Deep Right Field", (5, 1)),
        ("into play", None),
        ("Deep Left Field", (1, 5)),
    ]
)


In [6]:
# One row per fielding record with a usable location:
#   1. join the play log to the player sheet on "Name" (default inner join,
#      so records without a matching player are dropped),
#   2. discard plays recorded only as "into play", which have no centroid,
#   3. attach the (x, y) centroid of the hit location.
fielding_locs = (
    fielding_raw.rename(columns={"fielder": "Name"})
    .merge(players, on="Name")
    .loc[lambda d: d["hit_loc"] != "into play"]
    .assign(centroid=lambda d: d["hit_loc"].map(centroids))
)


def distance_manhattan(row):
    """Return the Manhattan (L1) distance between a row's X/Y and its centroid.

    `row` must support item access for "X", "Y" and "centroid", where
    "centroid" is an indexable (x, y) pair.
    """
    dx = row["X"] - row["centroid"][0]
    dy = row["Y"] - row["centroid"][1]
    return abs(dx) + abs(dy)


def distance_euclidean(row):
    """Return the Euclidean (L2) distance between a row's X/Y and its centroid.

    `row` must support item access for "X", "Y" and "centroid", where
    "centroid" is an indexable (x, y) pair.
    """
    dx = row["X"] - row["centroid"][0]
    dy = row["Y"] - row["centroid"][1]
    squared_distance = dx ** 2 + dy ** 2
    return squared_distance ** 0.5


# Add one distance column per metric, computed row by row from each record's
# X/Y and the centroid of its hit location.
for column, metric in (
    ("play_dist manhattan", distance_manhattan),
    ("play_dist euclidean", distance_euclidean),
):
    fielding_locs[column] = fielding_locs.apply(metric, axis=1)


In [7]:
# Boolean mask, aligned to `fielding_locs`, flagging the "weird" records:
# X is 3 or 4, Y is 0, and the play was recorded in "Left Field".
# Built from vectorised column comparisons rather than a row-wise apply, so it
# is fast and remains a boolean Series even when the frame is empty.
weird_fielding_locs = (
    fielding_locs["X"].isin([3, 4])
    & fielding_locs["Y"].eq(0)
    & fielding_locs["hit_loc"].eq("Left Field")
)


In [8]:
# Hit rate and raw counts for every (Manhattan, Euclidean) distance pair,
# excluding the weird fielding locations. The filter + groupby is built once
# and reused instead of being recomputed for each column.
distance_groups = fielding_locs[~weird_fielding_locs].groupby(
    ["play_dist manhattan", "play_dist euclidean"]
)["hit"]
count_hits = distance_groups.sum()  # sum of the `hit` column per group
count_total = distance_groups.size()  # number of records per group

# The rate column keeps the name "hit"; the counts are added beside it.
hit_by_distance_both = (count_hits / count_total).to_frame()
hit_by_distance_both["count_hits"] = count_hits
hit_by_distance_both["count_total"] = count_total

print("Euclidean vs. Manhattan Distance Comparison")
print(hit_by_distance_both.reset_index().round(2))


Euclidean vs. Manhattan Distance Comparison
    play_dist manhattan  play_dist euclidean   hit  count_hits  count_total
0                     0                 0.00  0.20         589         2965
1                     1                 1.00  0.20        1840         9121
2                     2                 1.41  0.23        1419         6262
3                     2                 2.00  0.26         681         2577
4                     3                 2.24  0.30        1222         4043
5                     3                 3.00  0.32         649         2036
6                     4                 2.83  0.31         283          914
7                     4                 3.16  0.31         919         2950
8                     4                 4.00  0.31         399         1271
9                     5                 3.61  0.33         389         1191
10                    5                 4.12  0.33         593         1795
11                    5                 5.00

In [9]:
# Hit rate (sum of `hit` / number of records) per Manhattan play distance,
# once on every record and once without the weird fielding locations.
hit_by_distance_all = fielding_locs.groupby(["play_dist manhattan"])["hit"].pipe(
    lambda hits: hits.sum() / hits.size()
)
hit_by_distance_all_no_weirds = (
    fielding_locs[~weird_fielding_locs]
    .groupby(["play_dist manhattan"])["hit"]
    .pipe(lambda hits: hits.sum() / hits.size())
)

# Put the two rates side by side, then reshape to long form with one row per
# (dataset, distance) pair for plotting.
hbd_all = (
    hit_by_distance_all.to_frame("manhattan_hit")
    .assign(manhattan_no_weirds_hit=hit_by_distance_all_no_weirds)
    .unstack(level=1)
    .reset_index()
    .rename(columns={"level_0": "dataset", 0: "hit %"})
)
hbd_all


Unnamed: 0,dataset,play_dist manhattan,hit %
0,manhattan_hit,0,0.198651
1,manhattan_hit,1,0.201732
2,manhattan_hit,2,0.237583
3,manhattan_hit,3,0.307781
4,manhattan_hit,4,0.311782
5,manhattan_hit,5,0.324102
6,manhattan_hit,6,0.304172
7,manhattan_hit,7,0.273429
8,manhattan_hit,8,0.379386
9,manhattan_hit,9,0.382609


In [10]:
# Scatter of hit rate against Manhattan play distance, one colour per dataset
# (all records vs. records without the weird fielding locations).
hit_rate_encodings = dict(x="play_dist manhattan", y="hit %", color="dataset")
c1 = (
    alt.Chart(hbd_all, title="% Hit by Play Distance")
    .mark_circle()
    .encode(**hit_rate_encodings)
)
display(c1)


  for col_name, dtype in df.dtypes.iteritems():


In [11]:
# Hit rate (sum of `hit` / number of records) for each combination of hit
# location and Manhattan play distance.
hit_by_distance_loc = (
    fielding_locs.groupby(["hit_loc", "play_dist manhattan"])["hit"]
    .pipe(lambda hits: hits.sum() / hits.size())
)
# Show only the last 20 rows to keep the output short.
hit_by_distance_loc.to_frame().reset_index().tail(20)


Unnamed: 0,hit_loc,play_dist manhattan,hit
58,Right Field,0,0.178182
59,Right Field,1,0.201268
60,Right Field,2,0.24781
61,Right Field,3,0.323359
62,Right Field,4,0.294201
63,Right Field,5,0.321534
64,Right Field,6,0.428571
65,Right Field,7,0.310811
66,Right Field,8,0.258621
67,The Wall,0,0.194805


In [12]:
# Flat frame (index levels become ordinary columns) for plotting.
hbd_loc = hit_by_distance_loc.reset_index()


In [13]:
# Scatter of hit rate against Manhattan play distance, coloured by hit
# location. The final expression is what the cell displays.
hit_loc_points = alt.Chart(hbd_loc, title=r"Hit % by play_distance").mark_circle()
hit_loc_points.encode(x="play_dist manhattan", y="hit", color="hit_loc")


  for col_name, dtype in df.dtypes.iteritems():


## Regression town

Let's figure out the relationship between play distance and any relevant player attributes.

**We are going to use the fielding data _without_ the weird fielding locations.**

In [14]:
# Record counts with and without the weird fielding locations.
for label, frame in (
    ("N Fielding Records", fielding_locs),
    ("N Fielding Records (No Weird)", fielding_locs[~weird_fielding_locs]),
):
    print(label, len(frame))


N Fielding Records 40057
N Fielding Records (No Weird) 38769


In [15]:
# Modelling frame: non-weird fielding records joined to the player attributes
# on "Name" (default inner join, so records without attributes are dropped).
# The Manhattan distance column is renamed to `play_dist` so it can be used
# directly in model formulas.
attributes_with_fielding = pd.merge(
    fielding_locs[~weird_fielding_locs], attributes_raw, on="Name"
).rename(columns={"play_dist manhattan": "play_dist"})


In [16]:
# Individual player attributes to test, three at a time in the same order as
# the attribute sheet columns. The aggregate rating columns are left out.
player_attributes = (
    ["sight", "thwack", "fero"]
    + ["control", "stuff", "guile"]
    + ["reach", "magnet", "reflex"]
    + ["hustle", "stealth", "dodge"]
    + ["thrive", "survive", "drama"]
)


In [17]:
# Fit one logistic regression per attribute, `hit ~ play_dist*<attr>`, print
# its summary, and keep the fit statistics plus the p-value of the
# play_dist:<attr> interaction term for comparison.
model_stats = []
for attr in player_attributes:
    interaction_term = f"play_dist:{attr}"
    fit_result = smf.logit(
        formula=f"hit ~ play_dist*{attr}", data=attributes_with_fielding
    ).fit()

    print("---", attr, "---")
    print(fit_result.summary2())

    model_stats.append(
        dict(
            attr=attr,
            aic=fit_result.aic,
            bic=fit_result.bic,
            prsquared=fit_result.prsquared,
            pval_interaction=fit_result.pvalues[interaction_term],
        )
    )


Optimization terminated successfully.
         Current function value: 0.570754
         Iterations 5
--- sight ---
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.012      
Dependent Variable: hit              AIC:              43840.7954 
Date:               2023-01-15 14:18 BIC:              43875.0186 
No. Observations:   38399            Log-Likelihood:   -21916.    
Df Model:           3                LL-Null:          -22176.    
Df Residuals:       38395            LLR p-value:      3.9026e-112
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
------------------------------------------------------------------
                   Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
Intercept         -1.3771   0.0479 -28.7485 0.0000 -1.4710 -1.2832
play_dist          0.1247   0.0133   9.

In [18]:
pd.DataFrame(model_stats).sort_values("pval_interaction")


Unnamed: 0,attr,aic,bic,prsquared,pval_interaction
6,reach,43823.867493,43858.09064,0.01208,3.5e-05
7,magnet,43141.121729,43175.344875,0.027474,0.002726
8,reflex,43835.484182,43869.707329,0.011819,0.009863
1,thwack,43832.465606,43866.688753,0.011887,0.031729
14,drama,43820.993711,43855.216857,0.012145,0.068851
11,dodge,43839.516933,43873.74008,0.011728,0.139791
0,sight,43840.795439,43875.018585,0.011699,0.258548
2,fero,43837.064509,43871.287656,0.011783,0.48647
10,stealth,43835.751936,43869.975083,0.011812,0.521318
3,control,43838.322242,43872.545388,0.011755,0.526349


**UGH, all the defensive stats (and `thwack`) have significant interaction terms. Are they all important?**

In [19]:
# One "play_dist*<attribute>" term per attribute; in the formula syntax `a*b`
# stands for both main effects plus their interaction.
model = [f"play_dist*{attribute}" for attribute in player_attributes]
# Right-hand side of the full model formula.
model_str = " + ".join(model)


In [20]:
# Full model: every attribute with its play_dist interaction in one logit.
full_model = smf.logit(formula=f"hit ~ {model_str}", data=attributes_with_fielding)
l = full_model.fit()
print(l.summary2())


Optimization terminated successfully.
         Current function value: 0.560817
         Iterations 5
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.029      
Dependent Variable: hit              AIC:              43133.5946 
Date:               2023-01-15 14:18 BIC:              43407.3798 
No. Observations:   38399            Log-Likelihood:   -21535.    
Df Model:           31               LL-Null:          -22176.    
Df Residuals:       38367            LLR p-value:      6.1507e-250
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
------------------------------------------------------------------
                   Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
Intercept         -1.1239   0.1529  -7.3481 0.0000 -1.4236 -0.8241
play_dist          0.2343   0.0453   5.1729 0.0000  0

All the defense stats are important and have significant interaction terms! Let's model with only them to reduce the noise of the full model and see how that model compares.

In [21]:
# Reduced model: only reach, magnet and reflex, each with its play_dist
# interaction. The formula is a plain string (it has no placeholders, so the
# f-prefix was dropped).
l = smf.logit(
    formula="hit ~ play_dist*reach + play_dist*magnet + play_dist*reflex",
    data=attributes_with_fielding,
).fit()
print(l.summary2())


Optimization terminated successfully.
         Current function value: 0.561313
         Iterations 5
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.028      
Dependent Variable: hit              AIC:              43123.6938 
Date:               2023-01-15 14:18 BIC:              43192.1401 
No. Observations:   38399            Log-Likelihood:   -21554.    
Df Model:           7                LL-Null:          -22176.    
Df Residuals:       38391            LLR p-value:      2.1971e-264
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
------------------------------------------------------------------
                   Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
Intercept         -1.2464   0.0718 -17.3651 0.0000 -1.3871 -1.1057
play_dist          0.2600   0.0215  12.1170 0.0000  0

So we don't lose a lot of Pseudo R^2 and the AIC and BIC values are lower, so let's say this model is 'better'.

Let's model the `main effects` only to see if these interaction terms are even important.

In [22]:
# Main-effects-only counterpart of the reduced model: the same predictors
# without any play_dist interaction terms. The formula is a plain string (it
# has no placeholders, so the f-prefix was dropped).
l = smf.logit(
    formula="hit ~ play_dist + reach + magnet + reflex",
    data=attributes_with_fielding,
).fit()
print(l.summary2())


Optimization terminated successfully.
         Current function value: 0.561751
         Iterations 5
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.027      
Dependent Variable: hit              AIC:              43151.3240 
Date:               2023-01-15 14:18 BIC:              43194.1029 
No. Observations:   38399            Log-Likelihood:   -21571.    
Df Model:           4                LL-Null:          -22176.    
Df Residuals:       38394            LLR p-value:      9.1734e-261
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
-------------------------------------------------------------------
               Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
Intercept     -0.9101    0.0416  -21.8530  0.0000  -0.9917  -0.8285
play_dist      0.1412    0.0061   23.0238  0.0000

This is worse: we go down in Pseudo R^2 and up in AIC and BIC, so the interaction model is probably closer to reality.

### More Weird Plays Analysis

Let's see if the false positive on `hustle` is because the players who did `weird plays` all have a significant hustle stat.

In [23]:
# Same join as the main modelling frame, but keeping ONLY the weird fielding
# records (default inner join on "Name"; distance column renamed to
# `play_dist`).
attributes_with_fielding_weird = pd.merge(
    fielding_locs[weird_fielding_locs], attributes_raw, on="Name"
).rename(columns={"play_dist manhattan": "play_dist"})


In [24]:
# Summary statistics of each attribute over the distinct players who made a
# weird play (one row per Name), sorted by mean, highest first.
(
    attributes_with_fielding_weird.drop_duplicates("Name")[player_attributes]
    .describe()
    .T.sort_values("mean", ascending=False)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hustle,18.0,0.629444,0.312136,0.08,0.315,0.69,0.9125,1.0
survive,18.0,0.599444,0.280891,0.07,0.43,0.595,0.875,0.96
thwack,18.0,0.57,0.254003,0.05,0.41,0.615,0.74,0.97
fero,18.0,0.545556,0.267213,0.12,0.305,0.535,0.795,0.92
drama,18.0,0.533333,0.226845,0.26,0.3675,0.44,0.735,0.99
control,18.0,0.52,0.316581,0.02,0.2525,0.535,0.8025,0.99
thrive,18.0,0.495556,0.316418,0.05,0.185,0.545,0.735,0.98
magnet,18.0,0.486667,0.301545,0.09,0.2175,0.53,0.6325,1.0
sight,18.0,0.483889,0.245312,0.11,0.265,0.475,0.6825,0.9
reach,18.0,0.468889,0.252817,0.07,0.28,0.465,0.685,0.96


Well, hot darn. Players who made weird plays have a mean `hustle` of about `.63`, the highest mean of any attribute, which would explain the relationship we saw earlier, where `hustle` appeared significant in its interaction with `play_dist`.