In [1]:
import pandas as pd
import altair as alt


In [2]:
# Data sources:
#   * attributes: Player Attributes Sheet from
#     https://docs.google.com/spreadsheets/d/1CnPsdnZc4lFObYpu6R5lEYzRLmIBh-I03JFsTX6AaXE/edit#gid=108546025
#   * fielding: data from the SIBR discord

# Spelling fixes applied to fielder names, in this order (HTML-escaped
# apostrophe, period after "Jr", capitalisation) -- presumably so the names
# line up with the "Name" column of the attributes sheet.
fielding_raw = pd.read_csv("../data/fielding-thru-game-72.csv")
cleaned_fielders = fielding_raw["fielder"]
for old, new in (("&#x27;", "'"), ("Jr", "Jr."), ("al-", "Al-"), ("O'c", "O'C")):
    cleaned_fielders = cleaned_fielders.str.replace(old, new)
fielding_raw = fielding_raw.assign(fielder=cleaned_fielders)

# header=1: the column names are taken from the second line of the file.
attributes_all = pd.read_csv(
    "../data/Player Attributes Sheet - All Players.csv", header=1
)
# The last 13 rows are aggregates, so drop them.
aggregate_rows = attributes_all.tail(13).index
attributes_players = attributes_all.drop(index=aggregate_rows)
# Pitchers don't field, so they are left out as well.
attributes_raw = attributes_players[attributes_players["Position"] != "Pitcher"]


In [3]:
# Every column listed here is cast to float.
float_columns = [
    "Rating",
    "sight",
    "thwack",
    "fero",
    "Batting",
    "control",
    "stuff",
    "guile",
    "Pitching",
    "reach",
    "magnet",
    "reflex",
    "Defense",
    "hustle",
    "stealth",
    "dodge",
    "Running",
    "thrive",
    "survive",
    "drama",
    "Vibes",
]
# Column name -> target dtype mapping in the shape DataFrame.astype expects.
dtypes = dict.fromkeys(float_columns, float)

attributes_raw = attributes_raw.astype(dtypes)


In [4]:
# Preview the first five rows of the fielding data.
fielding_raw.head(n=5)


Unnamed: 0,game_id,start_disp_order,fielder_team,hit_loc,fielder,pa,hit,totl_outs,runs,advances,sng,dbl,trp,hr,fc,dp,tp,sac
0,3ca6a0d3-65a3-4f2e-9586-8eda28fc6511,1,Ohio Worms,The Wall,Malin Hsu,1,0,1,0,0,0,0,0,0,0,0,0,0
1,3ca6a0d3-65a3-4f2e-9586-8eda28fc6511,12,Ohio Worms,Right Field,Jorge Kahvegian,1,0,1,0,0,0,0,0,0,0,0,0,0
2,3ca6a0d3-65a3-4f2e-9586-8eda28fc6511,39,Ohio Worms,Left Field,Archie Yanez,1,0,1,0,0,0,0,0,0,0,0,0,0
3,3ca6a0d3-65a3-4f2e-9586-8eda28fc6511,55,Breckenridge Jazz Hands,Deep Center Field,Sigmund Castillo,1,0,1,0,0,0,0,0,0,0,0,0,0
4,3ca6a0d3-65a3-4f2e-9586-8eda28fc6511,76,Breckenridge Jazz Hands,Center Field,Erickson Sato,1,0,1,0,0,0,0,0,0,0,0,0,0


In [5]:
# Distinct player names present in each data set.
attribute_name_values = attributes_raw["Name"].unique()
fielding_name_values = fielding_raw["fielder"].unique()
players_from_attributes = set(attribute_name_values)
players_from_fielding = set(fielding_name_values)

# Names that appear in only one of the two sources. It is still too early to
# know what would cause a difference here, so such players are simply
# excluded from the analysis.
print("A not F:", players_from_attributes.difference(players_from_fielding))
print("F not A:", players_from_fielding.difference(players_from_attributes))


A not F: set()
F not A: {'Malik Destiny', 'Siobhan Chark', 'Badgerson Stromboli', 'Mindy Salad', 'Natha Spruce', 'Bees Gorczyca'}


In [6]:
# Number of rows in the fielding data for each fielder.
rows_by_fielder = fielding_raw.groupby("fielder")
fielded_per_player = rows_by_fielder.size()

# Show the five fielders with the most rows.
most_first = fielded_per_player.sort_values(ascending=False)
most_first.head()


fielder
Malin Hsu             550
Penelope Berkowitz    484
Archie Yanez          480
Abbott Wright         451
Tad Seeth             441
dtype: int64

In [7]:
# Sum of the "hit" column per fielder, divided by that fielder's row count.
hits_by_fielder = fielding_raw.groupby("fielder")["hit"].sum()
hit_against_fielder = hits_by_fielder / fielded_per_player

# Highest ratio first.
hit_against_fielder.sort_values(ascending=False)


fielder
Serge Shortvat    0.431373
Atma Blueberry    0.420455
Katy Hermoso      0.413580
Archie Yanez      0.404167
Roscoe Sundae     0.383459
                    ...   
Sheev Shriffle    0.142857
Adrian Melon      0.141379
Zuri Shoelace     0.140625
Nori Bluegrass    0.135922
Lorcan Smaht      0.134921
Length: 223, dtype: float64

In [8]:
# Combine the two per-fielder series into one frame, then attach attributes.
#
# The value columns are named explicitly via reset_index(name=...) instead of
# renaming the positional label 0: that rename silently does nothing if either
# series ever carries a name, which would only surface later as a KeyError.
fielder_stats = (
    hit_against_fielder.reset_index(name="% Hit Against")
    .merge(fielded_per_player.reset_index(name="N Fielded"), on="fielder")
    .rename(columns={"fielder": "Name"})
)

# Inner join: fielders without a matching "Name" in the attributes sheet are
# dropped from the analysis.
fielded_with_attributes = fielder_stats.merge(attributes_raw, on="Name")


In [9]:
# Correlation of every numeric column with "N Fielded", strongest positive first.
correlations = fielded_with_attributes.corr(numeric_only=True)
correlations["N Fielded"].sort_values(ascending=False)


N Fielded        1.000000
reach            0.777895
Defense          0.453454
Rating           0.258424
hustle           0.132615
Running          0.123257
dodge            0.094184
fero             0.091512
control          0.074287
magnet           0.065512
Vibes            0.054530
thrive           0.049038
Batting          0.037392
% Hit Against    0.034091
drama            0.026587
sight            0.018842
survive          0.016817
Pitching        -0.002780
stealth         -0.004045
reflex          -0.013254
stuff           -0.025225
thwack          -0.045287
guile           -0.049705
Rank            -0.384111
Name: N Fielded, dtype: float64

In [10]:
# Correlation between the two plotted columns only; there is no need to build
# the full correlation matrix for a single coefficient.
corr = fielded_with_attributes["N Fielded"].corr(fielded_with_attributes["reach"])

# Clicking a team in the legend selects it. This selection object is shared
# by the charts in the following cells.
# NOTE: selection_multi / add_selection are the Altair 4 spellings; Altair 5
# renames them to selection_point / add_params.
selection = alt.selection_multi(fields=["Team"], bind="legend")

chart_standard = (
    alt.Chart(fielded_with_attributes, title=f"Reach vs. N Fielded (ρ={corr:.3f})")
    .mark_circle()
    .encode(
        x="reach",
        y="N Fielded",
        tooltip=["Name", "reach", "N Fielded", "Team"],
        color="Team",
        # Without a condition on the selection the legend clicks have no
        # visible effect. 0.7 is Vega-Lite's default opacity for circle
        # marks, so the chart looks unchanged until a team is selected.
        opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1)),
    )
    .add_selection(selection)
)

chart_standard


  for col_name, dtype in df.dtypes.iteritems():


In [11]:
# Same scatter as the linear chart, but with a logarithmic y axis.
chart_log = (
    alt.Chart(fielded_with_attributes, title="Reach vs. N Fielded (log)")
    .mark_circle()
    .encode(
        x="reach",
        y=alt.Y("N Fielded", scale=alt.Scale(type="log")),
        tooltip=["Name", "reach", "N Fielded", "Team"],
        color="Team",
        # Fade the teams that are not selected in the legend; without this
        # condition the legend selection has no visible effect. 0.7 is
        # Vega-Lite's default opacity for circle marks.
        opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1)),
    )
    .add_selection(selection)
)
chart_log


In [12]:
def _share_of_team_total(column):
    """Return each player's value in `column` divided by their team's sum of it.

    A constant 0.0001 is added to every result.
    NOTE(review): the reason for the offset is not stated -- presumably it
    keeps the values strictly positive; confirm before changing it.
    """
    team_totals = fielded_with_attributes.groupby("Team")[column].transform("sum")
    return fielded_with_attributes[column] / team_totals + 0.0001


# Despite the "%" in the names these are fractions of the team total.
fielded_with_attributes["% of Team N Fielded"] = _share_of_team_total("N Fielded")
fielded_with_attributes["% of Team Total Reach"] = _share_of_team_total("reach")


In [13]:
# Correlation between the two plotted columns only.
corr = fielded_with_attributes["% of Team Total Reach"].corr(
    fielded_with_attributes["% of Team N Fielded"]
)

chart_standard = (
    alt.Chart(
        fielded_with_attributes,
        title=f"% Team Reach vs. % Team N Fielded (ρ={corr:.3f})",
    )
    .mark_circle()
    .encode(
        # The axes follow the title and the other charts in this notebook:
        # the attribute (reach share) on x, the fielding share on y.
        x="% of Team Total Reach",
        y="% of Team N Fielded",
        tooltip=["Name", "% of Team Total Reach", "% of Team N Fielded", "Team"],
        color="Team",
        # Fade the teams that are not selected in the legend; without this
        # condition the legend selection has no visible effect. 0.7 is
        # Vega-Lite's default opacity for circle marks.
        opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1)),
    )
    .add_selection(selection)
)

chart_standard


  for col_name, dtype in df.dtypes.iteritems():


In [14]:
# Correlation of every numeric column with "% Hit Against", strongest positive first.
correlations = fielded_with_attributes.corr(numeric_only=True)
correlations["% Hit Against"].sort_values(ascending=False)


% Hit Against            1.000000
Rank                     0.291180
Running                  0.069075
stuff                    0.066845
hustle                   0.064179
drama                    0.059225
thwack                   0.056643
% of Team Total Reach    0.055558
stealth                  0.036408
N Fielded                0.034091
dodge                    0.030599
reach                    0.026864
control                  0.018716
Batting                  0.009466
% of Team N Fielded      0.006346
reflex                   0.000241
fero                    -0.007403
Vibes                   -0.008658
Pitching                -0.020625
thrive                  -0.027602
sight                   -0.031130
survive                 -0.048295
guile                   -0.107614
Rating                  -0.188759
Defense                 -0.462355
magnet                  -0.850706
Name: % Hit Against, dtype: float64

In [15]:
# Correlation between the two plotted columns only; there is no need to build
# the full correlation matrix for a single coefficient.
corr = fielded_with_attributes["% Hit Against"].corr(fielded_with_attributes["magnet"])
chart_standard = (
    alt.Chart(fielded_with_attributes, title=f"Magnet vs. % Hit Against (ρ={corr:.3f})")
    .mark_circle()
    .encode(
        x="magnet",
        y="% Hit Against",
        tooltip=["Name", "magnet", "% Hit Against", "Team"],
        color="Team",
        # Fade the teams that are not selected in the legend; without this
        # condition the legend selection has no visible effect. 0.7 is
        # Vega-Lite's default opacity for circle marks.
        opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1)),
    )
    .add_selection(selection)
)
chart_standard
