In [55]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [56]:
# First and last season (both inclusive) requested from batting_stats.
START, END = 2003, 2023

In [57]:
# Download per-season batting rows for START..END through pybaseball.
# NOTE(review): qual = 200 presumably limits the result to player-seasons with
# at least 200 plate appearances -- confirm against the pybaseball docs.
batting = batting_stats(START, END, qual = 200)

In [58]:
# Save the raw download to batting.csv in the current working directory.
batting.to_csv("batting.csv")

In [59]:
# Keep only players (IDfg) with more than one row: a player with a single
# season has no following season to use as a target.
batting = batting[
    batting.groupby("IDfg")["IDfg"].transform("size") > 1
]

In [60]:
# Display the frame after single-season players were removed.
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
5,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.3
12,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
1,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
73,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,0.0,,0,0.200,0.266,,,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6538,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.4
6899,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0.0,,0,0.166,0.252,,,,-2.4
7056,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
6683,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.170,0.295,,,,-2.9


In [61]:
def next_season(player):
    """Pair every season of one player with the WAR of the row that follows it.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows belonging to a single player; needs "Season" and "WAR" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by "Season" with a "Next_WAR" column holding the
        "WAR" value of the next row in that order. The last row has no
        successor, so its "Next_WAR" is NaN. The input frame is not modified.
    """
    chronological = player.sort_values("Season")
    # shift(-1) pulls each value up one row, i.e. the next row's WAR.
    upcoming_war = chronological["WAR"].shift(-1)
    return chronological.assign(Next_WAR=upcoming_war)

# Add Next_WAR (the same player's WAR in his next recorded season) for every row.
#
# This is the vectorised equivalent of applying next_season to each IDfg group:
# rows end up ordered by player and then by season, the original index labels
# are kept, and each player's last season gets NaN.
#
# It deliberately avoids DataFrameGroupBy.apply. Running apply on a frame that
# still contains the grouping column is deprecated since pandas 2.2, and the
# grouping column (IDfg) is slated to be excluded from the result, which would
# silently drop a column that later cells rely on.
batting = batting.sort_values(["IDfg", "Season"])
batting = batting.assign(Next_WAR=batting.groupby("IDfg")["WAR"].shift(-1))

In [62]:
# Spot-check that Next_WAR lines up with the following season's WAR.
batting.loc[:, ["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5550,Alfredo Amezaga,2006,1.1,2.0
4991,Alfredo Amezaga,2007,2.0,1.2
5234,Alfredo Amezaga,2008,1.2,
828,Garret Anderson,2003,5.1,0.8
2559,Garret Anderson,2004,0.8,-0.2
...,...,...,...,...
3093,Ha-seong Kim,2023,4.4,
1078,Vinnie Pasquantino,2022,1.5,0.0
3326,Vinnie Pasquantino,2023,0.0,
2808,Seiya Suzuki,2022,2.1,3.2


In [63]:
# Number of missing values in each column.
null_count = batting.isna().sum()

In [64]:
# Show the per-column missing-value counts.
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6775
xSLG        6775
xwOBA       6775
L-WAR          0
Next_WAR    1202
Length: 321, dtype: int64

In [65]:
# Names of the columns that have no missing values at all.
complete_cols = null_count[null_count == 0].index.tolist()

In [66]:
# List the fully populated columns.
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K

In [67]:
# Keep the fully populated columns plus the target. Next_WAR contains NaNs
# (so it is not in complete_cols) and has to be added back explicitly.
batting = batting.loc[:, [*complete_cols, "Next_WAR"]].copy()

In [68]:
# Display the frame after restricting it to complete columns plus Next_WAR.
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
5550,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
4991,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5234,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
828,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
2559,2,2004,Garret Anderson,ANA,32,112,442,475,133,98,...,103,75,78,106,98,0,0.176,0.270,0.8,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,27506,2023,Ha-seong Kim,SDP,27,152,538,626,140,100,...,99,85,117,106,82,424,0.233,0.293,4.1,
1078,27676,2022,Vinnie Pasquantino,KCR,24,72,258,298,76,56,...,104,87,79,93,123,226,0.162,0.228,1.5,0.0
3326,27676,2023,Vinnie Pasquantino,KCR,25,61,231,260,57,31,...,102,88,87,91,121,202,0.179,0.253,0.2,
2808,30116,2022,Seiya Suzuki,CHC,27,111,397,446,104,66,...,105,130,106,95,106,290,0.232,0.312,2.0,3.2


In [69]:
# Inspect the dtype of every remaining column.
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [71]:
# Show only the columns stored with the generic object dtype.
batting.dtypes.loc[lambda kinds: kinds == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [72]:
# Look at the raw values of the object-typed "Dol" column.
batting.loc[:, "Dol"]

5550     $5.5
4991    $11.2
5234     $7.2
828     $22.0
2559     $3.4
        ...  
3093    $35.0
1078    $11.7
3326     $0.4
2808    $16.8
1360    $25.4
Name: Dol, Length: 6775, dtype: object

In [73]:
# Remove "Dol": its values are "$"-prefixed strings, not numbers.
batting = batting.drop(columns=["Dol"])