In [57]:
import numpy as np
import pandas as pd

In [58]:
# Load the per-inning highlights table; the cells below use its Inning, Game and Runs columns.
df = pd.read_csv("data/inningHighlights.csv")

In [59]:
# Keep only the rows for the two halves of the first inning ("T1" and "B1").
first_inning_halves = ["T1", "B1"]
df = df.loc[df["Inning"].isin(first_inning_halves)]

In [60]:
# Total the runs of the remaining rows per game, then turn the Game index back into a column.
runs_by_game = df.groupby("Game")['Runs'].sum()
df = runs_by_game.reset_index()

In [61]:
# True when fewer than one run was scored in the first inning of the game.
df['Runs'] = df['Runs'].lt(1)

In [62]:
# The boolean column now holds the NRFI label, so name it accordingly.
df = df.rename(columns={'Runs': 'NRFI'})

In [63]:
# Load game-level metadata; later cells use its Game, away, home and Date columns.
games = pd.read_csv('data/games.csv')

Noticed 16 duplicate games

In [64]:
# Remove exact duplicate game rows. drop=True discards the old row labels
# instead of inserting them as a stray 'index' column in `games`.
games = games.drop_duplicates().reset_index(drop=True)

In [65]:
# Attach the game metadata; an inner join keeps only games present in both tables.
df = df.merge(games, on='Game', how='inner')

In [66]:
# Keep only the columns the rest of the notebook uses.
kept_columns = ['Game', 'NRFI', 'away', 'home', 'Date']
df = df.loc[:, kept_columns]

In [67]:
# Derive the calendar year from the game date; the raw Date column is not kept.
game_dates = pd.to_datetime(df['Date'])
df['year'] = game_dates.dt.year
df = df.drop(columns=["Date"])

In [68]:
# Preview the game-level frame (Game, NRFI, away, home, year).
df.head()

Unnamed: 0,Game,NRFI,away,home,year
0,360403107,False,NYM,KC,2016
1,360403123,True,STL,PIT,2016
2,360403130,False,TOR,TB,2016
3,360404101,True,MIN,BAL,2016
4,360404103,False,CHC,LAA,2016


In [69]:
# Load the per-game pitcher lines.
# NOTE(review): the saved cell output echoes this line, which looks like a pandas
# warning raised by read_csv (possibly a mixed-dtype DtypeWarning) — confirm.
pitchers = pd.read_csv('data/pitchersByGame.csv')

  pitchers = pd.read_csv('data/pitchersByGame.csv')


Noticed duplicates in pitchers by game

In [70]:
# Only the name, id, game and team columns are needed from here on.
pitcher_columns = ["Pitchers", "Pitcher Id", 'Game', 'Team']
pitchers = pitchers.loc[:, pitcher_columns]

In [71]:
# A row is taken as a starting pitcher when it is the very first row of the
# table or when the row directly above it is a 'TEAM' row. Shifting the name
# column down by one puts each row's predecessor beside it; the fill value
# marks row 0 as "preceded by TEAM" so it is always selected.
# (Relies on `pitchers` still having the default 0..n-1 row labels.)
previous_name = pitchers['Pitchers'].shift(1, fill_value='TEAM')
follows_team_row = previous_name == 'TEAM'

starting_pitchers = pitchers[follows_team_row].reset_index(drop=True)

In [72]:
# Remove exact duplicate starter rows. drop=True discards the old row labels
# instead of inserting them as a stray 'index' column, which would otherwise be
# carried into both pitcher merges below.
starting_pitchers = starting_pitchers.drop_duplicates().reset_index(drop=True)

Standardize names such as R.A. Dickey (reduce the first name to a single initial)<br/>
It is easier to strip the extra initial than to deal with it and with names like Michael A. Taylor at the same time<br/>
We also have to strip all suffixes now because pitchersByGame is inconsistent about including them

* The reason we have to merge on pitcher names and thus do all this is because the player ids come<br/>
from different data sets and do not match

In [73]:
def standardize_first_name(name):
    """Reduce the part of a name before the first ". " to a single initial.

    Examples: "R.A. Dickey" -> "R. Dickey", "Michael A. Taylor" -> "M. Taylor".

    The name is split on the first ". " only, so a later ". " inside the
    surname part (e.g. "D. St. Claire") is kept as-is instead of raising.

    Parameters
    ----------
    name : str
        Name in "<first>. <last>" form.

    Returns
    -------
    The standardized name. Values that are not strings, that contain no ". ",
    or that have nothing before the ". " are returned unchanged so one odd row
    cannot abort a whole ``Series.apply``.
    """
    if not isinstance(name, str):
        return name

    first, separator, last = name.partition(". ")
    if not separator or not first:
        return name

    return first[0] + ". " + last

# Trailing name tokens treated as generational suffixes and stripped from surnames.
suffixes_to_remove = {'Jr.', 'Sr.', 'II', 'III', 'IV', 'V'}

def remove_suffix(name):
    """Strip a trailing generational suffix from a "<first>. <last>" name.

    The part before the first ". " is reduced to one initial and the last
    space-separated token of the remainder is dropped when it is listed in
    ``suffixes_to_remove``, e.g. "L. McCullers Jr." -> "L. McCullers".

    Parameters
    ----------
    name : str
        Name in "<first>. <last>" form.

    Returns
    -------
    The name without the suffix. Values that are not strings, that contain no
    ". ", or that have nothing before the ". " are returned unchanged.
    """
    if not isinstance(name, str):
        return name

    first, separator, last_part = name.partition(". ")
    if not separator or not first:
        return name

    last_parts = last_part.split(" ")

    # Only strip when something is left over: a surname that is itself a
    # suffix token (e.g. "V") must not be reduced to an empty string.
    if len(last_parts) > 1 and last_parts[-1] in suffixes_to_remove:
        last_name = " ".join(last_parts[:-1])
    else:
        last_name = last_part

    first_initial = first[0] + "."
    return f"{first_initial} {last_name}"
    

# Normalize every starter name: first-name initial first, then suffix removal.
for clean_name in (standardize_first_name, remove_suffix):
    starting_pitchers['Pitchers'] = starting_pitchers['Pitchers'].apply(clean_name)

In [74]:
# Bring both frames onto snake_case column names before joining them.
pitchers_df = starting_pitchers.rename(columns={
    'Pitchers': 'pitcher_name',
    'Pitcher Id': 'pitcher_id',
    'Game': 'game_id',
    'Team': 'team'
})

df = df.rename(columns={
    'Game': 'game_id',
    'away': 'away_team',
    'home': 'home_team'
})


def _attach_starter(games_frame, side):
    """Left-join the starter of the `side` ('away' or 'home') team onto each game.

    Matches on (game_id, <side>_team) and stores the starter's name and id as
    <side>_pitcher / <side>_pitcher_id.
    """
    joined = games_frame.merge(
        pitchers_df,
        left_on=['game_id', f'{side}_team'],
        right_on=['game_id', 'team'],
        how='left'
    )
    joined = joined.rename(columns={
        'pitcher_name': f'{side}_pitcher',
        'pitcher_id': f'{side}_pitcher_id'
    })
    return joined.drop(columns=['team'])


# Away starter first, then home starter.
merged = _attach_starter(df, 'away')
merged = _attach_starter(merged, 'home')

# Reorder columns for clarity (this also drops any leftover helper columns).
column_order = [
    'game_id', 'NRFI',
    'away_team', 'away_pitcher', 'away_pitcher_id',
    'home_team', 'home_pitcher', 'home_pitcher_id', 'year'
]
merged = merged[column_order]

df = merged


In [75]:
# Preview the games with both starting pitchers attached.
df.head()

Unnamed: 0,game_id,NRFI,away_team,away_pitcher,away_pitcher_id,home_team,home_pitcher,home_pitcher_id,year
0,360403107,False,NYM,M. Harvey,31214,KC,E. Volquez,6401,2016
1,360403123,True,STL,A. Wainwright,5403,PIT,F. Liriano,6211,2016
2,360403130,False,TOR,M. Stroman,32815,TB,C. Archer,31003,2016
3,360404101,True,MIN,E. Santana,6280,BAL,C. Tillman,30285,2016
4,360404103,False,CHC,J. Arrieta,30145,LAA,G. Richards,30892,2016


In [76]:
# Load the per-pitcher stats; the cells below use its "last_name, first_name", year and player_id columns.
pitcher_stats = pd.read_csv("data/no_filter_pitcher_stats.csv")

In [77]:
import unicodedata

def standardize_last_name(name):
    """Convert a "Last, First" name into the "F. Last" form used for merging.

    The name is split on the first ", ". A trailing token of the surname part
    that appears in ``suffixes_to_remove`` is dropped, so
    "Guerrero Jr., Vladimir" becomes "V. Guerrero".

    Parameters
    ----------
    name : str
        Name in "Last, First" form.

    Returns
    -------
    The converted name. Values that are not strings, that contain no ", ", or
    that have nothing after the ", " are returned unchanged so one odd row
    cannot abort a whole ``Series.apply``.
    """
    if not isinstance(name, str):
        return name

    last_part, separator, first = name.partition(", ")
    if not separator or not first:
        return name

    last_parts = last_part.split(" ")

    # Only strip when something is left over: a surname that is itself a
    # suffix token must not be reduced to an empty string.
    if len(last_parts) > 1 and last_parts[-1] in suffixes_to_remove:
        last_name = " ".join(last_parts[:-1])
    else:
        last_name = last_part

    first_initial = first[0] + "."
    return f"{first_initial} {last_name}"

def remove_accents(text):
    """Return `text` with accented characters reduced to plain ASCII.

    The string is NFKD-decomposed and every character that cannot be encoded
    as ASCII (such as combining accent marks) is discarded. Non-string values
    are passed through untouched.
    """
    if not isinstance(text, str):
        return text

    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8')

# Rename the name column, then convert "Last, First" to accent-free "F. Last".
pitcher_stats = pitcher_stats.rename(columns={"last_name, first_name": "pitcher_name"})

pitcher_stats['pitcher_name'] = (
    pitcher_stats['pitcher_name']
    .apply(standardize_last_name)
    .apply(remove_accents)
)

In [78]:
def _prefixed_pitcher_stats(side):
    """Copy of pitcher_stats with every column except the join keys prefixed '<side>_pitcher_'."""
    join_keys = ('pitcher_name', 'year')
    return pitcher_stats.rename(
        columns=lambda col: col if col in join_keys else f'{side}_pitcher_{col}'
    )


home_stats = _prefixed_pitcher_stats('home')
away_stats = _prefixed_pitcher_stats('away')

# Join on (pitcher name, year) for the home starter first, then the away starter.
# Both id columns are dropped after each join.
for side, side_stats in (('home', home_stats), ('away', away_stats)):
    df = df.merge(
        side_stats,
        left_on=[f'{side}_pitcher', 'year'],
        right_on=['pitcher_name', 'year'],
        how='left'
    ).drop(columns=['pitcher_name', f'{side}_pitcher_id', f'{side}_pitcher_player_id'])

More drop duplicates (stay at original number of 13379)

In [79]:
# Keep the first row for each game_id.
# NOTE(review): presumably the name+year joins above can match one pitcher name
# to several stats rows, which would duplicate games — confirm.
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# 'index' column (it is visible in the outputs below) — confirm that is intended.
df = df.drop_duplicates(subset='game_id').reset_index()

In [80]:
# Row/column count after de-duplicating by game_id.
df.shape

(13379, 32)

In [81]:
# Show every game that has at least one missing value.
has_missing_value = df.isna().any(axis="columns")
rows_with_nan = df.loc[has_missing_value]
rows_with_nan

Unnamed: 0,index,game_id,NRFI,away_team,away_pitcher,home_team,home_pitcher,year,home_pitcher_pa,home_pitcher_k_percent,home_pitcher_bb_percent,home_pitcher_woba,home_pitcher_xwoba,home_pitcher_sweet_spot_percent,home_pitcher_barrel_batted_rate,home_pitcher_hard_hit_percent,home_pitcher_avg_best_speed,home_pitcher_avg_hyper_speed,home_pitcher_whiff_percent,home_pitcher_swing_percent,away_pitcher_pa,away_pitcher_k_percent,away_pitcher_bb_percent,away_pitcher_woba,away_pitcher_xwoba,away_pitcher_sweet_spot_percent,away_pitcher_barrel_batted_rate,away_pitcher_hard_hit_percent,away_pitcher_avg_best_speed,away_pitcher_avg_hyper_speed,away_pitcher_whiff_percent,away_pitcher_swing_percent
7532,8089,401074871,True,NYY,D. German,BAL,H. Alberto,2019,,,,,,,,,,,,,594.0,25.8,6.6,0.305,0.316,37.8,8.8,40.1,78.97498,94.057959,28.2,49.5
7693,8262,401075039,True,TOR,M. Shoemaker,OAK,K. Morales,2019,,,,,,,,,,,,,108.0,22.2,8.3,0.238,0.341,27.0,10.8,39.2,79.266015,93.529085,28.5,49.5
7761,8336,401075108,False,TEX,T. Hearn,SEA,M. Gonzales,2019,866.0,17.0,6.5,0.311,0.311,31.5,4.9,35.2,77.912446,93.493739,18.0,47.7,,,,,,,,,,,,
7848,8431,401075199,True,SD,M. Strahm,ATL,C. Culberson,2019,,,,,,,,,,,,,487.0,24.2,4.5,0.326,0.324,35.6,9.7,40.0,78.363867,94.014009,23.3,50.8
7850,8433,401075201,False,BOS,C. Sale,CHW,J. Rondon,2019,,,,,,,,,,,,,612.0,35.6,6.0,0.294,0.284,34.9,8.1,36.3,78.006804,93.662112,32.0,47.4
7893,8480,401075245,False,ARI,J. Murphy,TB,B. Snell,2019,441.0,33.3,9.1,0.301,0.265,31.2,4.7,32.0,78.948419,92.822174,38.4,48.9,,,,,,,,,,,,
9781,10518,401077156,True,SD,M. Baez,ARI,T. Walker,2019,,,,,,,,,,,,,131.0,21.4,10.7,0.295,0.299,34.9,4.7,37.2,77.371047,93.095491,24.0,48.8
9790,10528,401078849,True,ARI,T. Widener,CHW,E. Santana,2019,64.0,7.8,9.4,0.471,0.466,34.0,20.8,39.6,78.485362,94.223256,13.0,46.0,,,,,,,,,,,,
9798,10539,401078857,False,LAA,D. Peters,SEA,T. Danish,2019,,,,,,,,,,,,,327.0,16.8,8.0,0.381,0.361,39.4,8.7,41.1,78.527388,94.370654,20.4,46.9
9803,10544,401078862,False,LAA,J. Beasley,SD,J. Lucchesi,2019,686.0,23.0,8.2,0.298,0.297,30.9,7.2,36.4,77.623735,93.512161,25.4,45.3,,,,,,,,,,,,


26 rows have a pitcher name that was not found in our pitcher stats; let's remove these rows with NaN values

In [82]:
# Drop every game that has a missing value in any column.
df = df.dropna()

In [83]:
# Row/column count after removing the games with missing values.
df.shape

(13353, 32)

New magic number is 13353

In [84]:
# Flag starters whose stats row covers fewer than 100 PA (the `pa` column).
for side in ('home', 'away'):
    df[f'{side}_pitcher_low_sample'] = df[f'{side}_pitcher_pa'].lt(100)

Plan is to add weighted stats here at some point

In [85]:
# Load the per-game hitter lines.
# NOTE(review): the saved cell output echoes this line, which looks like a pandas
# warning raised by read_csv (possibly a mixed-dtype DtypeWarning) — confirm.
batters_df = pd.read_csv("data/hittersByGame.csv")

  batters_df = pd.read_csv("data/hittersByGame.csv")


In [86]:
# Keep the columns needed to rebuild the batting order and align names with the game frame.
batter_columns = ["Hitters", "Game", "Team", "Position", "AB", "BB"]
batters_df = batters_df[batter_columns].rename(
    columns={"Hitters": "batter_name", "Game": "game_id"}
)

In [87]:
# Preview the trimmed hitter lines.
batters_df.head()

Unnamed: 0,batter_name,game_id,Team,Position,AB,BB
0,M. Carpenter,360403123,STL,3B,4,1
1,T. Pham,360403123,STL,LF,1,0
2,M. Adams,360403123,STL,1B,4,0
3,M. Holliday,360403123,STL,1B-LF,3,1
4,R. Grichuk,360403123,STL,CF,4,0


pitchers snuck in the top 5, drop them before fetching

In [88]:
# Remove hitter lines whose position is exactly 'P'.
is_not_pitcher = batters_df['Position'].ne('P')
batters_df = batters_df.loc[is_not_pitcher]

## Keep top 5 batters in the order
- the top 5 is infiltrated by pinch hitters, so we must weed them out
- we don't have the data to calculate total ABs, so we can't use that (for example, McCutchen bats 3rd but was left off because he had one fewer AB + BB than the players after him, likely because of a sac fly)
- also, Tommy Pham batted 2nd but was subbed out after one AB, so he would be left off
- there is no way of being totally confident on this

- let's just drop all players with <= 1 AB + BB, as it is unlikely they were in the starting order

In [89]:
# Coerce the counting columns to numbers (unparseable entries become NaN).
for count_col in ('AB', 'BB'):
    batters_df[count_col] = pd.to_numeric(batters_df[count_col], errors='coerce')

# Keep hitters with more than one AB + BB; rows where the sum is NaN are dropped too.
ab_plus_bb = batters_df['AB'] + batters_df['BB']
batters_df = batters_df[ab_plus_bb > 1]


In [90]:
# Number the remaining hitters 1, 2, 3, ... within each (game, team) in row order.
# This column is used later to pick the top of the order.
lineup_groups = batters_df.groupby(['game_id', 'Team'])
batters_df['batter_order'] = lineup_groups.cumcount() + 1

# Spot check on St. Louis.
stl = batters_df.loc[batters_df["Team"].eq("STL")]
stl.head()

Unnamed: 0,batter_name,game_id,Team,Position,AB,BB,batter_order
0,M. Carpenter,360403123,STL,3B,4.0,1.0,1
2,M. Adams,360403123,STL,1B,4.0,0.0,2
3,M. Holliday,360403123,STL,1B-LF,3.0,1.0,3
4,R. Grichuk,360403123,STL,CF,4.0,0.0,4
5,S. Piscotty,360403123,STL,RF,3.0,1.0,5


In [91]:
# Drop the 'TEAM' rows, then run the hitter names through the same cleaners as
# the pitcher names. Accents have to go too because this dataset is inconsistent.
batters_df = batters_df.loc[batters_df["batter_name"].ne("TEAM")]

for clean_name in (standardize_first_name, remove_suffix, remove_accents):
    batters_df['batter_name'] = batters_df['batter_name'].apply(clean_name)

In [92]:
top_n = 5

# Keep the first `top_n` hitters of every lineup.
in_top_of_order = batters_df['batter_order'] <= top_n
top_batters = batters_df[in_top_of_order]

# One row per (game, team) with one column per batting-order slot.
lineup_wide = top_batters.pivot(
    index=['game_id', 'Team'],
    columns='batter_order',
    values='batter_name'
)
batters_df = lineup_wide.reset_index()

In [93]:
# Replace the numeric slot labels produced by the pivot with batter_1 .. batter_<top_n>.
slot_labels = [f'batter_{slot}' for slot in range(1, top_n + 1)]
batters_df.columns = ['game_id', 'Team', *slot_labels]

In [94]:
pd.set_option('display.max_columns', None)

# Attach the top-of-order hitters for the home team first, then the away team,
# renaming batter_<i> to <side>_batter_<i> after each join.
for side in ('home', 'away'):
    df = df.merge(
        batters_df,
        left_on=['game_id', f'{side}_team'],
        right_on=['game_id', 'Team'],
        how='left',
        suffixes=('', f'_{side}')
    ).drop(columns=['Team'])

    df = df.rename(
        columns={f'batter_{i}': f'{side}_batter_{i}' for i in range(1, top_n + 1)}
    )

going to keep default baseball savant stats for now, can experiment with more stats later <br/>
easy ones like OPS, OBP should be included later for sure

In [95]:
# Load the per-batter stats; the cells below use its "last_name, first_name" and year columns.
batter_stats = pd.read_csv("data/batter_stats.csv")

savant puts both pitcher and batter data in this csv for some reason, when we merge only batters will remain

In [96]:
# Rename the name column, then convert "Last, First" to accent-free "F. Last".
batter_stats = batter_stats.rename(columns={"last_name, first_name": "batter_name"})

batter_stats['batter_name'] = (
    batter_stats['batter_name']
    .apply(standardize_last_name)
    .apply(remove_accents)
)


# Show the stats rows that have at least one missing value.
rows_with_nan = batter_stats[batter_stats.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,batter_name,player_id,year,pa,k_percent,bb_percent,woba,xwoba,sweet_spot_percent,barrel_batted_rate,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent
656,R. Gsellman,607229,2016,17,52.9,0.0,0.059,0.046,12.5,0.0,0.0,,,17.6,29.3
3639,A. Sampson,592716,2021,11,63.6,0.0,0.0,0.0,25.0,0.0,0.0,,,39.1,54.8


In [97]:
# Drop every batter stats row that has a missing value in any column.
batter_stats = batter_stats.dropna()

In [98]:
# Different players can share the same abbreviated "F. Last" name in a season.
# Joining on such a key duplicates the game row, and over the 2 * top_n joins
# below those duplicates multiply before the final de-duplication. Keeping the
# first stats row per (batter_name, year) up front selects the same match the
# final drop_duplicates would have kept (a left merge preserves left-row order
# and lists matches in right-frame order) without the intermediate blow-up.
unique_batter_stats = batter_stats.drop_duplicates(subset=['batter_name', 'year'])

for team in ['home', 'away']:
    # Use top_n (set where the lineup was pivoted) instead of a hard-coded 5 so
    # this loop always covers exactly the batter slots that exist in df.
    for i in range(1, top_n + 1):
        batter_col = f'{team}_batter_{i}'

        # Prefix every stat column with the slot name, e.g. home_batter_1_woba.
        stats = unique_batter_stats.rename(
            columns=lambda col: f'{batter_col}_{col}' if col not in ['batter_name', 'year'] else col
        )

        df = df.merge(
            stats,
            left_on=[batter_col, 'year'],
            right_on=['batter_name', 'year'],
            how='left'
        ).drop(columns=['batter_name'])

# Safety net: still guarantee one row per game.
df = df.drop_duplicates(subset='game_id').reset_index(drop=True)

In [99]:
batter_stats_to_agg = ['xwoba', 'woba', 'k_percent', 'bb_percent', 'barrel_batted_rate','sweet_spot_percent','hard_hit_percent','avg_best_speed','avg_hyper_speed','whiff_percent','swing_percent']

# For every stat, average it across the five lineup slots of each side.
# DataFrame.mean skips NaN by default, so slots without stats are left out of the average.
for stat in batter_stats_to_agg:
    for side in ('home', 'away'):
        slot_columns = [f'{side}_batter_{i}_{stat}' for i in range(1, 6)]
        df[f'{side}_top5_avg_{stat}'] = df[slot_columns].mean(axis=1)

### Let's think about training our data, we can add stats to optimize later

- recent performance
- specific performance (how hitters perform against specific pitchers)
- weather + stadium
- weighted stats based on sample size

### But first lets get overall team nrfi percentage as that must be a big indicator

Going to scrape teamrankings.com for the data


In [100]:
import pandas as pd
import time, random, os

# Load data if already scraped
if os.path.exists('data/first-inning-runs-per-game.csv'):
    first_inning_rpg_df = pd.read_csv('data/first-inning-runs-per-game.csv', index_col=0)
else:
    # List of seasons to scrape
    seasons = ['2016', '2017', '2018', '2019', '2020', '2021']
    season_frames = []

    for season in seasons:
        url = f'https://www.teamrankings.com/mlb/stat/1st-inning-runs-per-game?date={season}-11-10'

        try:
            tables = pd.read_html(url, header=0)
            if tables:
                season_df = tables[0]
                season_df['Season'] = season  # Add season column
                season_frames.append(season_df)
                print(f"Scraped season {season}")
            else:
                print(f"No table found for season {season}")
        except Exception as e:
            # Best effort: report the failure and carry on with the other seasons.
            print(f"Failed to scrape season {season}: {e}")

        time.sleep(random.randint(4, 6))  # Sleep to avoid rate limiting

    # Concatenate once instead of growing the frame inside the loop.
    if season_frames:
        first_inning_rpg_df = pd.concat(season_frames, ignore_index=True)
    else:
        first_inning_rpg_df = pd.DataFrame()

    # Only cache a complete scrape. Writing a partial or empty result would make
    # every later run load the incomplete file and never retry the missing seasons.
    if len(season_frames) == len(seasons):
        first_inning_rpg_df.to_csv('data/first-inning-runs-per-game.csv')
    else:
        print(
            f"Scraped {len(season_frames)} of {len(seasons)} seasons; "
            "not writing the cache file so the next run retries"
        )

In [101]:
# Preview the combined table: one block of rows per scraped season, with a value column named after each year.
first_inning_rpg_df.head()

Unnamed: 0,Rank,Team,2016,Last 3,Last 1,Home,Away,2015,Season,2017,2018,2019,2020,2021
0,1,Washington,0.72,0.33,0.0,0.62,0.83,0.55,2016,,,,,
1,2,Boston,0.71,0.33,0.0,0.89,0.53,0.48,2016,,,,,
2,3,Cincinnati,0.68,1.33,3.0,0.73,0.63,0.59,2016,,,,,
3,4,Chi Cubs,0.67,1.33,1.0,0.69,0.66,0.53,2016,,,,,
4,5,Arizona,0.65,2.0,0.0,0.72,0.59,0.52,2016,,,,,


In [102]:
# Reshape the wide table into one (year, team, rpg) row per team and season.
rows = []

years = [2016, 2017, 2018, 2019, 2020, 2021]

for year in years:
    # The value column is named after the year, so select it (and 'Team') by
    # label rather than by hard-coded position; a missing season or a changed
    # column order can then no longer silently select the wrong column.
    season_col = str(year)
    has_season_value = first_inning_rpg_df[season_col].notna()

    # Keep only the first 30 matching rows.
    # NOTE(review): the preview shows the 2016 rows also carrying a '2015'
    # column, so presumably a later season's rows also fill this year's column
    # and the cap is what limits the result to this season's own block — confirm.
    values = first_inning_rpg_df.loc[has_season_value, ['Team', season_col]].iloc[:30]

    for team_name, runs_per_game in values.itertuples(index=False, name=None):
        rows.append({
            "year": year,
            "team": team_name,
            "rpg": runs_per_game
        })

first_inning_rpg_df = pd.DataFrame(rows)


Going to just use overall for now, can optimize later with home / away splits

Must See what abbreviations and team names we are dealing with

- so we know how to translate
- deal with relocations (oakland to sacramento)

In [103]:
# Distinct team names used by the scraped table, in order of first appearance.
print(first_inning_rpg_df["team"].unique().tolist())

['Washington', 'Boston', 'Cincinnati', 'Chi Cubs', 'Arizona', 'Pittsburgh', 'Toronto', 'LA Dodgers', 'Colorado', 'LA Angels', 'Seattle', 'Houston', 'Miami', 'Texas', 'Milwaukee', 'Tampa Bay', 'San Diego', 'Cleveland', 'Baltimore', 'Kansas City', 'Chi Sox', 'Detroit', 'Minnesota', 'NY Mets', 'NY Yankees', 'St. Louis', 'Philadelphia', 'Atlanta', 'SF Giants', 'Sacramento']


In [104]:
# Distinct away-team codes in the game frame, in order of first appearance.
away_codes = df["away_team"].drop_duplicates().tolist()
print(away_codes)

# Inspect the games whose away "team" is 'AL'.
df.loc[df["away_team"].eq('AL')]


['NYM', 'STL', 'TOR', 'MIN', 'CHC', 'SF', 'CHW', 'SEA', 'WSH', 'PHI', 'LAD', 'COL', 'BOS', 'HOU', 'DET', 'TEX', 'MIA', 'TB', 'CLE', 'NYY', 'OAK', 'PIT', 'SD', 'BAL', 'LAA', 'CIN', 'KC', 'ATL', 'MIL', 'ARI', 'NL', 'AL']


Unnamed: 0,index,game_id,NRFI,away_team,away_pitcher,home_team,home_pitcher,year,home_pitcher_pa,home_pitcher_k_percent,home_pitcher_bb_percent,home_pitcher_woba,home_pitcher_xwoba,home_pitcher_sweet_spot_percent,home_pitcher_barrel_batted_rate,home_pitcher_hard_hit_percent,home_pitcher_avg_best_speed,home_pitcher_avg_hyper_speed,home_pitcher_whiff_percent,home_pitcher_swing_percent,away_pitcher_pa,away_pitcher_k_percent,away_pitcher_bb_percent,away_pitcher_woba,away_pitcher_xwoba,away_pitcher_sweet_spot_percent,away_pitcher_barrel_batted_rate,away_pitcher_hard_hit_percent,away_pitcher_avg_best_speed,away_pitcher_avg_hyper_speed,away_pitcher_whiff_percent,away_pitcher_swing_percent,home_pitcher_low_sample,away_pitcher_low_sample,home_batter_1,home_batter_2,home_batter_3,home_batter_4,home_batter_5,away_batter_1,away_batter_2,away_batter_3,away_batter_4,away_batter_5,home_batter_1_player_id,home_batter_1_pa,home_batter_1_k_percent,home_batter_1_bb_percent,home_batter_1_woba,home_batter_1_xwoba,home_batter_1_sweet_spot_percent,home_batter_1_barrel_batted_rate,home_batter_1_hard_hit_percent,home_batter_1_avg_best_speed,home_batter_1_avg_hyper_speed,home_batter_1_whiff_percent,home_batter_1_swing_percent,home_batter_2_player_id,home_batter_2_pa,home_batter_2_k_percent,home_batter_2_bb_percent,home_batter_2_woba,home_batter_2_xwoba,home_batter_2_sweet_spot_percent,home_batter_2_barrel_batted_rate,home_batter_2_hard_hit_percent,home_batter_2_avg_best_speed,home_batter_2_avg_hyper_speed,home_batter_2_whiff_percent,home_batter_2_swing_percent,home_batter_3_player_id,home_batter_3_pa,home_batter_3_k_percent,home_batter_3_bb_percent,home_batter_3_woba,home_batter_3_xwoba,home_batter_3_sweet_spot_percent,home_batter_3_barrel_batted_rate,home_batter_3_hard_hit_percent,home_batter_3_avg_best_speed,home_batter_3_avg_hyper_speed,home_batter_3_whiff_percent,home_batter_3_swing_percent,home_batter_4_player_id,home_batter_4_pa,home_batter_4_k_percent,home_batter_4_bb_percent,home_bat
ter_4_woba,home_batter_4_xwoba,home_batter_4_sweet_spot_percent,home_batter_4_barrel_batted_rate,home_batter_4_hard_hit_percent,home_batter_4_avg_best_speed,home_batter_4_avg_hyper_speed,home_batter_4_whiff_percent,home_batter_4_swing_percent,home_batter_5_player_id,home_batter_5_pa,home_batter_5_k_percent,home_batter_5_bb_percent,home_batter_5_woba,home_batter_5_xwoba,home_batter_5_sweet_spot_percent,home_batter_5_barrel_batted_rate,home_batter_5_hard_hit_percent,home_batter_5_avg_best_speed,home_batter_5_avg_hyper_speed,home_batter_5_whiff_percent,home_batter_5_swing_percent,away_batter_1_player_id,away_batter_1_pa,away_batter_1_k_percent,away_batter_1_bb_percent,away_batter_1_woba,away_batter_1_xwoba,away_batter_1_sweet_spot_percent,away_batter_1_barrel_batted_rate,away_batter_1_hard_hit_percent,away_batter_1_avg_best_speed,away_batter_1_avg_hyper_speed,away_batter_1_whiff_percent,away_batter_1_swing_percent,away_batter_2_player_id,away_batter_2_pa,away_batter_2_k_percent,away_batter_2_bb_percent,away_batter_2_woba,away_batter_2_xwoba,away_batter_2_sweet_spot_percent,away_batter_2_barrel_batted_rate,away_batter_2_hard_hit_percent,away_batter_2_avg_best_speed,away_batter_2_avg_hyper_speed,away_batter_2_whiff_percent,away_batter_2_swing_percent,away_batter_3_player_id,away_batter_3_pa,away_batter_3_k_percent,away_batter_3_bb_percent,away_batter_3_woba,away_batter_3_xwoba,away_batter_3_sweet_spot_percent,away_batter_3_barrel_batted_rate,away_batter_3_hard_hit_percent,away_batter_3_avg_best_speed,away_batter_3_avg_hyper_speed,away_batter_3_whiff_percent,away_batter_3_swing_percent,away_batter_4_player_id,away_batter_4_pa,away_batter_4_k_percent,away_batter_4_bb_percent,away_batter_4_woba,away_batter_4_xwoba,away_batter_4_sweet_spot_percent,away_batter_4_barrel_batted_rate,away_batter_4_hard_hit_percent,away_batter_4_avg_best_speed,away_batter_4_avg_hyper_speed,away_batter_4_whiff_percent,away_batter_4_swing_percent,away_batter_5_player_id,away_batter_5_pa,away_batter
_5_k_percent,away_batter_5_bb_percent,away_batter_5_woba,away_batter_5_xwoba,away_batter_5_sweet_spot_percent,away_batter_5_barrel_batted_rate,away_batter_5_hard_hit_percent,away_batter_5_avg_best_speed,away_batter_5_avg_hyper_speed,away_batter_5_whiff_percent,away_batter_5_swing_percent,home_top5_avg_xwoba,away_top5_avg_xwoba,home_top5_avg_woba,away_top5_avg_woba,home_top5_avg_k_percent,away_top5_avg_k_percent,home_top5_avg_bb_percent,away_top5_avg_bb_percent,home_top5_avg_barrel_batted_rate,away_top5_avg_barrel_batted_rate,home_top5_avg_sweet_spot_percent,away_top5_avg_sweet_spot_percent,home_top5_avg_hard_hit_percent,away_top5_avg_hard_hit_percent,home_top5_avg_avg_best_speed,away_top5_avg_avg_best_speed,home_top5_avg_avg_hyper_speed,away_top5_avg_avg_hyper_speed,home_top5_avg_whiff_percent,away_top5_avg_whiff_percent,home_top5_avg_swing_percent,away_top5_avg_swing_percent
3789,4038,370711132,True,AL,C. Sale,NL,M. Scherzer,2017,779.0,34.4,6.9,0.247,0.243,31.4,5.6,29.5,76.053613,92.471211,32.2,52.0,851.0,36.2,5.1,0.26,0.252,35.8,6.1,30.5,75.977416,92.794633,31.8,50.1,False,False,C. Blackmon,E. Inciarte,G. Stanton,J. Votto,B. Harper,J. Altuve,R. Cano,J. Ramirez,M. Moustakas,A. Judge,453568.0,725.0,18.6,9.0,0.414,0.372,38.1,8.7,36.4,98.88614,93.502053,19.0,46.4,542255.0,718.0,13.1,6.8,0.328,0.292,34.3,0.5,13.1,92.299377,90.267135,14.7,53.1,519317.0,692.0,23.6,12.3,0.41,0.403,31.6,17.4,45.6,105.945187,97.153916,31.8,42.9,458015.0,707.0,11.7,19.0,0.428,0.416,40.5,9.1,32.8,97.45307,92.834896,15.4,41.9,547180.0,492.0,20.1,13.8,0.416,0.394,35.5,11.4,42.6,102.523687,95.609845,26.8,49.2,514888.0,662.0,12.7,8.8,0.405,0.36,34.8,6.7,28.1,96.843291,92.421645,16.8,47.8,429664.0,648.0,13.1,7.6,0.334,0.355,33.9,6.3,44.0,101.099439,94.796904,17.8,52.0,608070.0,645.0,10.7,8.1,0.396,0.36,35.5,5.0,34.7,98.290089,93.223882,13.6,43.3,519058.0,598.0,15.7,5.7,0.345,0.338,34.0,8.6,33.8,98.852291,93.438182,20.9,55.7,592450.0,678.0,30.7,18.7,0.43,0.45,38.2,25.7,54.7,108.114253,98.934467,34.9,41.1,0.3754,0.3726,0.3992,0.382,17.42,16.58,12.18,9.78,9.42,10.46,36.0,35.28,34.1,39.06,99.421492,100.639873,93.873569,94.563016,21.54,20.8,46.7,47.98
6375,6863,380717132,True,AL,C. Sale,NL,M. Scherzer,2018,866.0,34.6,5.9,0.252,0.248,33.8,6.6,31.7,75.860881,92.64781,33.1,52.4,617.0,38.4,5.5,0.237,0.233,31.9,7.5,26.8,74.146398,92.528917,35.0,48.2,False,False,J. Baez,N. Arenado,P. Goldschmidt,J. Aguilar,F. Freeman,M. Betts,M. Brantley,J. Altuve,J. Lowrie,M. Trout,595879.0,645.0,25.9,4.5,0.366,0.34,33.1,12.6,43.3,101.960975,95.233508,33.9,57.9,571448.0,673.0,18.1,10.8,0.391,0.353,35.6,7.2,40.5,99.562139,94.062326,24.6,49.8,502671.0,690.0,25.1,13.0,0.39,0.384,37.1,13.6,44.0,101.820622,95.180346,26.4,42.4,542583.0,566.0,25.3,10.2,0.374,0.356,38.2,11.4,42.9,100.514656,94.533111,28.7,45.5,518692.0,707.0,18.7,10.7,0.378,0.388,44.7,9.3,39.8,99.091841,93.722199,24.3,55.6,605141.0,614.0,14.8,13.2,0.449,0.431,39.4,14.1,50.6,101.383522,95.486972,15.5,35.6,488726.0,630.0,9.5,7.6,0.359,0.357,38.3,3.5,38.4,98.880704,93.709531,11.0,44.2,514888.0,599.0,13.2,9.2,0.363,0.352,34.0,5.9,33.8,98.154527,93.131462,17.9,47.4,476704.0,680.0,18.8,11.5,0.347,0.327,38.0,5.3,37.9,98.082413,93.245695,20.4,45.9,545361.0,608.0,20.4,20.1,0.447,0.434,41.0,16.5,46.2,103.530154,96.114603,18.6,37.6,0.3642,0.3802,0.3798,0.393,22.62,15.34,9.84,12.32,10.82,9.06,37.74,38.14,42.1,41.38,100.590047,100.006264,94.546298,94.337653,27.58,16.68,50.24,42.14
10771,11582,401227036,True,AL,S. Ohtani,NL,M. Scherzer,2021,693.0,34.1,5.2,0.248,0.263,33.6,8.0,34.3,76.620696,93.694756,34.2,50.6,533.0,29.3,8.3,0.279,0.281,31.9,7.1,39.9,76.761769,94.11387,28.9,48.0,False,False,F. Tatis,T. Turner,M. Muncy,J. Turner,N. Arenado,S. Ohtani,J. Martinez,V. Guerrero,M. Olson,X. Bogaerts,665487.0,546.0,28.0,11.4,0.403,0.407,33.1,21.3,55.6,105.698423,97.793089,34.8,51.5,607208.0,646.0,17.0,6.3,0.386,0.364,34.6,7.4,46.2,101.311622,94.938969,23.0,49.2,571970.0,592.0,20.3,14.0,0.379,0.407,34.9,16.1,46.6,102.059033,95.545325,23.4,37.6,457759.0,612.0,16.0,10.0,0.358,0.359,36.5,7.9,42.4,99.559758,94.251536,17.7,44.5,571448.0,653.0,14.7,7.7,0.336,0.311,34.1,6.7,37.5,98.719643,93.570965,18.5,46.8,660271.0,639.0,29.6,15.0,0.393,0.408,35.4,22.3,53.6,105.900177,97.802022,35.1,45.8,502110.0,634.0,23.7,8.7,0.364,0.374,41.2,12.5,49.4,102.421754,95.88792,29.2,54.2,665489.0,698.0,15.8,12.3,0.419,0.421,33.7,15.1,55.2,106.811862,98.446352,27.8,47.3,621566.0,673.0,16.8,13.1,0.379,0.378,32.4,12.7,48.8,102.889107,95.968663,23.1,47.0,593428.0,603.0,18.7,10.3,0.368,0.359,36.4,9.7,43.0,100.949849,94.630658,22.0,45.7,0.3696,0.388,0.3724,0.3846,19.2,20.92,9.88,11.88,11.88,14.46,34.64,35.82,45.66,50.0,101.469696,103.79455,95.219977,96.547123,23.48,27.44,45.92,48.0


Realized that some All-Star games are in here<br/>
let's get rid of those

In [105]:
# Remove games whose away side is a league ('AL' / 'NL') rather than a club.
is_all_star_game = df["away_team"].isin(['AL', 'NL'])
df = df[~is_all_star_game]

# The remaining away codes and how many there are.
remaining_away_codes = df["away_team"].drop_duplicates().tolist()
print(remaining_away_codes)
print(len(remaining_away_codes))

['NYM', 'STL', 'TOR', 'MIN', 'CHC', 'SF', 'CHW', 'SEA', 'WSH', 'PHI', 'LAD', 'COL', 'BOS', 'HOU', 'DET', 'TEX', 'MIA', 'TB', 'CLE', 'NYY', 'OAK', 'PIT', 'SD', 'BAL', 'LAA', 'CIN', 'KC', 'ATL', 'MIL', 'ARI']
30


In [106]:
# teamrankings.com team name -> team abbreviation used in the game frame.
location_to_abbrev = {
    'St. Louis': 'STL',
    'NY Yankees': 'NYY',
    'NY Mets': 'NYM',
    'Chi Sox': 'CHW',
    'Chi Cubs': 'CHC',
    'LA Dodgers': 'LAD',
    'LA Angels': 'LAA',
    'Boston': 'BOS',
    'Atlanta': 'ATL',
    'Cincinnati': 'CIN',
    'Cleveland': 'CLE',
    'Colorado': 'COL',
    'Detroit': 'DET',
    'Houston': 'HOU',
    'Kansas City': 'KC',
    'Miami': 'MIA',
    'Milwaukee': 'MIL',
    'Minnesota': 'MIN',
    'Sacramento': 'OAK',  # teamrankings.com updated to sacramento while our data had not
    'Philadelphia': 'PHI',
    'Pittsburgh': 'PIT',
    'San Diego': 'SD',
    'SF Giants': 'SF',
    'Seattle': 'SEA',
    'Tampa Bay': 'TB',
    'Texas': 'TEX',
    'Toronto': 'TOR',
    'Washington': 'WSH',
    'Baltimore' : 'BAL',
    'Arizona' : 'ARI'
}

# Convert the 'team' column to abbreviations. Values that are not keys of the
# mapping are kept as they are, which makes the cell safe to re-run: a plain
# .map(dict) would turn the already-converted abbreviations into NaN on a
# second execution.
first_inning_rpg_df['team'] = first_inning_rpg_df['team'].map(
    lambda name: location_to_abbrev.get(name, name)
)

In [107]:
# Attach each team's first-inning runs per game for the season: away side
# first, then home side. The joined 'rpg' column is renamed to <side>_rpg and
# the duplicated 'team' key column is dropped.
for side in ('away', 'home'):
    df = (
        df.merge(
            first_inning_rpg_df,
            how='left',
            left_on=['year', f'{side}_team'],
            right_on=['year', 'team'],
            suffixes=('', f'_{side}')
        )
        .drop(columns=['team'])
        .rename(columns={'rpg': f'{side}_rpg'})
    )

In [108]:
# Show the games that still have at least one missing value after all joins.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,index,game_id,NRFI,away_team,away_pitcher,home_team,home_pitcher,year,home_pitcher_pa,home_pitcher_k_percent,home_pitcher_bb_percent,home_pitcher_woba,home_pitcher_xwoba,home_pitcher_sweet_spot_percent,home_pitcher_barrel_batted_rate,home_pitcher_hard_hit_percent,home_pitcher_avg_best_speed,home_pitcher_avg_hyper_speed,home_pitcher_whiff_percent,home_pitcher_swing_percent,away_pitcher_pa,away_pitcher_k_percent,away_pitcher_bb_percent,away_pitcher_woba,away_pitcher_xwoba,away_pitcher_sweet_spot_percent,away_pitcher_barrel_batted_rate,away_pitcher_hard_hit_percent,away_pitcher_avg_best_speed,away_pitcher_avg_hyper_speed,away_pitcher_whiff_percent,away_pitcher_swing_percent,home_pitcher_low_sample,away_pitcher_low_sample,home_batter_1,home_batter_2,home_batter_3,home_batter_4,home_batter_5,away_batter_1,away_batter_2,away_batter_3,away_batter_4,away_batter_5,home_batter_1_player_id,home_batter_1_pa,home_batter_1_k_percent,home_batter_1_bb_percent,home_batter_1_woba,home_batter_1_xwoba,home_batter_1_sweet_spot_percent,home_batter_1_barrel_batted_rate,home_batter_1_hard_hit_percent,home_batter_1_avg_best_speed,home_batter_1_avg_hyper_speed,home_batter_1_whiff_percent,home_batter_1_swing_percent,home_batter_2_player_id,home_batter_2_pa,home_batter_2_k_percent,home_batter_2_bb_percent,home_batter_2_woba,home_batter_2_xwoba,home_batter_2_sweet_spot_percent,home_batter_2_barrel_batted_rate,home_batter_2_hard_hit_percent,home_batter_2_avg_best_speed,home_batter_2_avg_hyper_speed,home_batter_2_whiff_percent,home_batter_2_swing_percent,home_batter_3_player_id,home_batter_3_pa,home_batter_3_k_percent,home_batter_3_bb_percent,home_batter_3_woba,home_batter_3_xwoba,home_batter_3_sweet_spot_percent,home_batter_3_barrel_batted_rate,home_batter_3_hard_hit_percent,home_batter_3_avg_best_speed,home_batter_3_avg_hyper_speed,home_batter_3_whiff_percent,home_batter_3_swing_percent,home_batter_4_player_id,home_batter_4_pa,home_batter_4_k_percent,home_batter_4_bb_percent,home_bat
ter_4_woba,home_batter_4_xwoba,home_batter_4_sweet_spot_percent,home_batter_4_barrel_batted_rate,home_batter_4_hard_hit_percent,home_batter_4_avg_best_speed,home_batter_4_avg_hyper_speed,home_batter_4_whiff_percent,home_batter_4_swing_percent,home_batter_5_player_id,home_batter_5_pa,home_batter_5_k_percent,home_batter_5_bb_percent,home_batter_5_woba,home_batter_5_xwoba,home_batter_5_sweet_spot_percent,home_batter_5_barrel_batted_rate,home_batter_5_hard_hit_percent,home_batter_5_avg_best_speed,home_batter_5_avg_hyper_speed,home_batter_5_whiff_percent,home_batter_5_swing_percent,away_batter_1_player_id,away_batter_1_pa,away_batter_1_k_percent,away_batter_1_bb_percent,away_batter_1_woba,away_batter_1_xwoba,away_batter_1_sweet_spot_percent,away_batter_1_barrel_batted_rate,away_batter_1_hard_hit_percent,away_batter_1_avg_best_speed,away_batter_1_avg_hyper_speed,away_batter_1_whiff_percent,away_batter_1_swing_percent,away_batter_2_player_id,away_batter_2_pa,away_batter_2_k_percent,away_batter_2_bb_percent,away_batter_2_woba,away_batter_2_xwoba,away_batter_2_sweet_spot_percent,away_batter_2_barrel_batted_rate,away_batter_2_hard_hit_percent,away_batter_2_avg_best_speed,away_batter_2_avg_hyper_speed,away_batter_2_whiff_percent,away_batter_2_swing_percent,away_batter_3_player_id,away_batter_3_pa,away_batter_3_k_percent,away_batter_3_bb_percent,away_batter_3_woba,away_batter_3_xwoba,away_batter_3_sweet_spot_percent,away_batter_3_barrel_batted_rate,away_batter_3_hard_hit_percent,away_batter_3_avg_best_speed,away_batter_3_avg_hyper_speed,away_batter_3_whiff_percent,away_batter_3_swing_percent,away_batter_4_player_id,away_batter_4_pa,away_batter_4_k_percent,away_batter_4_bb_percent,away_batter_4_woba,away_batter_4_xwoba,away_batter_4_sweet_spot_percent,away_batter_4_barrel_batted_rate,away_batter_4_hard_hit_percent,away_batter_4_avg_best_speed,away_batter_4_avg_hyper_speed,away_batter_4_whiff_percent,away_batter_4_swing_percent,away_batter_5_player_id,away_batter_5_pa,away_batter
_5_k_percent,away_batter_5_bb_percent,away_batter_5_woba,away_batter_5_xwoba,away_batter_5_sweet_spot_percent,away_batter_5_barrel_batted_rate,away_batter_5_hard_hit_percent,away_batter_5_avg_best_speed,away_batter_5_avg_hyper_speed,away_batter_5_whiff_percent,away_batter_5_swing_percent,home_top5_avg_xwoba,away_top5_avg_xwoba,home_top5_avg_woba,away_top5_avg_woba,home_top5_avg_k_percent,away_top5_avg_k_percent,home_top5_avg_bb_percent,away_top5_avg_bb_percent,home_top5_avg_barrel_batted_rate,away_top5_avg_barrel_batted_rate,home_top5_avg_sweet_spot_percent,away_top5_avg_sweet_spot_percent,home_top5_avg_hard_hit_percent,away_top5_avg_hard_hit_percent,home_top5_avg_avg_best_speed,away_top5_avg_avg_best_speed,home_top5_avg_avg_hyper_speed,away_top5_avg_avg_hyper_speed,home_top5_avg_whiff_percent,away_top5_avg_whiff_percent,home_top5_avg_swing_percent,away_top5_avg_swing_percent,away_rpg,home_rpg
835,893,360605105,False,KC,C. Young,CLE,C. Kluber,2016,860.0,26.4,6.6,0.275,0.282,34.4,6.0,30.4,75.860158,92.779869,28.2,48.5,406.0,23.2,10.6,0.394,0.375,36.0,12.7,37.1,79.994395,93.892737,27.0,45.5,False,False,C. Santana,J. Kipnis,F. Lindor,M. Napoli,L. Chisenhall,A. Escobar,W. Merrifield,E. Hosmer,S. Perez,T. Cruz,467793.0,688.0,14.4,14.4,0.370,0.385,29.3,9.8,43.2,101.171694,94.818326,19.2,39.2,543401.0,688.0,21.2,8.7,0.347,0.345,38.2,6.7,39.5,99.214971,93.986365,19.6,43.3,596019.0,684.0,12.9,8.3,0.340,0.338,34.6,4.1,33.5,98.275621,93.192311,17.1,47.6,435063.0,645.0,30.1,12.1,0.343,0.340,33.2,13.3,39.4,100.694645,94.626005,30.5,43.8,502082.0,418.0,16.7,5.5,0.327,0.292,33.2,2.8,24.8,96.412980,92.206837,17.4,56.7,444876.0,682.0,14.1,4.0,0.278,0.262,31.7,1.4,17.6,93.292751,90.768536,20.6,53.7,593160.0,332.0,21.7,5.7,0.309,0.292,29.9,2.9,34.4,97.985060,93.070424,20.2,48.4,543333.0,667.0,19.8,8.5,0.326,0.330,26.0,9.2,44.2,102.281034,95.409043,26.0,48.6,521692.0,546.0,21.8,4.0,0.308,0.288,29.5,6.3,37.8,99.609484,93.912413,23.4,54.8,,,,,,,,,,,,,,0.34000,0.2930,0.34540,0.30525,19.060,19.350,9.800,5.550,7.340,4.950,33.700,29.275,36.080,33.500,99.153982,98.292082,93.765969,93.290104,20.760,22.550,46.120,51.375,0.51,0.53
1077,1147,360623102,False,CHW,J. Shields,BOS,R. Porcello,2016,890.0,21.2,3.6,0.274,0.301,32.8,7.0,35.1,77.867292,93.520287,18.6,48.4,821.0,16.4,10.0,0.377,0.359,34.6,8.4,33.3,78.659148,93.438885,22.3,44.0,False,False,M. Betts,D. Pedroia,X. Bogaerts,D. Ortiz,R. LaMarre,T. Anderson,A. Eaton,J. Abreu,M. Cabrera,T. Frazier,605141.0,730.0,11.0,6.7,0.379,0.336,31.7,5.2,39.9,99.700589,94.049945,14.2,41.3,456030.0,698.0,10.5,8.7,0.358,0.331,35.3,2.5,28.9,96.143655,92.220249,12.2,43.1,593428.0,719.0,17.1,8.1,0.348,0.319,28.0,5.3,32.7,99.219484,93.661599,20.0,45.4,120074.0,626.0,13.7,12.8,0.419,0.430,39.3,15.5,47.2,103.045534,96.143184,19.4,43.5,,,,,,,,,,,,,,542881.0,42.0,26.2,9.5,0.190,0.189,18.5,0.0,7.4,87.281907,88.996708,28.0,53.6,594809.0,706.0,16.3,8.9,0.344,0.330,31.1,3.1,36.2,98.603474,93.475308,17.2,47.7,547989.0,695.0,18.0,6.8,0.349,0.348,32.7,7.9,39.4,100.621289,94.406076,23.8,51.1,408234.0,679.0,17.1,11.0,0.399,0.449,40.1,15.9,50.6,104.209911,96.808753,22.5,48.7,453943.0,666.0,24.5,9.6,0.326,0.314,31.7,10.1,35.6,99.373684,93.753147,28.3,46.7,0.35400,0.3260,0.37600,0.32160,13.075,20.420,9.075,9.160,7.125,7.400,33.575,30.820,37.175,33.840,99.527315,98.018053,94.018744,93.487998,16.450,23.960,43.325,49.560,0.51,0.71
2185,2313,360915107,True,OAK,D. Mengden,KC,E. Volquez,2016,853.0,16.3,8.9,0.343,0.347,32.2,6.3,35.5,78.467345,93.710161,19.8,45.8,332.0,21.4,9.9,0.355,0.346,32.1,5.8,35.3,76.947662,93.598884,21.9,45.1,False,False,J. Dyson,T. Gore,W. Merrifield,E. Hosmer,D. Nava,J. Wendle,D. Valencia,S. Vogt,K. Davis,M. Muncy,502481.0,337.0,11.6,7.7,0.316,0.290,26.8,0.4,21.9,95.156097,91.710174,13.5,45.9,,,,,,,,,,,,,,593160.0,332.0,21.7,5.7,0.309,0.292,29.9,2.9,34.4,97.985060,93.070424,20.2,48.4,543333.0,667.0,19.8,8.5,0.326,0.330,26.0,9.2,44.2,102.281034,95.409043,26.0,48.6,537953.0,148.0,20.3,6.8,0.266,0.315,40.8,2.9,34.0,97.270287,92.874888,16.0,44.1,621563.0,104.0,15.4,5.8,0.265,0.298,32.9,1.2,36.6,98.619570,93.611758,15.3,49.9,502143.0,517.0,22.2,7.9,0.342,0.348,35.2,7.8,41.1,101.449816,94.886325,28.2,44.1,519390.0,532.0,15.6,6.6,0.305,0.313,40.7,5.1,26.8,96.084116,92.042396,17.2,45.4,501981.0,610.0,27.2,6.9,0.349,0.369,31.5,17.0,48.2,103.007387,95.874377,33.9,51.9,571970.0,133.0,18.0,15.0,0.262,0.311,28.1,3.4,27.0,95.897581,91.995797,21.2,35.7,0.30675,0.3278,0.30425,0.30460,18.350,19.680,7.175,8.440,3.850,6.900,30.875,33.680,33.625,35.940,98.173119,99.011694,93.266132,93.682131,18.925,23.160,46.750,45.400,0.39,0.51
2455,2600,361025105,False,CHC,J. Lester,CLE,C. Kluber,2016,860.0,26.4,6.6,0.275,0.282,34.4,6.0,30.4,75.860158,92.779869,28.2,48.5,796.0,24.7,6.5,0.264,0.272,31.2,4.1,29.2,77.528217,92.686431,24.0,47.1,False,False,R. Davis,J. Kipnis,F. Lindor,M. Napoli,C. Santana,D. Fowler,K. Bryant,A. Rizzo,B. Zobrist,K. Schwarber,434658.0,495.0,21.4,6.7,0.302,0.283,29.9,3.4,29.9,96.624117,92.325844,24.1,52.1,543401.0,688.0,21.2,8.7,0.347,0.345,38.2,6.7,39.5,99.214971,93.986365,19.6,43.3,596019.0,684.0,12.9,8.3,0.340,0.338,34.6,4.1,33.5,98.275621,93.192311,17.1,47.6,435063.0,645.0,30.1,12.1,0.343,0.340,33.2,13.3,39.4,100.694645,94.626005,30.5,43.8,467793.0,688.0,14.4,14.4,0.370,0.385,29.3,9.8,43.2,101.171694,94.818326,19.2,39.2,451594.0,551.0,22.5,14.3,0.367,0.350,37.1,5.6,33.8,98.048856,93.030672,22.0,39.2,592178.0,699.0,22.0,10.7,0.396,0.386,36.3,11.7,38.9,100.570830,94.490281,28.1,48.5,519203.0,676.0,16.0,10.9,0.391,0.363,31.6,9.2,36.2,99.883568,93.995997,19.8,44.9,450314.0,631.0,13.0,15.2,0.360,0.357,32.5,4.5,35.6,98.199112,93.170031,12.8,35.5,,,,,,,,,,,,,,0.33820,0.3640,0.34040,0.37850,20.000,18.375,10.040,12.775,7.460,7.750,33.040,34.375,37.100,36.125,99.196210,99.175591,93.789770,93.671745,22.100,20.675,45.200,42.025,0.67,0.53
2456,2601,361026105,False,CHC,J. Arrieta,CLE,T. Bauer,2016,811.0,20.7,8.6,0.310,0.325,30.3,5.7,39.7,79.530938,94.059035,22.5,44.2,795.0,23.9,9.6,0.261,0.291,30.8,4.8,29.6,76.361267,92.514069,25.2,45.6,False,False,C. Santana,J. Kipnis,F. Lindor,M. Napoli,J. Ramirez,D. Fowler,K. Bryant,A. Rizzo,B. Zobrist,K. Schwarber,467793.0,688.0,14.4,14.4,0.370,0.385,29.3,9.8,43.2,101.171694,94.818326,19.2,39.2,543401.0,688.0,21.2,8.7,0.347,0.345,38.2,6.7,39.5,99.214971,93.986365,19.6,43.3,596019.0,684.0,12.9,8.3,0.340,0.338,34.6,4.1,33.5,98.275621,93.192311,17.1,47.6,435063.0,645.0,30.1,12.1,0.343,0.340,33.2,13.3,39.4,100.694645,94.626005,30.5,43.8,608070.0,618.0,10.0,7.1,0.355,0.331,35.0,3.0,32.9,97.426160,92.831258,12.7,44.0,451594.0,551.0,22.5,14.3,0.367,0.350,37.1,5.6,33.8,98.048856,93.030672,22.0,39.2,592178.0,699.0,22.0,10.7,0.396,0.386,36.3,11.7,38.9,100.570830,94.490281,28.1,48.5,519203.0,676.0,16.0,10.9,0.391,0.363,31.6,9.2,36.2,99.883568,93.995997,19.8,44.9,450314.0,631.0,13.0,15.2,0.360,0.357,32.5,4.5,35.6,98.199112,93.170031,12.8,35.5,,,,,,,,,,,,,,0.34780,0.3640,0.35100,0.37850,17.720,18.375,10.120,12.775,7.380,7.750,34.060,34.375,37.700,36.125,99.356618,99.175591,93.890853,93.671745,19.820,20.675,43.580,42.025,0.67,0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12296,13264,401228645,False,OAK,D. Jefferies,LAA,R. Detmers,2021,101.0,18.8,10.9,0.390,0.357,36.2,11.6,33.3,78.924188,93.435800,26.8,48.3,58.0,13.8,6.9,0.278,0.305,37.8,6.7,35.6,79.791709,93.041794,17.0,50.9,False,True,D. Fletcher,S. Ohtani,J. Upton,M. Stassi,M. Thaiss,M. Canha,S. Marte,M. Olson,J. Lowrie,R. Laureano,664058.0,665.0,9.0,4.7,0.273,0.280,30.4,0.0,15.7,92.661183,90.514600,9.5,46.5,660271.0,639.0,29.6,15.0,0.393,0.408,35.4,22.3,53.6,105.900177,97.802022,35.1,45.8,457708.0,362.0,29.6,10.8,0.305,0.322,30.4,11.7,41.6,101.713944,95.160524,31.4,45.1,545358.0,319.0,31.7,8.8,0.327,0.323,35.2,11.0,42.5,100.699397,94.626615,31.5,44.5,,,,,,,,,,,,,,592192.0,625.0,20.5,12.3,0.333,0.332,36.1,7.1,35.4,97.938321,93.004303,21.5,40.5,516782.0,526.0,18.8,8.2,0.364,0.347,32.6,8.4,39.6,100.239517,94.207705,24.6,49.5,621566.0,673.0,16.8,13.1,0.379,0.378,32.4,12.7,48.8,102.889107,95.968663,23.1,47.0,476704.0,512.0,21.1,9.6,0.311,0.343,36.5,9.3,45.9,100.200115,94.609258,24.2,46.7,657656.0,378.0,25.9,7.1,0.327,0.335,37.7,10.7,40.2,100.500252,94.372962,27.0,48.4,0.33325,0.3470,0.32450,0.34280,24.975,20.620,9.825,10.060,11.250,9.640,32.850,35.060,38.350,41.980,100.243675,100.353462,94.525940,94.432578,26.875,24.080,45.475,46.420,0.49,0.55
12470,13458,401228823,True,CLE,T. McKenzie,DET,D. Hutchison,2021,91.0,11.0,12.1,0.305,0.365,28.6,5.7,44.3,80.603417,95.276315,23.3,48.1,495.0,27.5,11.7,0.296,0.322,36.2,9.7,42.3,80.432045,94.502812,27.9,48.4,True,False,R. Grossman,J. Robson,J. Schoop,M. Cabrera,J. Candelario,M. Straw,A. Rosario,J. Ramirez,F. Reyes,B. Zimmer,543257.0,671.0,23.1,14.6,0.337,0.334,39.5,7.6,34.1,97.650888,92.935304,21.6,38.4,,,,,,,,,,,,,,570731.0,674.0,19.7,5.5,0.324,0.311,30.1,6.8,40.2,101.189262,94.661372,24.7,52.5,408234.0,526.0,22.4,7.6,0.305,0.315,32.8,8.3,49.0,101.953387,95.613068,25.8,49.4,600869.0,626.0,21.6,10.4,0.344,0.356,38.6,9.0,39.1,99.820429,94.030827,23.5,47.6,664702.0,638.0,19.0,10.5,0.311,0.303,42.6,1.3,26.2,95.725614,91.886357,13.6,40.4,642708.0,588.0,20.4,5.3,0.315,0.300,32.0,2.8,43.5,100.036134,94.229745,24.8,48.9,608070.0,636.0,13.7,11.3,0.372,0.376,33.4,11.1,42.4,100.831040,94.777982,14.9,43.0,614177.0,466.0,32.0,9.2,0.354,0.353,31.3,16.9,48.5,104.674993,96.702060,33.7,45.4,605548.0,348.0,35.1,8.6,0.300,0.317,30.4,9.4,42.5,102.513402,95.475187,36.4,44.8,0.32900,0.3298,0.32750,0.33040,21.700,24.040,9.525,8.980,7.925,8.300,35.250,33.940,40.600,40.620,100.153491,100.756237,94.310143,94.614266,23.900,24.680,46.975,44.500,0.57,0.34
13089,14148,401229455,False,CHC,A. Sampson,STL,J. Lester,2021,627.0,14.5,8.8,0.352,0.343,33.5,7.7,37.2,76.810551,93.976800,20.1,46.4,145.0,19.3,5.5,0.329,0.339,31.1,11.7,42.7,77.678274,94.615069,21.3,50.0,False,False,H. Bader,P. Goldschmidt,M. Carpenter,T. O'Neill,N. Arenado,W. Contreras,E. Castillo,F. Schwindel,I. Happ,M. Duffy,664056.0,401.0,21.2,6.7,0.331,0.290,27.1,7.0,31.7,98.203918,93.140673,24.7,47.5,502671.0,679.0,20.0,9.9,0.373,0.397,36.9,13.6,50.6,102.744109,96.119573,24.9,44.2,572761.0,249.0,30.9,14.1,0.269,0.340,45.0,11.5,42.0,99.459286,94.093494,29.7,38.7,641933.0,537.0,31.3,7.1,0.384,0.389,39.9,17.9,52.2,104.172238,97.016213,34.7,49.8,571448.0,653.0,14.7,7.7,0.336,0.311,34.1,6.7,37.5,98.719643,93.570965,18.5,46.8,575929.0,483.0,28.6,10.8,0.337,0.346,30.1,11.1,48.4,103.658236,96.307225,34.7,45.1,,,,,,,,,,,,,,643524.0,259.0,15.8,6.2,0.403,0.332,32.8,8.0,39.8,100.182661,94.153569,23.2,52.1,664023.0,535.0,29.2,11.6,0.328,0.317,30.8,10.9,41.3,100.682785,94.566083,32.3,43.9,622110.0,322.0,19.6,7.8,0.327,0.325,40.5,4.0,34.8,97.109603,92.701935,15.4,42.4,0.34540,0.3300,0.33860,0.34875,23.620,23.300,9.100,9.100,11.340,8.500,36.600,33.550,42.800,41.075,100.659839,100.408321,94.788183,94.432203,26.500,26.400,45.400,45.875,0.52,0.69
13195,14264,401246352,True,MIL,B. Woodruff,LAD,C. Kershaw,2020,221.0,28.1,3.6,0.251,0.271,29.3,8.0,34.0,76.446134,93.625258,27.8,49.8,293.0,31.1,6.1,0.264,0.264,26.1,7.2,33.9,74.869262,93.347296,28.9,48.3,False,False,M. Betts,C. Seager,J. Turner,M. Muncy,W. Smith,A. Garcia,C. Yelich,J. Gyorko,R. Healy,O. Arcia,605141.0,246.0,15.4,9.8,0.390,0.362,40.1,7.7,43.4,99.966804,94.311948,13.8,40.5,608369.0,232.0,15.9,7.3,0.394,0.434,38.4,15.8,55.9,103.805089,96.925113,25.8,55.6,457759.0,175.0,14.9,10.3,0.376,0.403,40.8,11.2,44.0,99.373362,94.028252,20.5,42.9,571970.0,248.0,24.2,15.7,0.316,0.360,29.0,12.4,40.0,99.706862,94.042932,26.8,36.6,669257.0,137.0,16.1,14.6,0.411,0.405,41.9,12.9,47.3,100.232230,94.728068,15.5,38.2,541645.0,207.0,23.7,9.7,0.296,0.317,37.1,3.8,37.1,99.290871,93.692910,34.7,54.6,592885.0,247.0,30.8,18.6,0.343,0.378,32.3,12.1,55.6,103.843597,97.020651,33.6,34.6,576397.0,135.0,28.1,11.1,0.351,0.357,35.8,16.0,43.2,99.834025,94.201655,31.8,42.2,,,,,,,,,,,,,,606115.0,189.0,16.9,7.4,0.317,0.338,37.3,5.6,38.7,99.346351,93.890230,23.1,46.8,0.39280,0.3475,0.37740,0.32675,17.300,24.875,11.540,11.700,12.000,9.375,38.040,35.625,46.120,43.650,100.616869,100.578711,94.807262,94.701362,20.480,30.800,42.760,44.550,0.24,0.74


Based on the few rows I've inspected, the 93 remaining rows that contain NaN values involve players who are missing from the batter_stats CSV, so we can safely drop them.

In [109]:
# Discard every row that still has at least one missing value, then preview the result.
df = df.dropna(axis=0, how="any")
df.head(n=5)

Unnamed: 0,index,game_id,NRFI,away_team,away_pitcher,home_team,home_pitcher,year,home_pitcher_pa,home_pitcher_k_percent,home_pitcher_bb_percent,home_pitcher_woba,home_pitcher_xwoba,home_pitcher_sweet_spot_percent,home_pitcher_barrel_batted_rate,home_pitcher_hard_hit_percent,home_pitcher_avg_best_speed,home_pitcher_avg_hyper_speed,home_pitcher_whiff_percent,home_pitcher_swing_percent,away_pitcher_pa,away_pitcher_k_percent,away_pitcher_bb_percent,away_pitcher_woba,away_pitcher_xwoba,away_pitcher_sweet_spot_percent,away_pitcher_barrel_batted_rate,away_pitcher_hard_hit_percent,away_pitcher_avg_best_speed,away_pitcher_avg_hyper_speed,away_pitcher_whiff_percent,away_pitcher_swing_percent,home_pitcher_low_sample,away_pitcher_low_sample,home_batter_1,home_batter_2,home_batter_3,home_batter_4,home_batter_5,away_batter_1,away_batter_2,away_batter_3,away_batter_4,away_batter_5,home_batter_1_player_id,home_batter_1_pa,home_batter_1_k_percent,home_batter_1_bb_percent,home_batter_1_woba,home_batter_1_xwoba,home_batter_1_sweet_spot_percent,home_batter_1_barrel_batted_rate,home_batter_1_hard_hit_percent,home_batter_1_avg_best_speed,home_batter_1_avg_hyper_speed,home_batter_1_whiff_percent,home_batter_1_swing_percent,home_batter_2_player_id,home_batter_2_pa,home_batter_2_k_percent,home_batter_2_bb_percent,home_batter_2_woba,home_batter_2_xwoba,home_batter_2_sweet_spot_percent,home_batter_2_barrel_batted_rate,home_batter_2_hard_hit_percent,home_batter_2_avg_best_speed,home_batter_2_avg_hyper_speed,home_batter_2_whiff_percent,home_batter_2_swing_percent,home_batter_3_player_id,home_batter_3_pa,home_batter_3_k_percent,home_batter_3_bb_percent,home_batter_3_woba,home_batter_3_xwoba,home_batter_3_sweet_spot_percent,home_batter_3_barrel_batted_rate,home_batter_3_hard_hit_percent,home_batter_3_avg_best_speed,home_batter_3_avg_hyper_speed,home_batter_3_whiff_percent,home_batter_3_swing_percent,home_batter_4_player_id,home_batter_4_pa,home_batter_4_k_percent,home_batter_4_bb_percent,home_bat
ter_4_woba,home_batter_4_xwoba,home_batter_4_sweet_spot_percent,home_batter_4_barrel_batted_rate,home_batter_4_hard_hit_percent,home_batter_4_avg_best_speed,home_batter_4_avg_hyper_speed,home_batter_4_whiff_percent,home_batter_4_swing_percent,home_batter_5_player_id,home_batter_5_pa,home_batter_5_k_percent,home_batter_5_bb_percent,home_batter_5_woba,home_batter_5_xwoba,home_batter_5_sweet_spot_percent,home_batter_5_barrel_batted_rate,home_batter_5_hard_hit_percent,home_batter_5_avg_best_speed,home_batter_5_avg_hyper_speed,home_batter_5_whiff_percent,home_batter_5_swing_percent,away_batter_1_player_id,away_batter_1_pa,away_batter_1_k_percent,away_batter_1_bb_percent,away_batter_1_woba,away_batter_1_xwoba,away_batter_1_sweet_spot_percent,away_batter_1_barrel_batted_rate,away_batter_1_hard_hit_percent,away_batter_1_avg_best_speed,away_batter_1_avg_hyper_speed,away_batter_1_whiff_percent,away_batter_1_swing_percent,away_batter_2_player_id,away_batter_2_pa,away_batter_2_k_percent,away_batter_2_bb_percent,away_batter_2_woba,away_batter_2_xwoba,away_batter_2_sweet_spot_percent,away_batter_2_barrel_batted_rate,away_batter_2_hard_hit_percent,away_batter_2_avg_best_speed,away_batter_2_avg_hyper_speed,away_batter_2_whiff_percent,away_batter_2_swing_percent,away_batter_3_player_id,away_batter_3_pa,away_batter_3_k_percent,away_batter_3_bb_percent,away_batter_3_woba,away_batter_3_xwoba,away_batter_3_sweet_spot_percent,away_batter_3_barrel_batted_rate,away_batter_3_hard_hit_percent,away_batter_3_avg_best_speed,away_batter_3_avg_hyper_speed,away_batter_3_whiff_percent,away_batter_3_swing_percent,away_batter_4_player_id,away_batter_4_pa,away_batter_4_k_percent,away_batter_4_bb_percent,away_batter_4_woba,away_batter_4_xwoba,away_batter_4_sweet_spot_percent,away_batter_4_barrel_batted_rate,away_batter_4_hard_hit_percent,away_batter_4_avg_best_speed,away_batter_4_avg_hyper_speed,away_batter_4_whiff_percent,away_batter_4_swing_percent,away_batter_5_player_id,away_batter_5_pa,away_batter
_5_k_percent,away_batter_5_bb_percent,away_batter_5_woba,away_batter_5_xwoba,away_batter_5_sweet_spot_percent,away_batter_5_barrel_batted_rate,away_batter_5_hard_hit_percent,away_batter_5_avg_best_speed,away_batter_5_avg_hyper_speed,away_batter_5_whiff_percent,away_batter_5_swing_percent,home_top5_avg_xwoba,away_top5_avg_xwoba,home_top5_avg_woba,away_top5_avg_woba,home_top5_avg_k_percent,away_top5_avg_k_percent,home_top5_avg_bb_percent,away_top5_avg_bb_percent,home_top5_avg_barrel_batted_rate,away_top5_avg_barrel_batted_rate,home_top5_avg_sweet_spot_percent,away_top5_avg_sweet_spot_percent,home_top5_avg_hard_hit_percent,away_top5_avg_hard_hit_percent,home_top5_avg_avg_best_speed,away_top5_avg_avg_best_speed,home_top5_avg_avg_hyper_speed,away_top5_avg_avg_hyper_speed,home_top5_avg_whiff_percent,away_top5_avg_whiff_percent,home_top5_avg_swing_percent,away_top5_avg_swing_percent,away_rpg,home_rpg
0,0,360403107,False,NYM,M. Harvey,KC,E. Volquez,2016,853.0,16.3,8.9,0.343,0.347,32.2,6.3,35.5,78.467345,93.710161,19.8,45.8,402.0,18.9,6.2,0.341,0.315,38.0,4.3,37.0,77.798122,93.65871,21.8,50.9,False,False,A. Escobar,M. Moustakas,L. Cain,E. Hosmer,K. Morales,C. Granderson,D. Wright,Y. Cespedes,L. Duda,N. Walker,444876.0,682.0,14.1,4.0,0.278,0.262,31.7,1.4,17.6,93.292751,90.768536,20.6,53.7,519058.0,113.0,11.5,8.0,0.339,0.367,31.9,8.8,42.9,102.207687,95.369685,16.2,42.2,456715.0,434.0,19.4,7.1,0.322,0.319,35.6,4.4,33.4,99.101392,93.598155,23.2,47.7,543333.0,667.0,19.8,8.5,0.326,0.33,26.0,9.2,44.2,102.281034,95.409043,26.0,48.6,434778.0,618.0,19.4,7.8,0.339,0.389,34.5,10.8,49.9,102.729618,95.980574,24.0,48.0,434158.0,633.0,20.5,11.7,0.339,0.333,33.8,6.9,33.1,97.844559,93.013355,22.0,36.5,431151.0,163.0,33.7,16.0,0.344,0.377,48.8,19.5,46.3,102.290324,95.601864,36.5,38.2,493316.0,543.0,19.9,9.4,0.369,0.374,32.9,9.8,47.5,102.453599,95.709705,24.7,47.0,446263.0,172.0,20.9,8.7,0.304,0.351,38.7,10.1,45.4,102.014903,95.235385,22.0,41.0,435522.0,458.0,18.3,9.2,0.351,0.353,36.6,7.3,37.5,98.645219,93.51196,19.9,46.7,0.3334,0.3576,0.3208,0.3414,16.84,22.66,7.08,11.0,6.92,10.72,31.94,38.16,37.6,41.96,99.922496,100.649721,94.225198,94.614454,22.0,25.02,48.04,41.88,0.45,0.51
1,1,360403123,True,STL,A. Wainwright,PIT,F. Liriano,2016,731.0,23.0,11.6,0.336,0.32,28.6,6.4,33.3,78.130932,93.357774,28.2,43.3,847.0,19.0,7.0,0.335,0.329,37.5,5.9,30.2,76.527704,92.663444,20.2,44.1,False,False,J. Jaso,A. McCutchen,D. Freese,S. Marte,F. Cervelli,M. Carpenter,M. Adams,M. Holliday,R. Grichuk,S. Piscotty,444379.0,432.0,17.1,10.4,0.335,0.332,32.8,4.9,32.1,97.253425,92.646669,17.2,40.7,457705.0,675.0,21.2,10.2,0.329,0.343,32.1,8.5,41.7,100.058442,94.291172,25.5,46.2,501896.0,492.0,28.9,9.1,0.334,0.324,31.2,8.1,48.5,101.73211,95.372984,27.8,44.4,516782.0,529.0,19.7,4.3,0.351,0.332,35.8,5.4,37.0,99.619785,93.866708,25.6,54.2,465041.0,393.0,18.3,14.2,0.318,0.325,27.4,1.5,30.9,97.284936,92.673323,22.2,38.1,572761.0,566.0,19.1,14.3,0.375,0.397,43.0,11.3,41.9,99.808493,94.367605,19.3,38.5,571431.0,327.0,24.8,7.6,0.331,0.325,38.8,11.0,40.2,99.7008,94.17938,26.9,50.9,407812.0,426.0,16.7,8.2,0.335,0.351,27.9,8.0,47.8,103.627355,96.375906,20.8,48.3,545341.0,478.0,29.5,5.9,0.325,0.321,27.8,9.8,46.1,102.246731,95.409579,30.3,53.8,572039.0,648.0,20.5,7.9,0.345,0.343,33.4,8.6,37.8,99.313961,93.753641,25.8,52.4,0.3312,0.3474,0.3334,0.3422,21.04,22.12,9.64,8.78,5.68,9.74,31.86,34.18,38.04,42.76,99.18974,100.939468,93.770171,94.817222,23.66,24.62,44.72,48.78,0.43,0.64
2,2,360403130,False,TOR,M. Stroman,TB,C. Archer,2016,850.0,27.4,7.9,0.304,0.302,29.7,8.4,40.5,80.179654,94.340096,29.6,45.5,854.0,19.4,6.3,0.312,0.306,26.5,4.9,41.9,79.928197,94.684735,21.5,47.3,False,False,L. Forsythe,L. Morrison,E. Longoria,C. Dickerson,D. Jennings,K. Pillar,J. Donaldson,J. Bautista,E. Encarnacion,T. Tulowitzki,523253.0,567.0,22.4,8.1,0.336,0.352,38.9,7.5,38.9,99.05198,93.678695,22.7,39.6,489149.0,398.0,22.4,9.3,0.318,0.338,32.0,7.5,41.7,101.63145,95.050931,24.4,46.6,446334.0,685.0,21.0,6.1,0.35,0.361,38.1,12.1,41.5,100.450586,94.47694,26.1,48.5,572816.0,548.0,24.5,6.0,0.319,0.304,34.1,11.1,38.1,99.143156,93.681686,29.1,55.9,457775.0,225.0,25.8,9.3,0.278,0.281,27.8,4.9,33.3,97.731985,92.891593,31.0,43.0,607680.0,584.0,15.4,4.1,0.295,0.289,27.8,3.0,30.2,98.179315,93.089802,18.5,49.6,518626.0,700.0,17.0,15.6,0.403,0.403,32.4,12.3,49.2,102.982235,96.025999,24.1,41.7,430832.0,517.0,19.9,16.8,0.355,0.36,30.6,8.3,45.7,102.858748,95.786249,21.2,36.7,429665.0,701.0,19.7,12.4,0.373,0.38,35.9,12.3,40.8,100.992647,94.735781,23.6,42.4,453064.0,544.0,18.6,7.9,0.327,0.351,33.7,9.6,40.8,99.968194,94.172079,20.9,44.6,0.3272,0.3566,0.3202,0.3506,23.22,18.12,7.76,11.36,8.62,9.1,34.18,32.08,38.7,41.34,99.601831,100.996228,93.955969,94.761982,26.66,21.66,46.72,43.0,0.64,0.54
3,3,360404101,True,MIN,E. Santana,BAL,C. Tillman,2016,715.0,19.6,9.2,0.317,0.33,34.7,7.8,35.3,79.233002,93.50001,21.9,45.7,748.0,19.9,7.1,0.296,0.313,32.8,5.9,31.7,78.073855,93.358088,22.9,47.2,False,False,M. Machado,A. Jones,C. Davis,M. Trumbo,M. Wieters,B. Dozier,J. Mauer,M. Sano,T. Plouffe,E. Rosario,592518.0,696.0,17.2,6.9,0.366,0.366,34.9,9.3,43.2,102.442245,95.431908,22.3,49.3,430945.0,672.0,17.1,5.8,0.319,0.328,31.8,8.0,34.7,99.066222,93.61254,26.2,60.4,448801.0,665.0,32.9,13.2,0.34,0.353,35.7,15.1,44.0,101.56511,95.133378,37.1,42.6,444432.0,667.0,25.5,7.6,0.358,0.37,31.8,15.6,44.7,104.542708,96.586321,29.3,49.3,446308.0,464.0,18.3,6.9,0.307,0.324,34.5,7.3,33.0,97.647514,92.901723,23.6,51.1,572821.0,691.0,20.0,8.8,0.37,0.329,31.2,8.3,36.2,98.504496,93.326306,23.4,43.7,408045.0,576.0,16.1,13.7,0.327,0.357,36.5,6.0,42.9,99.073111,93.903651,15.3,35.5,593934.0,495.0,36.0,10.9,0.334,0.328,32.4,14.1,49.2,102.745568,96.106948,35.7,41.0,461858.0,344.0,17.4,5.5,0.311,0.328,29.3,6.5,40.3,100.189273,94.348746,19.8,45.4,592696.0,354.0,25.7,3.4,0.304,0.271,33.3,4.8,31.7,97.104328,92.584275,28.7,57.5,0.3482,0.3226,0.338,0.3292,22.2,23.04,8.08,8.46,11.06,7.94,33.74,32.54,39.92,40.06,101.05276,99.523355,94.733174,94.053985,27.7,24.58,50.54,44.62,0.48,0.53
4,4,360404103,False,CHC,J. Arrieta,LAA,G. Richards,2016,148.0,23.0,10.1,0.3,0.323,34.7,5.1,38.8,78.102554,94.00894,24.0,46.8,795.0,23.9,9.6,0.261,0.291,30.8,4.8,29.6,76.361267,92.514069,25.2,45.6,False,False,Y. Escobar,D. Nava,M. Trout,A. Pujols,K. Calhoun,D. Fowler,J. Heyward,B. Zobrist,A. Rizzo,K. Bryant,488862.0,567.0,11.8,7.1,0.327,0.316,27.8,2.8,36.1,98.884,93.515528,16.7,48.7,537953.0,148.0,20.3,6.8,0.266,0.315,40.8,2.9,34.0,97.270287,92.874888,16.0,44.1,545361.0,681.0,20.1,17.0,0.418,0.427,37.6,14.4,40.3,102.379849,95.442651,20.7,38.3,405395.0,650.0,11.5,7.5,0.331,0.367,31.5,9.2,44.1,101.520634,95.10741,15.7,45.8,594777.0,672.0,17.6,10.0,0.34,0.35,36.2,6.2,40.5,99.68832,94.056377,25.3,49.5,451594.0,551.0,22.5,14.3,0.367,0.35,37.1,5.6,33.8,98.048856,93.030672,22.0,39.2,518792.0,592.0,15.7,9.1,0.282,0.301,28.9,3.0,30.9,97.470768,92.752996,17.0,41.5,450314.0,631.0,13.0,15.2,0.36,0.357,32.5,4.5,35.6,98.199112,93.170031,12.8,35.5,519203.0,676.0,16.0,10.9,0.391,0.363,31.6,9.2,36.2,99.883568,93.995997,19.8,44.9,592178.0,699.0,22.0,10.7,0.396,0.386,36.3,11.7,38.9,100.57083,94.490281,28.1,48.5,0.355,0.3514,0.3364,0.3592,16.26,17.84,9.68,12.04,7.1,6.8,34.78,33.28,39.0,35.08,99.948618,98.834627,94.199371,93.487995,18.88,19.94,45.28,41.92,0.67,0.64


After all of the cleaning we are still left with 13,255 rows, which is pretty good.

# Build Model

In [110]:
# Prepare df for the model: keep the non-text columns plus integer-coded teams.
from sklearn.preprocessing import LabelEncoder

team_cols = ["away_team", "home_team"]

# Drop every text (object-dtype) column except the two team columns.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
string_cols = df.select_dtypes(include=['object']).columns
string_cols = string_cols.drop(team_cols)
df_encoded = df.drop(columns=string_cols)

# Fit ONE encoder on the union of both team columns. Re-fitting per column
# (as before) can assign the same team different integers in `away_team` and
# `home_team` whenever the two columns do not contain the exact same set of
# teams, and leaves `le` valid only for the column that was fitted last.
le = LabelEncoder()
le.fit(pd.concat([df_encoded[col] for col in team_cols], ignore_index=True))

for col in team_cols:
    df_encoded[col] = le.transform(df_encoded[col])

In [111]:
# Optional exploratory plot, currently disabled: a seaborn pairplot of every
# column in df_encoded, coloured by the NRFI label.
# NOTE(review): presumably commented out because df_encoded has far too many
# columns for a pairplot to be practical — confirm, then delete or restrict it
# to a small subset of columns.
# import seaborn as sns

# sns.pairplot(df_encoded, hue='NRFI')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Select features by name instead of by position. These three columns are
# currently the first three of df_encoded (so this matches the old
# `iloc[:, 3:]`), but a positional slice would silently feed the label or an
# identifier to the model if the column order ever changed. A KeyError here
# means the upstream schema changed and this list needs updating.
NON_FEATURE_COLS = ["index", "game_id", "NRFI"]
X = df_encoded.drop(columns=NON_FEATURE_COLS)
y = df_encoded["NRFI"]

# Split dataset into training set and test set (70% training, 30% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Shallow, constrained tree to limit overfitting.
decision_sklearn = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=10,  # regularization
    max_leaf_nodes=20,    # cap complexity
    random_state=42
)
decision_sklearn = decision_sklearn.fit(X_train, y_train)

# Kept for later inspection; not used by the accuracy report below.
y_pred = decision_sklearn.predict(X_test)

# NOTE(review): compare these numbers with the majority-class rate of `y`
# before drawing conclusions about how much the tree has learned.
print("Train acc:", decision_sklearn.score(X_train, y_train))
print("Test acc:", decision_sklearn.score(X_test, y_test))

Train acc: 0.5544298340159517
Test acc: 0.5267789791299975
