In [1176]:
import pandas as pd
from sqlalchemy import text
from create_il_movement import setup_injury_parser
import os
from sqlalchemy import create_engine, text

In [1177]:
DATABASE_URL = "postgresql://ryan:cloude1379@127.0.0.1/baseball"
engine = create_engine(DATABASE_URL)

In [1178]:
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version();"))
        print("✅ Connected to PostgreSQL!")
        print(f"PostgreSQL version: {result.fetchone()[0]}")
except Exception as e:
    print("Failed to connect to PostgreSQL:")
    print(e)
    exit()

✅ Connected to PostgreSQL!
PostgreSQL version: PostgreSQL 13.21 (Debian 13.21-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [1179]:
query = "SELECT * FROM transactions where descr is not null and player is not null and player_id is not null"
with engine.connect() as conn:
    result = conn.execute(text(query))
    transactions = pd.DataFrame(result.fetchall(), columns=result.keys())

In [1180]:
def run_sql_file(filepath, engine):
    with open(filepath, 'r') as file:
        query = file.read()
    return pd.read_sql(query, engine)

In [1181]:
season_start_end = run_sql_file('sql/get_seasons_sart_end.sql', engine)

In [1183]:
transactions['date'] = pd.to_datetime(transactions['date'])
transactions['effective_date'] = pd.to_datetime(transactions['effective_date'])

In [1184]:
transactions['descr'] = transactions['descr'].str.lower()

In [1185]:
# Step 2: Mark IL placements only (ignoring activations/transfers)
transactions['is_il_placement'] = (
    (transactions['descr'].str.contains("injured list|disabled list")) &
    ~(transactions['descr'].str.contains("activated|reinstated|returned|transferred|recalled"))
)

In [1186]:
# Step 3: Identify if the previous transaction was also an IL placement
transactions['prev_il'] = (
    transactions.groupby('player_id')['is_il_placement']
    .shift(1)
    .fillna(False)
)

  .fillna(False)


In [1187]:
# Step 4: Keep only IL placements where the previous transaction was NOT also an IL placement
injury = transactions[
    (transactions['is_il_placement']) & (~transactions['prev_il'])
].copy()

In [1188]:
nlp, parse_injury_text = setup_injury_parser()

In [1189]:
# Step 1: Apply the NLP parser to each row
parsed = injury["descr"].apply(parse_injury_text)

# Step 2: Convert list of dicts to DataFrame
parsed_df = pd.DataFrame(parsed.tolist())

# Step 3: Concatenate the new columns
injury = pd.concat([injury.reset_index(drop=True), parsed_df.reset_index(drop=True)], axis=1)

In [1190]:
off_il = transactions[((transactions['descr'].str.contains("activated|reinsated|recalled|returned|alternate")) | (transactions['typecode'].isin(['DFA','REL','RET','OUT','DEI','DEC','OPT'])))
& (~transactions['descr'].str.contains("all-stars"))]

In [1191]:
off_il['effective_date'] = off_il['effective_date'].fillna(off_il['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  off_il['effective_date'] = off_il['effective_date'].fillna(off_il['date'])


In [1192]:
injury = injury.drop(['is_il_placement','prev_il'],axis=1)

In [1193]:
off_il = off_il.rename(columns={'effective_date':'effective_date_return'})

In [1194]:
off_il = off_il[['trans_id','effective_date_return','team', 'player', 'player_id', 'descr']]


In [1195]:
injury = injury.sort_values(['effective_date']).reset_index(drop=True)
off_il = off_il.sort_values(['effective_date_return']).reset_index(drop=True)
injury = pd.merge_asof(injury, off_il, by='player_id' ,left_on='effective_date', right_on='effective_date_return',
suffixes=('', '_return'), direction='forward')


In [1196]:
injury = injury.sort_values(['player_id', 'effective_date'])

In [None]:
injury['next_il_placement'] = injury.groupby(['player_id'])['effective_date'].shift(-1)
injury['next_il_team'] = injury.groupby(['player_id'])['team'].shift(-1)
injury['next_il_return_date'] = injury.groupby(['player_id'])['effective_date_return'].shift(-1)

In [1198]:
injury['season'] = injury['effective_date'].dt.year

In [1200]:
injury = injury.merge(season_start_end[['season', 'final_postseason','team']], how='left', left_on=['season','team_return'], right_on=['season','team'])

In [1201]:
injury = injury.drop(['team_y'],axis=1)

In [1202]:
injury['next_season'] = injury['season'] + 1

In [1203]:
injury = injury.merge(season_start_end[['season','opening_day','team']], how='left', \
left_on=['next_season','next_il_team'], right_on=['season','team'])

In [None]:
injury = injury.drop(['season_x','next_season','season_y'],axis=1)

In [1235]:
injury = injury.rename(columns={'opening_day':'next_opening_day'})
injury['next_opening_day'] = pd.to_datetime(injury['next_opening_day'])

In [None]:
injury['final_postseason'] = pd.to_datetime(injury['final_postseason'])

In [1222]:
injury['offseason_activation'] = injury.effective_date_return.between(injury.final_postseason, injury.final_postseason + pd.Timedelta(7,unit='days'))

In [1237]:
(injury.next_il_placement <= injury.next_opening_day) & (injury.next_il_placement.dt.year == injury.next_opening_day.dt.year)

0       False
1       False
2       False
3       False
4       False
        ...  
8187    False
8188    False
8189    False
8190    False
8191    False
Length: 8192, dtype: bool

In [1238]:
injury['pre_od_placement'] = (injury.next_il_placement <= injury.next_opening_day) & (injury.next_il_placement.dt.year == injury.next_opening_day.dt.year)

In [1244]:
injury[injury['pre_od_placement'] & injury['offseason_activation']]

Unnamed: 0,trans_id,effective_date,date,typecode,typedesc,team_x,player,player_id,descr,side,...,player_return,descr_return,next_il_placement,next_il_team,next_il_return_date,final_postseason,next_opening_day,team,offseason_activation,pre_od_placement
160,279798,2016-07-19,2016-07-20,SC,Status Change,Texas Rangers,Prince Fielder,425902,texas rangers placed dh prince fielder on the ...,left,...,Prince Fielder,texas rangers activated dh prince fielder from...,2017-02-14,Texas Rangers,2017-10-04,2016-11-02,2017-04-03,Texas Rangers,True,True
227,268297,2016-05-30,2016-06-03,SC,Status Change,New York Mets,David Wright,431151,new york mets placed 3b david wright on the 15...,,...,David Wright,new york mets activated 3b david wright from t...,2017-03-30,New York Mets,2017-11-03,2016-11-02,2017-04-03,New York Mets,True,True
228,302566,2017-03-30,2017-04-02,SC,Status Change,New York Mets,David Wright,431151,new york mets placed 3b david wright on the 10...,,...,David Wright,new york mets activated 3b david wright from t...,2018-03-26,New York Mets,2018-09-25,2017-11-01,2018-03-29,New York Mets,True,True
271,281877,2016-08-01,2016-08-02,SC,Status Change,Los Angeles Angels,Huston Street,434718,los angeles angels placed rhp huston street on...,right,...,Huston Street,los angeles angels activated rhp huston street...,2017-03-30,Los Angeles Angels,2017-06-22,2016-11-02,2017-04-03,Los Angeles Angels,True,True
283,354581,2018-05-19,2018-05-20,SC,Status Change,Washington Nationals,Howie Kendrick,435062,washington nationals placed 2b howie kendrick ...,right,...,Howie Kendrick,washington nationals activated 2b howie kendri...,2019-03-25,Washington Nationals,2019-04-04,2018-10-28,2019-03-28,Washington Nationals,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8100,794034,2024-08-11,2024-08-11,SC,Status Change,Los Angeles Dodgers,River Ryan,689981,los angeles dodgers placed rhp river ryan on t...,right,...,River Ryan,los angeles dodgers activated rhp river ryan f...,2025-02-13,Los Angeles Dodgers,NaT,2024-10-30,2025-03-18,Los Angeles Dodgers,True,True
8114,724410,2023-09-10,2023-09-11,SC,Status Change,New York Yankees,Jasson Domínguez,691176,new york yankees placed cf jasson domínguez on...,right,...,Jasson Domínguez,new york yankees activated cf jasson domínguez...,2024-02-14,New York Yankees,2024-06-12,2023-11-01,2024-03-28,New York Yankees,True,True
8119,757574,2024-03-25,2024-03-28,SC,Status Change,Miami Marlins,Eury Pérez,691587,miami marlins placed rhp eury pérez on the 15-...,right,...,Eury Pérez,miami marlins activated rhp eury pérez from th...,2025-02-20,Miami Marlins,2025-06-09,2024-10-30,2025-03-27,Miami Marlins,True,True
8147,798071,2024-09-06,2024-09-06,SC,Status Change,Los Angeles Dodgers,Gavin Stone,694813,los angeles dodgers placed rhp gavin stone on ...,right,...,Gavin Stone,los angeles dodgers activated rhp gavin stone ...,2025-02-11,Los Angeles Dodgers,NaT,2024-10-30,2025-03-18,Los Angeles Dodgers,True,True


In [1248]:
def offseason_bridge(row):
    if row.offseason_activation & row.pre_od_placement:
        row.trans_id_return = row.next_il_trans_id
        row.return_team = row.next_il_return_tem
        row.effective_date_return = row.next_il_return_date
        row.descr_return = row.next_il_return_descr
        row.type_code_return = row.next_type_code_return
        return row
    else:
        return row

In [1246]:
injury.columns

Index(['trans_id', 'effective_date', 'date', 'typecode', 'typedesc', 'team_x',
       'player', 'player_id', 'descr', 'side', 'body_part', 'injury_type',
       'trans_id_return', 'effective_date_return', 'team_return',
       'player_return', 'descr_return', 'next_il_placement', 'next_il_team',
       'next_il_return_date', 'final_postseason', 'next_opening_day', 'team',
       'offseason_activation', 'pre_od_placement'],
      dtype='object')