In [1]:
import os

import pandas as pd

In [10]:
def load_discharge_table(path: str) -> pd.DataFrame:
    """
    Read the discharge-notes CSV at `path` and return it ordered per subject.

    The file is expected to provide the columns
    ['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq', 'charttime', 'storetime'].
    'charttime' and 'storetime' are parsed as datetimes, rows are sorted by
    ('subject_id', 'storetime') and the index is renumbered from 0.
    """
    timestamp_cols = ['charttime', 'storetime']
    sort_keys = ['subject_id', 'storetime']
    return (
        pd.read_csv(path, parse_dates=timestamp_cols)
        .sort_values(sort_keys)
        .reset_index(drop=True)
    )


def build_discharge_windows(discharge_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build one time window per discharge note, grouped by subject.

    Each window runs from the subject's previous note 'storetime'
    (window_start) to the note's own 'storetime' (window_end). A subject's
    first note has no predecessor, so its window_start is set to
    pd.Timestamp.min, i.e. the window is open-ended towards the past.

    Rows are ordered by ('subject_id', 'storetime') inside this function
    before the previous storetime is looked up, so the windows do not depend
    on the caller having sorted the table. Index labels are kept, and the
    input frame is never modified.

    Arguments:
        discharge_df: dataframe with at least ['subject_id', 'hadm_id', 'storetime'];
            'storetime' must be a datetime column.
    Returns:
        DataFrame with columns ['subject_id', 'hadm_id', 'window_start', 'window_end'],
        ordered by subject and storetime.
    """
    # shift(1) only yields "the previous note" when each subject's rows are in
    # chronological order, so enforce that order here (sort_values returns a new frame).
    ordered = discharge_df.sort_values(['subject_id', 'storetime'])

    previous_storetime = ordered.groupby('subject_id')['storetime'].shift(1)

    windows = ordered.assign(
        # First note of a subject: no previous storetime -> earliest representable timestamp
        window_start=previous_storetime.fillna(pd.Timestamp.min),
        window_end=ordered['storetime'],
    )
    return windows[['subject_id', 'hadm_id', 'window_start', 'window_end']]


def extract_rows_in_windows(data_df: pd.DataFrame,
                            discharge_windows: pd.DataFrame,
                            time_col: str) -> pd.DataFrame:
    """
    Generic function to filter rows from data_df that fall within discharge windows.

    A row matches a window of the same subject when
    window_start < row[time_col] <= window_end (start exclusive, end inclusive).
    Matches are emitted subject by subject (in sorted subject order), window by
    window (in the order the windows appear), keeping the row order of data_df.

    Arguments:
        data_df: dataframe with at least ['subject_id', time_col]; not modified.
        discharge_windows: dataframe returned from build_discharge_windows
            (columns ['subject_id', 'hadm_id', 'window_start', 'window_end']).
        time_col: name of timestamp column in data_df; it is converted with
            pd.to_datetime on a copy before comparing.
    Returns:
        DataFrame of rows joined to their corresponding discharge window: all
        columns of data_df plus 'hadm_id_window', 'window_start' and 'window_end',
        with a fresh 0..n-1 index. If nothing matches, an empty frame with those
        columns is returned.
    """
    data_df = data_df.copy()
    data_df[time_col] = pd.to_datetime(data_df[time_col])

    # Group the data once: {subject_id -> positional row numbers}. This replaces a
    # full-frame boolean scan per subject, which cost O(subjects * rows).
    positions_by_subject = data_df.groupby('subject_id').indices

    results = []
    for subj, subj_windows in discharge_windows.groupby('subject_id'):
        positions = positions_by_subject.get(subj)
        if positions is None or len(positions) == 0:
            continue
        subj_data = data_df.iloc[positions]
        times = subj_data[time_col]

        # Iterate only over the three columns needed instead of boxing every row
        window_bounds = zip(subj_windows['hadm_id'],
                            subj_windows['window_start'],
                            subj_windows['window_end'])
        for hadm_id, window_start, window_end in window_bounds:
            mask = (times > window_start) & (times <= window_end)
            if not mask.any():
                continue
            # assign() returns a new frame, so subj_data itself is never altered
            results.append(
                subj_data.loc[mask].assign(
                    hadm_id_window=hadm_id,
                    window_start=window_start,
                    window_end=window_end,
                )
            )

    if results:
        return pd.concat(results, ignore_index=True)
    return pd.DataFrame(columns=list(data_df.columns) + ['hadm_id_window', 'window_start', 'window_end'])


def concat_medications_per_discharge(filtered_df: pd.DataFrame) -> pd.DataFrame:
    """
    Given pharmacy rows already filtered into discharge windows,
    return one row per discharge (hadm_id) with all medications concatenated.

    Medications are de-duplicated in order of first appearance and joined with
    ', '. Missing medication values are skipped rather than rendered as the
    text 'nan'; a discharge whose medications are all missing gets ''.
    Rows whose 'hadm_id_window' is missing are still grouped (dropna=False).

    Arguments:
        filtered_df: dataframe with at least
            ['subject_id', 'hadm_id_window', 'medication'] columns; not modified.
    Returns:
        DataFrame with columns ['subject_id', 'hadm_id', 'medication'], where
        'hadm_id' is the former 'hadm_id_window'.
    """
    def _join_distinct(meds: pd.Series) -> str:
        # Drop missing values first: astype(str) would turn them into the literal 'nan'
        return ', '.join(meds.dropna().astype(str).unique())

    grouped = (
        filtered_df[['subject_id', 'hadm_id_window', 'medication']]
        .groupby(['subject_id', 'hadm_id_window'], dropna=False)
        .agg({'medication': _join_distinct})
        .reset_index()
    )

    # Rename hadm_id_window back to hadm_id
    return grouped.rename(columns={'hadm_id_window': 'hadm_id'})



In [11]:
# Load the discharge notes, sorted by subject_id and storetime
discharge_df = load_discharge_table("data_samples/notes/discharge.csv")

# Build one (window_start, window_end) interval per discharge note
windows = build_discharge_windows(discharge_df)

# Load the table to be windowed (here pharmacy.csv), parsing 'starttime' as datetime
pharmacy_df = pd.read_csv("data_samples/hosp/pharmacy.csv", parse_dates=['starttime'])

# Keep the pharmacy rows whose starttime falls inside a discharge window of the same subject
pharmacy_in_windows = extract_rows_in_windows(pharmacy_df, windows, time_col='starttime')

# Collapse to one row per discharge, with its distinct medications joined by ', '
med_concat_per_discharge = concat_medications_per_discharge(pharmacy_in_windows)

# Last expression of the cell: rendered as the cell output
med_concat_per_discharge


Unnamed: 0,subject_id,hadm_id,medication
0,10000032,22595853,"Furosemide, Ipratropium Bromide Neb, Potassium..."
1,10000032,22841357,"Furosemide, Rifaximin, Sodium Chloride 0.9% F..."
2,10000032,25742920,"Sodium Chloride 0.9% Flush, Calcium Gluconate..."
3,10000032,29079034,"Bisacodyl, Senna, Calcium Carbonate, Raltegrav..."
4,10000084,23052089,"Pramipexole, Pravastatin, rivastigmine, Senna,..."
5,10000117,22927623,"Heparin, Sodium Chloride 0.9% Flush"
6,10000117,27988844,"CeFAZolin, Vitamin D, Acetaminophen IV, Multiv..."
7,10000248,20600184,"OxycoDONE (Immediate Release) , Acetaminophen,..."
8,10000560,28979390,"Acetaminophen, Lorazepam, Metoclopramide, Onda..."
9,10000764,27897940,"Metoprolol Tartrate, Neutra-Phos, Amoxicillin-..."


In [12]:
# Reload the discharge notes from the CSV (rebinds the notebook-level discharge_df)
discharge_df = load_discharge_table("data_samples/notes/discharge.csv")

# Build the discharge windows; the bare call is rendered as the cell output
build_discharge_windows(discharge_df)

Unnamed: 0,subject_id,hadm_id,window_start,window_end
0,10000032,22595853,1677-09-21 00:12:43.145224193,2180-05-09 15:26:00
1,10000032,22841357,2180-05-09 15:26:00.000000000,2180-07-01 10:15:00
2,10000032,29079034,2180-07-01 10:15:00.000000000,2180-07-25 21:42:00
3,10000032,25742920,2180-07-25 21:42:00.000000000,2180-08-10 05:43:00
4,10000084,23052089,1677-09-21 00:12:43.145224193,2160-11-25 15:09:00
...,...,...,...,...
2495,10082560,23284776,2180-02-29 11:29:00.000000000,2180-05-07 14:44:00
2496,10082640,22930426,1677-09-21 00:12:43.145224193,2179-09-03 21:26:00
2497,10082640,20566971,2179-09-03 21:26:00.000000000,2182-05-25 13:41:00
2498,10082649,22738559,1677-09-21 00:12:43.145224193,2150-09-12 08:58:00


## TODO: Descrever como foram feitas as janelas — como foram selecionados os períodos e por quê
## 