In [None]:
import pandas as pd
import joblib
import os

In [None]:

def get_comparison(database='mimic', 
                   has_microbiology=True, 
                   inc_ab=False, 
                   seed=42, 
                   lookback=2, 
                   prediction_time_points = 'random',
                   numberofsamples = 1,
                   sample_train=None, 
                   sample_test=None, 
                   model='LGBMClassifier', 
                   dropout=0.3, 
                   lamb=0.9, 
                   num_relu_layers=1,
                   is_tuned=True):
    
    if prediction_time_points == 'random':
        time_point = ('random', numberofsamples)

    traditional_path = 'data/model_input/traditional/'+database+'/microbiology_res_'+str(has_microbiology)+'/ab_'+str(inc_ab)+'/seed_'+str(seed)+'/'
    traditional_model_path = 'data/results/traditional/'+database+'/microbiology_res_'+str(has_microbiology)+'/ab_'+str(inc_ab)+'/seed_'+str(seed)+ \
                            '/lookback_'+str(lookback)+'/time_point'+str(time_point)+'/sample_'+str(sample_train)+"_"+str(sample_test)+"/"+model+"/"
    
    result_path = 'data/results/combined/'+database+'/microbiology_res_'+str(has_microbiology)+'/ab_'+str(inc_ab)+'/seed_'+str(seed)+'/'+ \
                  '/lookback_'+str(lookback)+'/time_point'+str(time_point)+'/sample_'+str(sample_train)+'_'+str(sample_test)+'/'+model+'/' + \
                  '/dropout_'+str(dropout).replace('.','-')+'/lambda_'+str(lamb).replace('.','-')+'/num_relu_layers_'+str(num_relu_layers)+ \
                  '/is_tuned_'+str(is_tuned)+'/'
    
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    #display(pd.read_parquet('data/model_input/traditional/'+database+'/microbiology_res_'+str(has_microbiology)+'/ab_'+str(inc_ab)+'/seed_'+str(seed)+'/'+'X_test_time_point_'+str(('random',1))+'_lookback_'+str(lookback)+'.parquet'))

    # first we get the complete test dataset for the traditional model
    X_traditional = pd.DataFrame()
    y_traditional = pd.DataFrame()
    for tp in [0,1,2,3,4]:
        X_part = pd.read_parquet(traditional_path+'X_test_time_point_'+str(tp)+'_lookback_'+str(lookback)+'.parquet')
        y_part = pd.read_parquet(traditional_path+'y_test_time_point_'+str(tp)+'_lookback_'+str(lookback)+'.parquet')

        X_traditional = pd.concat([X_traditional, X_part], axis=0, join='outer').fillna(0)
        y_traditional = pd.concat([y_traditional, y_part], axis=0, join='outer').fillna(0)

    #display(X_traditional)
    #display(y_traditional)

    # next we load the traditional model
    # save the trained estimator
    model = joblib.load(traditional_model_path+'model.pkl')

    # calculate test set predictions
    pred_test = pd.DataFrame(model.predict(X_traditional), columns=['pred'])
    pred_proba_test = pd.DataFrame(model.predict_proba(X_traditional), columns=['False','True'])

    #display(pred_test)
    #display(pred_proba_test)

    y_traditional = pd.concat([y_traditional.reset_index(drop=True), pred_test, pred_proba_test], axis=1)
    #display(y_traditional.groupby(['days_past']).count())

    #display(y_traditional.sort_values(['ID','days_past']))
    # get the predicitions from the next day model
    y_next_day = pd.read_csv('data/results/lstm/'+database+'/microbiology_res_'+str(has_microbiology)+'/ab_'+str(inc_ab)+'/seed_'+str(seed)+
                             '/dropout_'+str(dropout).replace('.','-')+'/lambda_'+str(lamb).replace('.','-')+'/num_relu_layers_'+str(num_relu_layers)+
                             '/is_tuned_'+str(is_tuned)+'/test_gt_and_preds.csv')
    
    y_next_day['starttime'] = pd.to_datetime(y_next_day['starttime'])
    y_next_day['starttime'] = y_next_day['starttime'].dt.floor('5min')
    #display(y_next_day.sort_values(['ID','days_past']))

    #display(y_next_day.groupby(['days_past']).count())

    combined = y_traditional.merge(y_next_day, on=['days_past', 'ID', 'starttime'], suffixes=('_traditional', '_lstm'))

    # pred_traditional True => stop therapy
    # pred_traditional False => complete therapy
    # pred_lstm True => complete therapy
    # pred_lstm False => stop therapy
    combined = combined[['ID','starttime','days_past','lot<5d','pred_traditional','pred_lstm','lot_in_days']]

    combined['gt_continue'] = ~combined['lot<5d']

    combined['trad_continue'] = ~combined['pred_traditional']

    combined['next_day_continue'] = combined['pred_lstm'].astype(bool)

    counts = combined.groupby(['ID', 'starttime'])['days_past'].nunique()

    # Filtern Sie die Zeilen, in denen alle fünf Einträge für 'days_past' vorhanden sind
    combined = combined[combined.apply(lambda row: counts.get((row['ID'], row['starttime']), 0) == 5, axis=1)]

    return combined

# Build the comparison table with the function's default arguments
# (database='mimic', seed=42, lookback=2, model='LGBMClassifier', ...).
combined = get_comparison()

In [None]:
# Show the rows whose 'lot_in_days' value is below 5.
combined.loc[combined['lot_in_days'] < 5]

In [None]:
# Toy example for the completeness filter: ID 1 has all five days (0-4),
# ID 2 only has days 0-3.
data = {
    'ID': [1] * 5 + [2] * 4,
    'starttime': ['2023-10-01'] * 5 + ['2023-10-02'] * 4,
    'days_past': [0, 1, 2, 3, 4, 0, 1, 2, 3],
}

df = pd.DataFrame(data)
display(df)

# Number of distinct 'days_past' values per (ID, starttime) group.
counts = df.groupby(['ID', 'starttime'])['days_past'].nunique()

# Attach each row's group count via a join on the key columns, then keep only
# the rows that belong to a group with all five days.
days_per_row = df.join(counts.rename('n_days'), on=['ID', 'starttime'])['n_days']
filtered_df = df[days_per_row == 5]

display(filtered_df)