In [None]:
import numpy as np
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis

# Project helpers from the local `utils` module; they are called before the
# environment is read and before any figure is drawn.
# NOTE(review): presumably load_env_file() populates os.environ from an env
# file and set_mpl_configs() applies shared matplotlib settings — confirm.
load_env_file()
set_mpl_configs()

# Root directory of the input CSV files used by the loading cells.
DATA_DIR = os.getenv('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

# Fail fast: os.getenv returns None for an unset variable, and the loading
# cells would otherwise die later with an opaque TypeError in os.path.join.
if DATA_DIR is None:
    raise RuntimeError(
        'Environment variable DATA_DIR is not set; '
        'it is required to locate the input CSV files.'
    )

In [None]:
# Read the cohort definition file; `data` keeps the whole parsed JSON object.
with open('../data/ibd_demo.json', 'r') as _ibd_file:
    data = json.load(_ibd_file)

# One list per diagnosis group.
# NOTE(review): presumably lists of subject_id values, since they are matched
# against the subject_id index further down — confirm.
both_ibd_patients, only_uc_patients, only_cd_patients = (
    data[_group] for _group in ('both_ibd', 'only_uc', 'only_cd')
)

# Admission length of stay (LOS)

In [None]:
# Load the admission table from <DATA_DIR>/outcome/admission_stay.csv.
_admission_csv = os.path.join(DATA_DIR, 'outcome', 'admission_stay.csv')
df = pd.read_csv(_admission_csv)

In [None]:
# Timestamp columns: admission, discharge, ED registration, ED departure.
_time_columns = ['admittime', 'dischtime', 'edregtime', 'edouttime']

# Parse all of them to pandas datetimes in one column-wise pass.
df[_time_columns] = df[_time_columns].apply(pd.to_datetime)

# anchor_year is left as loaded; an earlier row-by-row conversion of it to
# January-1 timestamps is disabled.

df.head(2)

In [None]:
# For every patient, show the first non-null anchor year, admission time and
# discharge time in row order.
df.groupby('subject_id')[['anchor_year', 'admittime', 'dischtime']].first()

In [None]:
# Admission length of stay: discharge time minus admission time.
df['los_ad'] = df['dischtime'].sub(df['admittime'])
# Duration between ED registration and ED departure.
df['los_dp'] = df['edouttime'].sub(df['edregtime'])

df.head()

In [None]:
# Per-patient summary of both durations: mean, minimum, maximum and the
# number of non-null values. Columns form a (duration, statistic) MultiIndex.
_summaries = ['mean', 'min', 'max', 'count']
stats = df.groupby('subject_id')[['los_ad', 'los_dp']].agg(_summaries)

stats.head()

In [None]:
# Mean admission length of stay per patient, taken from the per-patient summary.
_mean_los = stats[('los_ad', 'mean')].to_numpy()

# Convert to float seconds. Casting to timedelta64[ns] first pins the unit, so
# dividing the integer value by 1e9 stays correct even when pandas stored the
# durations at another resolution (s / ms / us) instead of nanoseconds.
# NOTE(review): a NaT mean turns into a huge negative number after the int64
# cast — confirm there are none, or drop them before this point.
data = _mean_los.astype('timedelta64[ns]').astype(np.int64) / 1e9

# Filter the values with the project helper from utils.
# NOTE(review): presumably keeps only values inside a percentile range — confirm.
data = leave_percentile(data)

In [None]:
# Analyse the distribution of the mean admission LOS values held in `data`.
# NOTE(review): distribution_analysis comes from utils; bins=30 presumably
# sets the number of histogram bins — confirm.
distribution_analysis(data, bins=30)

In [None]:
# Patients whose mean admission LOS exceeds 1e6 seconds (about 11.6 days).
# The mask is derived from `stats` itself rather than from the `data` array:
# `data` has been passed through leave_percentile and carries no index, so its
# length and row order are not guaranteed to line up with `stats`.
_mean_los_seconds = stats[('los_ad', 'mean')].dt.total_seconds()
stats[_mean_los_seconds > 1e6]

In [None]:
# Summary rows of the patients listed in both_ibd_patients.
_in_both_ibd = stats.index.isin(both_ibd_patients)
stats.loc[_in_both_ibd]

# ICU length of stay (LOS)

In [None]:
# Load the ICU stay table from <DATA_DIR>/outcome/icu_stay.csv, keep one row
# per (subject_id, stay_id) pair, then discard rows without a stay_id.
_icu_csv = os.path.join(DATA_DIR, 'outcome', 'icu_stay.csv')
df = (
    pd.read_csv(_icu_csv)
    .drop_duplicates(subset=['subject_id', 'stay_id'])
    .dropna(subset=['stay_id'])
)

df.head()

In [None]:
# One row per patient: the first non-null stay_id, los and dod in row order.
# NOTE(review): only the first ICU stay of each patient is represented here —
# confirm that later stays are meant to be ignored.
stats = df.groupby('subject_id')[['stay_id', 'los', 'dod']].first()

stats.head()

In [None]:
# (rows, columns) of the ICU summary restricted to patients in both_ibd_patients.
_in_both_ibd = stats.index.isin(both_ibd_patients)
stats.loc[_in_both_ibd].shape

In [None]:
# ICU length-of-stay values as a standalone NumPy array (copied, so later
# processing cannot touch `stats`).
data = stats['los'].to_numpy(copy=True)

# The leave_percentile filtering step is currently disabled for the ICU data:
# data = leave_percentile(data)


In [None]:
# Analyse the distribution of the ICU `los` values held in `data`.
# NOTE(review): distribution_analysis comes from utils; bins=20 presumably
# sets the number of histogram bins, and the unit of `los` is not shown here — confirm.
distribution_analysis(data, bins=20)