In [1]:
import numpy as np
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis

load_env_file()
set_mpl_configs()

DATA_DIR = os.getenv('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

load env file
  root dir:
    /home/k/Repo/IBD-EDA
  current system:
    Linux
  load .env.linux
  loaded data dir:
    /home/k/Nutstore Files/毕设-EHR/DB
done.
set matplotlib configs
  font family:
    ['Times New Roman']
done.
DATA_DIR: /home/k/Nutstore Files/毕设-EHR/DB


In [3]:
with open('../data/ibd_demo.json', 'r') as f:
    data = json.loads(f.read())
    
both_ibd_patients: list = data['both_ibd']
only_uc_patiens: list = data['only_uc']
only_cd_patients: list = data['only_cd']


df = pd.read_csv(os.path.join(DATA_DIR, 'omr', 'ibd_omr.csv'))
df = df[df['result_name'].isin(['Blood Pressure', 'Weight (Lbs)', 'BMI (kg/m2)', 'Height (Inches)'])]

# Blood Pressure

In [7]:
patient_list = both_ibd_patients

sub_df = df[df['subject_id'].isin(patient_list) & (df['result_name'] == 'Blood Pressure')]

sub_df.head()

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
28,10303503,2144-05-09,1,Blood Pressure,106/64
29,10303503,2145-03-23,1,Blood Pressure,120/60
32,10303503,2145-07-19,1,Blood Pressure,130/60
34,10303503,2145-11-20,1,Blood Pressure,110/60
35,10303503,2146-02-27,1,Blood Pressure,109/60


In [8]:
def handle_blood_pressure(result_value: str, low: bool = True) -> (int, int):
    if low:
        return int(result_value.split('/')[1])
    else:
        return int(result_value.split('/')[0])
    

In [9]:
sub_df['low'] = sub_df['result_value'].apply(handle_blood_pressure, low=True)
sub_df['high'] = sub_df['result_value'].apply(handle_blood_pressure, low=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['low'] = sub_df['result_value'].apply(handle_blood_pressure, low=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['high'] = sub_df['result_value'].apply(handle_blood_pressure, low=False)


In [10]:
stats = sub_df.groupby('subject_id').agg({
    'low': ['mean', 'min', 'max'],
    'high': ['mean', 'min', 'max'],
})

stats.head()

Unnamed: 0_level_0,low,low,low,high,high,high
Unnamed: 0_level_1,mean,min,max,mean,min,max
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
10149624,64.904762,48,72,106.047619,96,126
10303503,65.604651,49,80,107.023256,83,130
10767777,60.666667,52,71,106.666667,100,114
10789934,93.0,93,93,139.0,139,139
10814123,73.833333,65,81,122.0,116,130


In [11]:
stats['low']['mean'].describe()

count    68.000000
mean     71.770873
std       7.687824
min      50.000000
25%      66.749554
50%      72.222222
75%      77.143068
max      93.000000
Name: mean, dtype: float64

In [12]:
stats['high']['mean'].describe()

count     68.000000
mean     121.429488
std       12.521672
min       96.166667
25%      112.419580
50%      120.155172
75%      129.812500
max      147.230769
Name: mean, dtype: float64

# BMI

In [13]:
patient_list = only_uc_patiens

sub_df = df[df['subject_id'].isin(patient_list) & (df['result_name'] == 'BMI (kg/m2)')]
sub_df['result_value'] = sub_df['result_value'].apply(float)

sub_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['result_value'] = sub_df['result_value'].apply(float)


Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
81241,10030412,2144-07-06,1,BMI (kg/m2),20.9
81249,10865431,2158-10-03,1,BMI (kg/m2),29.0
81254,10865431,2159-06-12,1,BMI (kg/m2),34.3
81269,13376182,2122-03-10,1,BMI (kg/m2),19.4
81278,13376182,2123-01-12,1,BMI (kg/m2),20.3


In [14]:
stats = sub_df.groupby('subject_id').agg({
    'result_value': ['mean', 'min', 'max']
})

stats.head()

Unnamed: 0_level_0,result_value,result_value,result_value
Unnamed: 0_level_1,mean,min,max
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
10018852,26.0,26.0,26.0
10024331,31.755556,29.3,43.2
10030412,20.9,20.9,20.9
10048262,26.2,24.2,35.0
10056223,32.79697,31.1,34.7


In [15]:
stats['result_value']['mean'].describe()

count    703.000000
mean      27.939979
std       20.079003
min       10.800000
25%       23.139167
50%       26.141667
75%       30.100000
max      522.150000
Name: mean, dtype: float64

# Weight

In [16]:
patient_list = both_ibd_patients + only_cd_patients + only_uc_patiens

sub_df = df[df['subject_id'].isin(patient_list) & (df['result_name'] == 'Weight (Lbs)')]
sub_df['result_value'] = sub_df['result_value'].apply(float)

sub_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['result_value'] = sub_df['result_value'].apply(float)


Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
1,10098672,2140-04-18,1,Weight (Lbs),156.0
5,10098672,2140-11-14,1,Weight (Lbs),135.0
8,10098672,2140-12-01,1,Weight (Lbs),159.0
11,10098672,2141-01-05,1,Weight (Lbs),170.0
15,10098672,2141-04-13,1,Weight (Lbs),164.0


In [17]:
stats = sub_df.groupby('subject_id').agg({
    'result_value': ['mean', 'min', 'max']
})

stats.head()

Unnamed: 0_level_0,result_value,result_value,result_value
Unnamed: 0_level_1,mean,min,max
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
10007174,166.995,164.68,169.31
10018852,178.4,160.0,192.0
10024331,248.115,216.2,310.0
10025647,171.0,167.0,175.0
10027407,185.0,185.0,185.0


In [18]:
stats['result_value']['mean'].describe()

count    1790.000000
mean      167.464320
std        59.333911
min        75.600000
25%       137.152745
50%       160.000000
75%       189.107302
max      1818.361667
Name: mean, dtype: float64

In [19]:
df.subject_id.value_counts()

subject_id
18932584    1508
14876256    1005
18753333     965
10711182     868
16658776     849
            ... 
14235312       1
19204296       1
14814589       1
13043470       1
12070622       1
Name: count, Length: 1856, dtype: int64