In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis
from utils_statistics import construct_chi2_test, construct_fisher_test
from utils_complications import batch_query_by_subject_id

load_env_file()
set_mpl_configs()

DATA_DIR = os.getenv('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  load .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.
set matplotlib configs
  font family:
    ['Times New Roman']
done.
DATA_DIR: /Users/k/Nutstore Files/毕设-EHR/DB


In [2]:
df_ibd = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
df_no_ibd = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients_NoIBD.csv'))
df_no_ibd.head(10)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,22595853,1,5723,9,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000032,22595853,2,78959,9,10000032,F,52,2180,2014 - 2016,2180-09-09
2,10000032,22595853,3,5715,9,10000032,F,52,2180,2014 - 2016,2180-09-09
3,10000032,22595853,4,07070,9,10000032,F,52,2180,2014 - 2016,2180-09-09
4,10000032,22595853,5,496,9,10000032,F,52,2180,2014 - 2016,2180-09-09
5,10000032,22595853,6,29680,9,10000032,F,52,2180,2014 - 2016,2180-09-09
6,10000032,22595853,7,30981,9,10000032,F,52,2180,2014 - 2016,2180-09-09
7,10000032,22595853,8,V1582,9,10000032,F,52,2180,2014 - 2016,2180-09-09
8,10000032,22841357,1,07071,9,10000032,F,52,2180,2014 - 2016,2180-09-09
9,10000032,22841357,2,78959,9,10000032,F,52,2180,2014 - 2016,2180-09-09


In [3]:
stats = df_ibd.groupby('subject_id').agg({
    'gender': 'first',
    'anchor_age': 'first',
    'anchor_year': 'first',
    'dod': 'first',
    'seq_num': ['count'],
    # 'icd_code': 'unique'
})

live = stats[stats['dod']['first'].isnull()]
dead = stats[stats['dod']['first'].notnull()]
print(live.shape[0], dead.shape[0])

live_df = batch_query_by_subject_id(_df=df_ibd, _subject_ids=live.index.to_list())
dead_df = batch_query_by_subject_id(_df=df_ibd, _subject_ids=dead.index.to_list())

2037 380


In [4]:
suspect_icd_codes = ['2724', '27651', '2859', '30000', '311', '4019', '53081', '5849', 'V1582']

for suspect_icd_code in suspect_icd_codes:
    print('--> Suspect ICD Code: {}'.format(suspect_icd_code))
    pvalue1 = construct_chi2_test(live_df, dead_df, suspect_icd_code, silent=True)
    pvalue2 = construct_fisher_test(live_df, dead_df, suspect_icd_code, silent=True)

    if pvalue1 <= 0.05 or pvalue2 <= 0.05:
        print('  chi2 test pvalue:\n    {:.8f}'.format(pvalue1))
        print('  fisher test pvalue:\n    {:.8f}'.format(pvalue2))
        pvalue1 = construct_chi2_test(live_df, dead_df, suspect_icd_code, silent=False)
        pvalue2 = construct_fisher_test(live_df, dead_df, suspect_icd_code, silent=False)

    
    # break

--> Suspect ICD Code: 2724
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    329.0000    1708.0000
    162.0000    218.0000
  expected:
    413.8051    1623.1949
    77.1949    302.8051
  chi2: 137.0962
  pvalue: 0.0000
  dof: 1.0000
done.
start fisher test
  observed:
    329.0000    1708.0000
    162.0000    218.0000
  oddsratio: 0.2592
  pvalue: 0.0000
done.
--> Suspect ICD Code: 27651
  chi2 test pvalue:
    0.00000003
  fisher test pvalue:
    0.00000009
start chi2 test
  observed:
    307.0000    1730.0000
    102.0000    278.0000
  expected:
    344.6971    1692.3029
    64.3029    315.6971
  chi2: 30.7318
  pvalue: 0.0000
  dof: 1.0000
done.
start fisher test
  observed:
    307.0000    1730.0000
    102.0000    278.0000
  oddsratio: 0.4837
  pvalue: 0.0000
done.
--> Suspect ICD Code: 2859
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    377.0000    1660.0000
    146.000

In [5]:
not_suspect_icd_code = list(set(df_ibd['icd_code'].unique().tolist()) - set(suspect_icd_codes))[:]

for not_suspect_icd_code in not_suspect_icd_code:
    print('--> Not Suspect ICD Code: {}'.format(not_suspect_icd_code))
    pvalue1 = construct_chi2_test(live_df, dead_df, not_suspect_icd_code, silent=True)
    pvalue2 = construct_fisher_test(live_df, dead_df, not_suspect_icd_code, silent=True)
    
    if pvalue1 <= 0.01 or pvalue2 <= 0.01:
        print('  chi2 test pvalue:\n    {:.8f}'.format(pvalue1))
        print('  fisher test pvalue:\n    {:.8f}'.format(pvalue2))
        
        pvalue1 = construct_chi2_test(live_df, dead_df, not_suspect_icd_code, silent=False)
        pvalue2 = construct_fisher_test(live_df, dead_df, not_suspect_icd_code, silent=False)
    
    # break

--> Not Suspect ICD Code: 24290
  chi2 test pvalue:
    0.00024471
  fisher test pvalue:
    0.00086980
start chi2 test
  observed:
    5.0000    2032.0000
    7.0000    373.0000
  expected:
    10.1134    2026.8866
    1.8866    378.1134
  chi2: 13.4522
  pvalue: 0.0002
  dof: 1.0000
done.
start fisher test
  observed:
    5.0000    2032.0000
    7.0000    373.0000
  oddsratio: 0.1311
  pvalue: 0.0009
done.
--> Not Suspect ICD Code: 43301
--> Not Suspect ICD Code: M62838
--> Not Suspect ICD Code: 42789
  chi2 test pvalue:
    0.00219393
  fisher test pvalue:
    0.00250836
start chi2 test
  observed:
    157.0000    1880.0000
    48.0000    332.0000
  expected:
    172.7700    1864.2300
    32.2300    347.7700
  chi2: 9.3798
  pvalue: 0.0022
  dof: 1.0000
done.
start fisher test
  observed:
    157.0000    1880.0000
    48.0000    332.0000
  oddsratio: 0.5776
  pvalue: 0.0025
done.
--> Not Suspect ICD Code: E1136
--> Not Suspect ICD Code: 73734
--> Not Suspect ICD Code: 9998
--> Not S

# df_ibd and df_no_ibd

In [6]:
suspect_icd_codes = ['2724', '27651', '2859', '30000', '311', '4019', '53081', '5849', 'V1582']

for suspect_icd_code in suspect_icd_codes:
    pvalue1 = construct_chi2_test(df_ibd, df_no_ibd, suspect_icd_code, silent=True)
    pvalue2 = construct_fisher_test(df_ibd, df_no_ibd, suspect_icd_code, silent=True)

    if pvalue1 <= 0.005 or pvalue2 <= 0.05:
        print('--> Suspect ICD Code: {}'.format(suspect_icd_code))
        print('  chi2 test pvalue:\n    {:.8f}'.format(pvalue1))
        print('  fisher test pvalue:\n    {:.8f}'.format(pvalue2))
        
        pvalue1 = construct_chi2_test(df_ibd, df_no_ibd, suspect_icd_code, silent=False)
        pvalue2 = construct_fisher_test(df_ibd, df_no_ibd, suspect_icd_code, silent=False)

    
    # break

--> Suspect ICD Code: 2724
  chi2 test pvalue:
    0.02564857
  fisher test pvalue:
    0.02512313
start chi2 test
  observed:
    491.0000    1926.0000
    33448.0000    147179.0000
  expected:
    448.1467    1968.8533
    33490.8533    147136.1467
  chi2: 4.9796
  pvalue: 0.0256
  dof: 1.0000
done.
start fisher test
  observed:
    491.0000    1926.0000
    33448.0000    147179.0000
  oddsratio: 1.1218
  pvalue: 0.0251
done.
--> Suspect ICD Code: 27651
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    409.0000    2008.0000
    8355.0000    172272.0000
  expected:
    115.7240    2301.2760
    8648.2760    171978.7240
  chi2: 788.3664
  pvalue: 0.0000
  dof: 1.0000
done.
start fisher test
  observed:
    409.0000    2008.0000
    8355.0000    172272.0000
  oddsratio: 4.1998
  pvalue: 0.0000
done.
--> Suspect ICD Code: 2859
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    523.0

In [7]:
not_suspect_icd_code = list(
        set(df_no_ibd['icd_code'].unique().tolist()) | \
            set(df_ibd['icd_code'].unique().tolist()) - \
                set(suspect_icd_codes)
    )[:]

for not_suspect_icd_code in not_suspect_icd_code:
    print('--> Not Suspect ICD Code: {}'.format(not_suspect_icd_code))
    pvalue1 = construct_chi2_test(df_ibd, df_no_ibd, not_suspect_icd_code, silent=True)
    pvalue2 = construct_fisher_test(df_ibd, df_no_ibd, not_suspect_icd_code, silent=True)
    
    if pvalue1 <= 0.01 or pvalue2 <= 0.01:
        print('  chi2 test pvalue:\n    {:.8f}'.format(pvalue1))
        print('  fisher test pvalue:\n    {:.8f}'.format(pvalue2))
        
        pvalue1 = construct_chi2_test(df_ibd, df_no_ibd, not_suspect_icd_code, silent=False)
        pvalue2 = construct_fisher_test(df_ibd, df_no_ibd, not_suspect_icd_code, silent=False)
    
    # break

--> Not Suspect ICD Code: I6622
--> Not Suspect ICD Code: S68623S
--> Not Suspect ICD Code: 24290
--> Not Suspect ICD Code: 43301
--> Not Suspect ICD Code: M62838
--> Not Suspect ICD Code: Q661
--> Not Suspect ICD Code: 42789
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    205.0000    2212.0000
    7404.0000    173223.0000
  expected:
    100.4729    2316.5271
    7508.5271    173118.4729
  chi2: 113.8824
  pvalue: 0.0000
  dof: 1.0000
done.
start fisher test
  observed:
    205.0000    2212.0000
    7404.0000    173223.0000
  oddsratio: 2.1682
  pvalue: 0.0000
done.
--> Not Suspect ICD Code: T82128A
--> Not Suspect ICD Code: V189XXA
--> Not Suspect ICD Code: 36922
--> Not Suspect ICD Code: 24900
  chi2 test pvalue:
    0.00000000
  fisher test pvalue:
    0.00000000
start chi2 test
  observed:
    25.0000    2392.0000
    422.0000    180205.0000
  expected:
    5.9024    2411.0976
    441.0976    180185.9024
  chi2: 59.5278
  pva

KeyboardInterrupt: 