# Hypercapnia NEW DATA - RFS Processing

### #############################################################################################

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import psutil

database_dir = r"D:\TriNetX\Diagnosis\\"   # Location where the database files are stored 
#working_dir = r"D:\TriNetX\Diagnosis\"
working_dir = r"C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working\\" #location where to read and right from (faster = better if space allows)
print(working_dir)

C:\Users\reblo\Box\Residency Personal Files\Scholarly Work\Locke Research Projects\TriNetX Code\Hypercapnia TriNetX CSV Processing\Working\\


# RFS: ABG; Processing
### LOINC: 2019-8 - Arterial CO2
### LOINC: 2026-3 - Arterial CO2
### LOINC: 32771-8 - Arterial CO2

In [None]:
### LOINC 2019-8 Arterial CO2
### LOINC 2026-3 Arterial CO2
### LOINC 32771-8 Arterial CO2

num_spreadsheets = 2334
d_type_spec = {"patient_id":str, "encounter_id":str, "code":'category'}

sub_blocks = []
for i in range(1, num_spreadsheets+1):
    print(f'{i:04}')
    lab_results = pd.read_csv(working_dir + "lab_results_NEW_"+f'{i:04}'+".csv", dtype = d_type_spec)
    lab_results= lab_results[lab_results["code"].str.match("^2019-8$|^2026-3$|^32771-8$")]
    lab_results["lab_result_num_val"] = lab_results["lab_result_num_val"].astype('float16') # to save memory; shouldn't ahve any values outside 32k
    lab_results = lab_results.loc[(lab_results["lab_result_num_val"] < 200) & (lab_results["lab_result_num_val"] > 5)]
    sub_blocks.append(lab_results)
    del lab_results

RFS_ABG = pd.concat(sub_blocks, ignore_index = True)
RFS_ABG.to_csv(working_dir +"RFS_ABG.csv", index = False)
display(RFS_ABG.head())
print("\n")
print(RFS_ABG.shape)
print("\n")
print(RFS_ABG["code"].value_counts())

##### Histogram of RFS_ABG Value Spread #####
plt.figure(figsize = (10,10))
plt.hist(RFS_ABG["lab_result_num_val"], bins = 100, facecolor = "blue")
plt.title("Histogram of RFS_ABG", color = "black", fontsize = 12)
plt.xlabel("Value", fontsize = 12)
plt.ylabel("Count", fontsize = 12)
plt.show()

del RFS_ABG


### #############################################################################################

# RFS: VBG; Processing
### LOINC: 115577-6 CO2 in blood
### LOINC: 2021-4 CO2 in venous blood

In [None]:
### LOINC: 115577-6 CO2 in blood
### LOINC: 2021-4 CO2 in venous blood

num_spreadsheets = 2334
d_type_spec = {"patient_id":str, "encounter_id":str, "code":'category'}

sub_blocks = []
for i in range(1, num_spreadsheets+1):
    print(f'{i:04}')
    lab_results = pd.read_csv(working_dir + "lab_results_NEW_"+f'{i:04}'+".csv", dtype = d_type_spec)
    lab_results= lab_results[lab_results["code"].str.match("^11557-6$|^2021-4$")]
    lab_results["lab_result_num_val"] = lab_results["lab_result_num_val"].astype('float16') # to save memory; shouldn't ahve any values outside 32k
    lab_results = lab_results.loc[(lab_results["lab_result_num_val"] < 200) & (lab_results["lab_result_num_val"] > 5)]
    sub_blocks.append(lab_results)
    del lab_results

RFS_VBG = pd.concat(sub_blocks, ignore_index = True)
RFS_VBG.to_csv(working_dir +"RFS_VBG.csv", index = False)
display(RFS_VBG.head())
print("\n")
print(RFS_VBG.shape)
print("\n")
print(RFS_VBG["code"].value_counts())

##### Histogram of RFS_VBG #####
plt.figure(figsize = (10,10))
plt.hist(RFS_VBG["lab_result_num_val"], bins = 100, facecolor = "blue")
plt.title("Histogram of RFS_VBG", color = "black", fontsize = 12)
plt.xlabel("Value", fontsize = 12)
plt.ylabel("Count", fontsize = 12)
plt.show()
del RFS_VBG


### #############################################################################################

# RFS: Resp Failure Dx; Processing
### ICD-10: J96.*
### ICD-10: E66.2 Morbid obesity with alveolar hypoventilation

In [None]:
### ICD-10: J96.* - Any Respiratory failure dx code; e.g. including J96.00, J96.10, J96.92, etc.
### ICD-10: E66.2 - Morbid obesity with alveolar hypoventilation

num_spreadsheets = 1273
d_type_spec={"patient_id":str, 
               "encounter_id":str, 
               "code":'category', 
               "principal_diagnosis_indicator":'category',
               "admitting_diagnosis":'category',
               "reason_for_visit":'category', 
               "date":'category'}

sub_blocks = []
for i in range(1, num_spreadsheets+1):
    print(f'{i:04}')
    diagnosis = pd.read_csv(working_dir + "diagnosis_NEW_"+f'{i:04}'+".csv", 
        dtype = d_type_spec)
    diagnosis = diagnosis[diagnosis["code"].str.match("^J96.*|^E66.2$")]
    sub_blocks.append(diagnosis)
    del diagnosis

RFS_RESPFAIL = pd.concat(sub_blocks, ignore_index = True)
RFS_RESPFAIL.to_csv(working_dir +"RFS_RESPFAIL.csv", index = False)
display(RFS_RESPFAIL.head())
print("\n")
print(RFS_RESPFAIL.shape)
print("\n")
print(RFS_RESPFAIL["code"].value_counts())
del RFS_RESPFAIL


### #############################################################################################

# RFS: Obesity; Processing
### ICD-10: E66.01 - Morbid Obesity due to excess calories
### ICD-10: Z68.41 - BMI 40.0-44.9 dx code
### ICD-10: Z68.42 - BMI 45-49.9 dx code
### LOINC: 39156-5 - Body Mass Index greater than or equal to 40

In [None]:
### ICD-10: E66.01 - Morbid Obesity due to excess calories
### ICD-10: Z68.41 - BMI 40.0-44.9 dx code
### ICD-10: Z68.42 - BMI 45-49.9 dx code
### LOINC: 39156-5 - Body Mass Index greater than or equal to 40

num_dx_spreadsheets = 1273
dx_d_type_spec={"patient_id":str, 
               "encounter_id":str, 
               "code":'category', 
               "principal_diagnosis_indicator":'category',
               "admitting_diagnosis":'category',
               "reason_for_visit":'category', 
               "date":'category'}

num_vs_spreadsheets = 853
vs_d_type_spec={"patient_id":str, 
                "encounter_id":str, 
                "code":'category', 
                "date":'category'}

sub_blocks = []
for i in range(1, num_dx_spreadsheets+1):
    print(f'{i:04}')
    diagnosis = pd.read_csv(working_dir + "diagnosis_NEW_"+f'{i:04}'+".csv", 
        dtype = dx_d_type_spec)
    diagnosis = diagnosis[diagnosis["code"].str.match("^E66.01$|^Z68.41$|^Z68.42$")]
    sub_blocks.append(diagnosis)
    del diagnosis

obesity_dx = pd.concat(sub_blocks, ignore_index = True)
print(obesity_dx["code"].value_counts())
print("\n")

sub_blocks = []
for i in range(1, num_vs_spreadsheets+1):
    print(f'{i:04}')
    vital_signs = pd.read_csv(working_dir + "vital_signs_NEW_"+f'{i:04}'+".csv", 
                              dtype=vs_d_type_spec)
    vital_signs = vital_signs[vital_signs["code"].str.match("^39156-5$")]
    vital_signs['value'] = vital_signs['value'].astype('float16') # to save memory; shouldn't ahve any values outside 32k
    vital_signs = vital_signs.loc[(vital_signs['value'] < 100) & (vital_signs['value'] >= 40)] #only include BMI over 40
    sub_blocks.append(vital_signs)
    del vital_signs

obesity_vitals = pd.concat(sub_blocks, ignore_index = True)
print(obesity_vitals["code"].value_counts())
print("\n")

RFS_OBESITY = pd.concat([obesity_dx, obesity_vitals], ignore_index = True)
RFS_OBESITY.to_csv(working_dir +"RFS_OBESITY.csv", index = False)
display(RFS_OBESITY.head())
print("\n")
print(RFS_OBESITY.shape)
print("\n")
print(RFS_OBESITY["code"].value_counts())



### #############################################################################################

# RFS: Vent Support
### ICD-PCS: 5A09459 - Assistance with respiratory ventilation negative airway pressure
### ICD-PCS: 5A0945B - Assistance with respiratory ventilation intermittent negative airway pressure
### ICD-PCS: 5A09559 - Assistance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A0955B - Asstiance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A09359 - Asstiance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A0935B - Asstiance with respiratory ventilation intermittent neg airway pressure
### ICD-PCS: 5A09358 - Intermittant CPAP 24
### ICD-PCS: 5A09458 - Intermittant CPAP 24-96
### ICD-PCS: 5A09558 - Intermittant CPAP 96+
### ICD-PCS: 5A09357 - Assistance with Respiratory Ventilation, Less than 24 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A09457 - Assistance with Respiratory Ventilation, 24-96 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A09557 - Assistance with Respiratory Ventilation, Greater than 96 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A0935Z - Assistance with respiratory ventilation less than 24 consecutive hours 
### ICD-PCS: 5A0945Z - Assistance with respiratory ventilation 24-96 consecutive hours 
### ICD-PCS: 5A0955Z - Assistance with respiratory ventilation greater than 96 consecutive hours 
### ICD-PCS: 5A1945Z - Respiratory Ventilation
### ICD-PCS: 5A1935Z - Resp Vent <24
### ICD-PCS: 5A1945Z - Resp Vent 24-96
### ICD-PCS: 5A1955Z - Resp Vent 96+
### CPT: 1015098 - Vent Management
### CPT: 1014859 - Vent assist and management
### CPT: 94002 - Vent assist and management, controlled
### CPT: 94003 - Vent assist and management, controlled subsequent
### CPT: 94660 - CPAP initiation and management

In [None]:
# RFS: Vent Support
### ICD-PCS: 5A09459 - Assistance with respiratory ventilation negative airway pressure
### ICD-PCS: 5A0945B - Assistance with respiratory ventilation intermittent negative airway pressure
### ICD-PCS: 5A09559 - Assistance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A0955B - Asstiance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A09359 - Asstiance with respiratory ventilation continuous neg airway pressure
### ICD-PCS: 5A0935B - Asstiance with respiratory ventilation intermittent neg airway pressure
### ICD-PCS: 5A09358 - Intermittant CPAP 24
### ICD-PCS: 5A09458 - Intermittant CPAP 24-96
### ICD-PCS: 5A09558 - Intermittant CPAP 96+
### ICD-PCS: 5A09357 - Assistance with Respiratory Ventilation, Less than 24 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A09457 - Assistance with Respiratory Ventilation, 24-96 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A09557 - Assistance with Respiratory Ventilation, Greater than 96 Consecutive Hours, Continuous Positive Airway Pressure
### ICD-PCS: 5A0935Z - Assistance with respiratory ventilation less than 24 consecutive hours 
### ICD-PCS: 5A0945Z - Assistance with respiratory ventilation 24-96 consecutive hours 
### ICD-PCS: 5A0955Z - Assistance with respiratory ventilation greater than 96 consecutive hours 
### ICD-PCS: 5A1945Z - Respiratory Ventilation
### ICD-PCS: 5A1935Z - Resp Vent <24
### ICD-PCS: 5A1945Z - Resp Vent 24-96
### ICD-PCS: 5A1955Z - Resp Vent 96+
### CPT: 1015098 - Vent Management
### CPT: 1014859 - Vent assist and management
### CPT: 94002 - Vent assist and management, controlled
### CPT: 94003 - Vent assist and management, controlled subsequent
### CPT: 94660 - CPAP initiation and management

num_spreadsheets = 714
d_type_spec = {"patient_id":str, 
               "encounter_id":str, 
               "code":'category', 
               "date":'category'}
match_str = "^5A09459$|^5A0945B$|^5A09559$|^5A0955B$|^5A09359$|^5A0935B$|^5A09358$|^5A09458$|^5A09558$|^5A09357$|^5A09457$|^5A09557$|^5A0935Z$|^5A0945Z$|^5A0955Z$|^5A1945Z$|^5A1935Z$|^5A1945Z$|^5A1955Z$|^1015098$|^1014859$|^94002$|^94003$|^94660$"

sub_blocks = []
for i in range(1, num_spreadsheets+1):
    print(f'{i:04}')
    procedure = pd.read_csv(working_dir + "procedure_NEW_"+f'{i:04}'+".csv", 
        dtype = d_type_spec) 
    procedure = procedure[procedure["code"].str.match(match_str)]
    sub_blocks.append(procedure)
    del procedure

RFS_VENTSUPPORT = pd.concat(sub_blocks, ignore_index = True)
RFS_VENTSUPPORT.to_csv(working_dir +"RFS_VENTSUPPORT.csv", index = False)
print(RFS_VENTSUPPORT.shape)
print("\n")
print(RFS_VENTSUPPORT["code"].value_counts())
del RFS_VENTSUPPORT


### #############################################################################################

# RFS: Predisposition 
### Note: For all diagnoses, include sub-categories (e.g. I27.1* rather than I27.1 only)
### ICD-10: I27.1 - Kyphoscoliotic heart disease
### ICD-10: I27.9 - Pulmonary Heart disease
### ICD-10: I27.81 - Cor Pulmonale (chronic)
### ICD-10: I27.2 - Other secondary pulmonary hypertension
### ICD-10: G47.3 - Sleep Apnea
### ICD-10: G95 - Other and unspecified diseases of spinal cord
### ICD-10: G71 - Primary disorders of muscles
### ICD-10: G35 - Demyelinating diseases of the central nervous system
### ICD-10: G36 - Demyelinating diseases of the central nervous system
### ICD-10: G37 - Demyelinating diseases of the central nervous system
### ICD-10: G70 - Myasthenia Gravis
### ICD-10: G12.21 - Amyotrophic lateral sclerosis
### ICD-10: S14.101 - Unspecified injury at C1 level of cervical spinal cord
### ICD-10: S14.102 - Unspecified injury at C2 level of cervical spinal cord
### ICD-10: S14.103 - Unspecified injury at C3 level of cervical spinal cord
### ICD-10: S14.104 - Unspecified injury at C4 level of cervical spinal cord
### ICD-10: S14.105 - Unspecified injury at C5 level of cervical spinal cord
### ICD-10: S14.106 - Unspecified injury at C6 level of cervical spinal cord
### ICD-10: S14.107 - Unspecified injury at C7 level of cervical spinal cord
### ICD-10: S14.15 - Other incomplete lesions of the spinal cord
### ICD-10: S14.12 - Central cord syndrome of cervical spinal cord
### ICD-10: S14.109 - Unspecified injury at unspecified level of cervical spinal cord
### ICD-10: S14.10 - Unspecified injury at unspecified level of cervical spinal cord
### ICD-10: S14.1 - Other and Unspecified Injuries of the spinal cord
### ICD-10: D75.1 - Secondary Polycythemia
### ICD-10: F11 - Opioid related disorders
### ICD-10: T40 - Poisoning by narcotics and psychedleics
### ICD-10: E84 - Cystic fibrosis
### ICD-10: J45 - Asthma
### ICD-10: J44 - COPD
### ICD-10: J43 - Emphysema
### ICD-10: I50 - Heart failure

In [2]:
### Note: For all diagnoses, include sub-categories (e.g. I27.1* rather than I27.1 only)
### ICD-10: I27.1 - Kyphoscoliotic heart disease
### ICD-10: I27.9 - Pulmonary Heart disease
### ICD-10: I27.81 - Cor Pulmonale (chronic)
### ICD-10: I27.2 - Other secondary pulmonary hypertension
### ICD-10: G47.3 - Sleep Apnea
### ICD-10: G95 - Other and unspecified diseases of spinal cord
### ICD-10: G71 - Primary disorders of muscles
### ICD-10: G35 - Demyelinating diseases of the central nervous system
### ICD-10: G36 - Demyelinating diseases of the central nervous system
### ICD-10: G37 - Demyelinating diseases of the central nervous system
### ICD-10: G70 - Myasthenia Gravis
### ICD-10: G12.21 - Amyotrophic lateral sclerosis
### ICD-10: S14.101 - Unspecified injury at C1 level of cervical spinal cord
### ICD-10: S14.102 - Unspecified injury at C2 level of cervical spinal cord
### ICD-10: S14.103 - Unspecified injury at C3 level of cervical spinal cord
### ICD-10: S14.104 - Unspecified injury at C4 level of cervical spinal cord
### ICD-10: S14.105 - Unspecified injury at C5 level of cervical spinal cord
### ICD-10: S14.106 - Unspecified injury at C6 level of cervical spinal cord
### ICD-10: S14.107 - Unspecified injury at C7 level of cervical spinal cord
### ICD-10: S14.15 - Other incomplete lesions of the spinal cord
### ICD-10: S14.12 - Central cord syndrome of cervical spinal cord
### ICD-10: S14.109 - Unspecified injury at unspecified level of cervical spinal cord
### ICD-10: S14.10 - Unspecified injury at unspecified level of cervical spinal cord
### ICD-10: S14.1 - Other and Unspecified Injuries of the spinal cord
### ICD-10: D75.1 - Secondary Polycythemia
### ICD-10: F11 - Opioid related disorders
### ICD-10: T40 - Poisoning by narcotics and psychedleics
### ICD-10: E84 - Cystic fibrosis
### ICD-10: J45 - Asthma
### ICD-10: J44 - COPD
### ICD-10: J43 - Emphysema
### ICD-10: I50 - Heart failure

num_spreadsheets = 1273

d_type_spec = {"patient_id":str, 
               "encounter_id":str, 
               "code":'category', 
               "date":'category'}

match_str = "I27.1*|I27.9*|I27.81*|I27.2*|G47.3*|G95*|G71*|G35*|G36*|G37*|G70*|G12.21*|S14.101*|S14.102*|S14.103*|S14.104*|S14.105*|S14.106*|S14.107*|S14.15*|S14.12*|S14.109*|S14.10*|S14.1*|D75.1*|F11*|T40*|E84*|J45*|J44*|J43*|I50*"

sub_blocks = []
for i in range(1, num_spreadsheets+1):
    print(f'{i:04}')
    diagnosis = pd.read_csv(working_dir + "diagnosis_NEW_"+f'{i:04}'+".csv", 
        usecols = ["patient_id","encounter_id","code","date"],
        dtype=d_type_spec)
    diagnosis = diagnosis[diagnosis["code"].str.match(match_str)]
    sub_blocks.append(diagnosis)
    del diagnosis

print(f"Available memory: {psutil.virtual_memory().available * 1e-9} GB")
process = psutil.Process()
print(f"Current notebook process memory usage: {process.memory_info().rss * 1e-9} GB")
value_SysBP = pd.concat(sub_blocks, ignore_index = True)
print(f'Memory usage: {value_SysBP.memory_usage(deep=True)}')

RFS_PREDISPOSITION = pd.concat(sub_blocks, ignore_index = True)
RFS_PREDISPOSITION.to_csv(working_dir +"RFS_PREDISPOSITION.csv", index = False)
display(RFS_PREDISPOSITION.head())
print("\n")
print(RFS_PREDISPOSITION.shape)
print("\n")
print(RFS_PREDISPOSITION["code"].value_counts())
del  RFS_PREDISPOSITION


0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


Unnamed: 0,patient_id,encounter_id,code,date
0,DRB,GhL,J41.0,20161005
1,DhB,GxW,F11,20220621
2,DxB,GBh,D75.1,20220513
3,DxB,GBh,D75.1,20220513
4,DxB,GBh,D75.1,20220513




(100495138, 4)


code
G47.33      10892503
J44.9        7947585
J45.909      4503448
I50.9        4190301
G35          3252083
              ...   
T46.994S           1
T48.1X1S           1
T42.8X2S           1
T40.7X4D           1
T42.8X6D           1
Name: count, Length: 2733, dtype: int64


'\nRFS_PREDISPOSITION = pd.DataFrame()\n\n# diagnosis1.csv \ndiagnosis1 = pd.read_csv("/Users/wwaynerichards/Desktop/Practicum/New Data/Current Diagnosis/diagnosis1.csv")\ndiagnosis1 = diagnosis1[diagnosis1["code"].str.contains("I27.1*|I27.9*|I27.81*|I27.2*|G47.3*|G95*|G71*|G35*|G36*|G37*|G70*|G12.21*|S14.101*|S14.102*|S14.103*|S14.104*|S14.105*|S14.106*|S14.107*|S14.15*|S14.12*|S14.109*|S14.10*|S14.1*|D75.1*|F11*|T40*|E84*|J45*|J44*|J43*|I50*")]\nRFS_PREDISPOSITION = pd.concat([RFS_PREDISPOSITION, diagnosis1], ignore_index = True)\ndel diagnosis1\n\n# diagnosis2.csv \ndiagnosis2 = pd.read_csv("/Users/wwaynerichards/Desktop/Practicum/New Data/Current Diagnosis/diagnosis2.csv")\ndiagnosis2 = diagnosis2[diagnosis2["code"].str.contains("I27.1*|I27.9*|I27.81*|I27.2*|G47.3*|G95*|G71*|G35*|G36*|G37*|G70*|G12.21*|S14.101*|S14.102*|S14.103*|S14.104*|S14.105*|S14.106*|S14.107*|S14.15*|S14.12*|S14.109*|S14.10*|S14.1*|D75.1*|F11*|T40*|E84*|J45*|J44*|J43*|I50*")]\nRFS_PREDISPOSITION = pd.concat([R