In [None]:
import os
import re
# import dill
import json
import pickle

import numpy as np
import pandas as pd
import urllib.request
# import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
from tqdm import tqdm

### 0 load from checkpoint

In [None]:
import gc

dx_cases_all = []
dx_rx_controls_all = []

demo_site_all = []
dx_site_all = []
rx_site_all = []
lab_site_all = []
vital_site_all = []

for site in ['wcm', 'columbia', 'nyu', 'mshs', 'montefiore']:
    print('Load from site', site)
    dx_cases, dx_rx_controls, demo_site, dx_site, rx_site, lab_site,\
        vital_site, merged_cases_compare, merged_controls_dates, \
            comorbidity_scores = pickle.load(open(f'./Middle/{site}_site_resources.pkl', 'rb'))
    print(f'--Include cases from site {site}', len(dx_cases))  
    print(f'--Include controls from site {site}', len(dx_rx_controls))    
  
    dx_cases_all.append(dx_cases)
    dx_rx_controls_all.append(dx_rx_controls)
    
    demo_site_all.append(demo_site) 
    dx_site_all.append(dx_site) 
    rx_site_all.append(rx_site) 
    lab_site_all.append(lab_site) 
    vital_site_all.append(vital_site) 
    

    del dx_cases, dx_rx_controls, demo_site, dx_site, rx_site, lab_site,\
        vital_site, merged_cases_compare, merged_controls_dates, \
            comorbidity_scores
    gc.collect()
    
cases_all = [i for idlist in dx_cases_all for i in idlist]
controls_all = [i for idlist in dx_rx_controls_all for i in idlist]

print('All cases ', len(cases_all), len(set(cases_all)))
print('All control ', len(controls_all), len(set(controls_all)))

In [None]:
demo_site_all_df = pd.concat(demo_site_all, axis=0)
dx_site_all_df = pd.concat(dx_site_all, axis=0)
rx_site_all_df = pd.concat(rx_site_all, axis=0)
lab_site_all_df = pd.concat(lab_site_all, axis=0)
vital_site_all_df = pd.concat(vital_site_all, axis=0)


del demo_site_all, dx_site_all, rx_site_all, lab_site_all, vital_site_all
print('DF for domains demo: ', demo_site_all_df.shape, '\n\t', demo_site_all_df.patid.nunique())
print('DF for domains dx: ' , dx_site_all_df.shape, '\n\t', dx_site_all_df.patid.nunique())
print('DF for domains rx: ', rx_site_all_df.shape, '\n\t', rx_site_all_df.patid.nunique())
print('DF for domains lab: ', lab_site_all_df.shape, '\n\t', lab_site_all_df.patid.nunique())
print('DF for domains vital: ', vital_site_all_df.shape, '\n\t', vital_site_all_df.patid.nunique())
gc.collect()


pickle.dump(demo_site_all_df, open( './MiddleFeatures/demo_site_all_df.pkl', 'wb'))

pickle.dump(dx_site_all_df, open( './MiddleFeatures/dx_site_all_df.pkl', 'wb'))

pickle.dump(rx_site_all_df, open( './MiddleFeatures/rx_site_all_df.pkl', 'wb'))

pickle.dump(lab_site_all_df, open( './MiddleFeatures/lab_site_all_df.pkl', 'wb'))

pickle.dump(vital_site_all_df, open( './MiddleFeatures/vital_site_all_df.pkl', 'wb'))



#### ICD

##### 1.1 using ICD to phecode mappings

In [None]:
## phecode 
to_Phewas = pd.read_csv(f"./icd2phecode.csv", sep = ',', dtype={'ICD': str, 'Phecode': str}).rename(columns={'Phecode': 'phecode'})

ICD9_to_Phewas = to_Phewas[to_Phewas.Flag == 9]
ICD10_to_Phewas = to_Phewas[to_Phewas.Flag == 10]

ICD9_to_Phewas = ICD9_to_Phewas[['ICD', 'phecode']]  
ICD10_to_Phewas = ICD10_to_Phewas[['ICD', 'phecode']]  


ICD9_dict = dict(zip(ICD9_to_Phewas['ICD'], ICD9_to_Phewas['phecode'] ))
ICD10_dict = dict(zip(ICD10_to_Phewas['ICD'], ICD10_to_Phewas['phecode'] ))



# ======
# defining list of ADRD
ADRD_dx_med_codes = pd.read_csv("./ADRD_dx_med_codes.csv")
ADRD_dx_med_codes.loc[(ADRD_dx_med_codes['Description']=="Pick's disease") & (ADRD_dx_med_codes['Code']=="33111"), 'Code'] = '331.11'

ADRD_STRINGS = ["Alzheimer's disease", "Vascular dementia", "Frontotemporal dementia", "Lewy Body Dementia"]
ADRD_STRINGS = '|'.join(ADRD_STRINGS)

ADRD_ICD9 = ADRD_dx_med_codes[(ADRD_dx_med_codes['Code_type'] == 'ICD-9') & ADRD_dx_med_codes['Concept'].str.contains(ADRD_STRINGS)] 
ADRD_ICD10 = ADRD_dx_med_codes[(ADRD_dx_med_codes['Code_type'] == 'ICD-10') & ADRD_dx_med_codes['Concept'].str.contains(ADRD_STRINGS)] 

ADRD_ICD9_phecodes = ADRD_ICD9.merge(ICD9_to_Phewas, left_on = 'Code', right_on = 'ICD', how='left'  )['phecode'].tolist()
ADRD_ICD10_phecodes = ADRD_ICD10.merge(ICD10_to_Phewas, left_on = 'Code', right_on = 'ICD', how='left'  )['phecode'].tolist()

# display(ADRD_ICD10_dict)
print('ADRD_ICD9_phecodes: ', ADRD_ICD9_phecodes)
print('ADRD_ICD10_phecodes: ', ADRD_ICD10_phecodes)


# ======
# defining list of ADRD and other dementia
ADRD_AND_OTHER_CONDITIONS = ADRD_STRINGS +'|'+ '|'.join(["Dementia", "Conditions cause dementia"])

ADRD_AND_OTHER_ICD9 = ADRD_dx_med_codes[(ADRD_dx_med_codes['Code_type'] == 'ICD-9') & ADRD_dx_med_codes['Concept'].str.contains(ADRD_AND_OTHER_CONDITIONS)] 
ADRD_AND_OTHER_ICD10 = ADRD_dx_med_codes[(ADRD_dx_med_codes['Code_type'] == 'ICD-10') & ADRD_dx_med_codes['Concept'].str.contains(ADRD_AND_OTHER_CONDITIONS)] 

ADRD_AND_OTHER_ICD9_PHECODES = ADRD_AND_OTHER_ICD9.merge(ICD9_to_Phewas, left_on = 'Code', right_on = 'ICD', how='left'  )['phecode'].tolist()
ADRD_AND_OTHER_ICD10_PHECODES = ADRD_AND_OTHER_ICD10.merge(ICD10_to_Phewas, left_on = 'Code', right_on = 'ICD', how='left'  )['phecode'].tolist()

print('ADRD_AND_OTHER_ICD9_PHECODES: ', ADRD_AND_OTHER_ICD9_PHECODES)
print('ADRD_AND_OTHER_ICD10_PHECODES: ', ADRD_AND_OTHER_ICD10_PHECODES)



# ======
# defining list of medication rxcui
ANTI_DEMENTIA_RXCUI = ADRD_dx_med_codes[(ADRD_dx_med_codes['Concept'] == "Anti-dementia medications") & (ADRD_dx_med_codes['Code_type'] == 'RXCUI')].Code.reset_index(drop=True)

ANTI_DEMENTIA_RXCUI_list = ANTI_DEMENTIA_RXCUI.tolist()
print('ANTI_DEMENTIA_RXCUI_list:', ANTI_DEMENTIA_RXCUI_list)




In [None]:
## dx to phecode 
dx_enc_df9 = dx_site_all_df[dx_site_all_df.dx_type==9]
dx_enc_df10 = dx_site_all_df[dx_site_all_df.dx_type==10]
print('Get df9 ', dx_enc_df9.shape)
print('Get df10 ', dx_enc_df10.shape)

dx_enc_df9['phecode'] = dx_enc_df9['dx'].map(ICD9_dict ) 
dx_enc_df10['phecode'] = dx_enc_df10['dx'].map(ICD10_dict ) 
print('Map 9 to phecode: ', dx_enc_df9.shape)
print('Map 10 to phecode: ', dx_enc_df10.shape)

print(f"-We have non-na {dx_enc_df9[~dx_enc_df9.phecode.isna()].shape[0]} phecode Rows in dx9")
print(f"-We have non-na {dx_enc_df10[~dx_enc_df10.phecode.isna()].shape[0]} phecode Rows in dx10")


dx_enc_phe = pd.concat([dx_enc_df9, dx_enc_df10], axis=0)
print('--Mapping dx_enc_df to phecode based on ICD9 and ICD10 columns', dx_enc_phe.shape)
print(f"--{round(100*(dx_enc_phe['phecode'].isna().sum() / dx_enc_phe.shape[0]), 2)}% of encounters have finally a null Phecode")
print(f"--We have {dx_enc_phe[~dx_enc_phe.phecode.isna()].phecode.nunique()} unique Phecodes")


print('Before dropping null phecode', dx_enc_phe.shape)
dx_enc_phe = dx_enc_phe[~dx_enc_phe['phecode'].isna()] 
print('Drop null phecode, finally', dx_enc_phe.shape)  

dx_enc_phe = dx_enc_phe.drop_duplicates()
print('Drop duplicates, finally', dx_enc_phe.shape)  

del dx_enc_df9, dx_enc_df10


In [None]:

pickle.dump(dx_enc_phe, open( './MiddleFeatures/processed_dx_enc_phe.pkl', 'wb'))
del dx_enc_phe

In [None]:
# del dx_enc_phe
gc.collect()

### proprocess medication data, RXCUI to ingredient level

In [None]:
## function
def get_med_ingredients_multi_aou(rxcui_list, ohsu_ing_dict, existing_dict):
    NaN_counts = 0
    ohsu_ing_dict = {}
    nan_list = []
    
    for rxcui in tqdm(rxcui_list):
        if rxcui in existing_dict:
            continue
        try:
            rxcui_ingredient = urllib.request.urlopen(f"https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/related.json?tty=IN").read().decode()
            rxcui_ingredient = json.loads(rxcui_ingredient)
            rxcui_ingredient = [item['rxcui'] for item in rxcui_ingredient['relatedGroup']['conceptGroup'][0]['conceptProperties']]
        except:
            ohsu_ing_dict[rxcui] = np.nan
#                 print('nan ing', rxcui)
            NaN_counts+=1
            nan_list.append(rxcui)
            continue
        ohsu_ing_dict[rxcui] = rxcui_ingredient
    return ohsu_ing_dict, NaN_counts


if 'rxcui_ing' not in rx_site_all_df.columns:

    exsiting_ing_dict = pickle.load(open('ing_dict_multi.pkl', 'rb'))

    int_exsiting_ing_dict = {}
    for k,v in exsiting_ing_dict.items():
        try:
            intk = int(k)
        except:
            continue
        
        int_exsiting_ing_dict[intk] =  v

    print('\nExisting ing dict from ohsu and all of us: ', len(exsiting_ing_dict))

    print('--Convert from existing dictionary to int dictionary: ', len(int_exsiting_ing_dict))
    rx_site_all_df['rxnorm_cui'] = rx_site_all_df['rxnorm_cui'].astype(int)

    unique_rxcui = rx_site_all_df.rxnorm_cui.unique().tolist()
    insight_ing_dict = {}

    # print(type(unique_rxcui[0]))
    insight_ing_dict, na_counts = get_med_ingredients_multi_aou(unique_rxcui, insight_ing_dict, int_exsiting_ing_dict)

    print('--get insight_ing dict / New na_counts', len(insight_ing_dict), na_counts)

    insight_ing_dict = insight_ing_dict | int_exsiting_ing_dict
    print('Update to current insight_ing_dict', len(insight_ing_dict))

    rx_site_all_df['rxcui_ing'] = rx_site_all_df['rxnorm_cui'].apply(lambda x: insight_ing_dict[x])
    print('\n--Mapped to rxcui ing df', rx_site_all_df.shape)
    print(f"--{round(100*(rx_site_all_df['rxcui_ing'].isna().sum() / rx_site_all_df.shape[0]), 2)}% of encounters have finally a null rxcui_ing")


    rx_site_all_df = rx_site_all_df.dropna(subset=['rxcui_ing'],inplace=False)
    print('--removing null rxcui ing, now: ', rx_site_all_df.shape)

    rx_site_all_df = rx_site_all_df.explode('rxcui_ing')
    rx_site_all_df['rxcui_ing'] = rx_site_all_df['rxcui_ing'].astype(int)
    print('--exploded to', rx_site_all_df.shape)
    rx_site_all_df = rx_site_all_df.dropna(subset=['rxcui_ing'],inplace=False)
    print('----removing null rxcui ing, now:', rx_site_all_df.shape)
    rx_site_all_df = rx_site_all_df.drop_duplicates()
    print('----removing duplicates, now:', rx_site_all_df.shape)

    print(f"We have {rx_site_all_df.rxcui_ing.nunique()} unique rxcui_ing")

else:
    print('Already having this ingredient column.')


# rx_site_all_df = rx_site_all_df.drop_duplicates()


In [None]:
pickle.dump(rx_site_all_df, open( './MiddleFeatures/processed_rx_ing.pkl', 'wb'))


In [None]:
del rx_site_all_df
gc.collect()

## Lab test data

In [None]:
display(lab_site_all_df.head())
lab_site_all_df.groupby('lab_loinc')['result_unit'].apply(set)


In [None]:
c_codes_df = pd.read_excel('Collected_lab_tests_for_ADRD_evaluation_M_online.xlsx', skiprows=4)
c_codes_df.columns = ['Class',	'Code',	'Name'	,'Acceptable Sample Source'	,'Source Suitable',	'Rangelow',	'Rangehigh',	'Unit of Ranges',	'Range Specs',	'Range Applicable',	'Range reference', ' Corresponding test in paper 1', 'Note']
c_codes = c_codes_df['Code'].tolist() # it is str

print(f'There are {c_codes_df.Class.nunique()} unique lab test names in the codes')
print(f'There are {c_codes_df.Code.nunique()} unique lab test loincs in the codes')

c_codes_df.loc[:, 'Class'] = c_codes_df['Class'].str.lower()
c_codes_df.loc[:, 'Unit of Ranges'] = c_codes_df['Unit of Ranges'].str.lower()
display(c_codes_df.head(1))
# for i in range(c_codes_df.shape[0]):
#     display(c_codes_df[i:i+1])
    

In [None]:

merge_chunks = []
chunk_size = 5000000
removelist = ['ni', 'ot', 'nan']

for start in tqdm(range(0, len(lab_site_all_df), chunk_size)):
    print('Chunk:', start)

    end = min(start + chunk_size, len(lab_site_all_df))
    chunk = lab_site_all_df.iloc[start:end]
    chunk['result_unit'] = chunk['result_unit'].str.lower()
    print('to lower case')

    print('--bef: ', chunk.shape)

    chunk = chunk[~chunk['result_unit'].isin(set(removelist))]
    print('--after removal unselect units: ', chunk.shape)
 
    # chunk = chunk[~chunk['result_unit'].isin(['ni'])]
    # print('--after removal unselect units ni: ', chunk.shape)

    # chunk = chunk[~chunk['result_unit'].isin(['ot'])]
    # print('--after removal unselect units ot: ', chunk.shape)

    # chunk = chunk[~chunk['result_unit'].isin(['nan'])]
    # print('--after removal unselect units nan: ', chunk.shape)

    chunk_lab_merge = chunk.merge(c_codes_df[['Code', 'Class', 'Name', 'Rangelow', 'Rangehigh', 'Unit of Ranges',]], left_on='lab_loinc', right_on = 'Code', how='left').drop('Code', axis=1)

    merge_chunks.append(chunk_lab_merge)
    del chunk

# chunk_lab_merge.head()

In [None]:
lab_merge = pd.concat(merge_chunks, axis=0)
lab_merge.loc[:, 'Class'] = lab_merge['Class'].str.strip().str.lower()
lab_merge = lab_merge.drop(['norm_range_low', 'norm_range_high', 'Name'], axis=1)
pickle.dump(lab_merge, open('MiddleFeatures/lab_merge.pkl', 'wb'))

In [None]:
lab_merge = pickle.load(open('MiddleFeatures/lab_merge.pkl', 'rb'))


In [None]:
# use unit conversion if needed

conversion_factors_dict_update3 = {
# "alanine aminotransferase [alt]": {
#     "[iu]/l": 1, "[u]/l": 1, "iu/l": 1,  "u/l":1, "range_unit": "u/l"
# },
"aspartate aminotransferase [ast]": {
    "[iu]/l": 1, "[u]/l": 1, "iu/l": 1,  "u/l":1,  "range_unit": "u/l"
},
"c reactive protein": {
    "mg/dl": 1, "mg/l": 0.1, "range_unit": "mg/dl"   
},# 0.1 
"free t3, serum": {
      "ng/l": 1,   "pg/ml": 1, "range_unit": "ng/dl"   
}, # no need of unit conversion due to record error
"glucose": {
    "mg/dl": 0.0555, "mmol/l":1, "range_unit": "mmol/l"   
}, # need to convert to mmol/l need to convert to range_unit
"hemoglobin": {
    "g/l": 0.1, "g/dl{calc}": 1, "g/dl": 1, "range_unit": "g/dl"   
}, # check
"high-sensitivity c-reactive protein": {
    "mg/dl": 10, "mg/l": 1, "range_unit": "mg/l"  
}, # seems right conversion
"immature granulocytes, ig": {
    "10*3/ul": 1,  "k/ul": 1,  "10*3/mm3": 1, "/ul": 0.001, "range_unit": "k/ul"  # nL 转换为 k/uL
}, # 

"lymphocytes": {
    "10*3/ul": 1, "10*9/l": 1,  "k/ul": 1,  \
        "k/mm3": 1, "10*3/mm3": 1, "{cells}/ul": 0.001, "/ul": 0.001, "range_unit": "k/ul"  # nL 转换为 k/uL
},# checked
"platelet count": {
    "10*3/ul": 1,  "k/ul" :1, "range_unit": "k/ul"  
},
"rbc count": {
     "10*6/ul": 1, "10*3/mm3": 1, "m/ul": 1, "million/ul":1, "range_unit": "million/ul"  # nL 转换为 million/uL
},# # no need of unit conversion due to record error
"sedimentation rate": {
    "mm/h": 1, "range_unit": "mm/h",
},
"serum albumin": {
    "g/dl{calc}": 1, "g/dl": 1, "ug/mg": 0.0001,  "g/l": 0.1, "range_unit": "g/dl" 
}, # check record wrong  0.0001, 0.01 # only convert part
"serum alkaline phosphatase": {
   "[u]/l": 1, "iu/l": 1, "u/l":1, "range_unit": "u/l"  
},  
# "serum calcium": {
#     "mg/dl": 1, "range_unit": "mg/dl"
# },
"serum calcium, ionized": {
    "mmol/l": 1, "mg/dl": 0.25, "range_unit": "mmol/l"   
}, 
"serum magnesium": {
    "10*-3.eq/l":1.2155, "mg/dl": 1, "range_unit": "mg/dl"   
},
"serum sodium": {
    "mmol/l": 1,               
    "10*-6.eq/l": 1,      
    "mg/dl": 1,                    
    "range_unit": "mmol/l"
}, #no need of unit conversion due to record error
"total white blood cell count": {
    "10*3/ul": 1,   
    "k/ul":1,
    "range_unit": "k/ul"
}, 
"urine urea nitrogen": {
    "g/(24.h)": 1,             
    "mg/dl": 1,             
    "grams/24h": 1,  
    "range_unit": "grams/24h"
}, #no need of unit conversion due to record error
"red cell distribution width":{'%':1}  
}




In [None]:


def extract_gender_specific_range(range_str, gender):
    # if range_str is None:
    #     return np.nan
    # if  '/' in range_str:
    # if isinstance(range_str, str):
    try:
        parts = dict(item.split(':') for item in range_str.split('/'))

        range_f = float(parts.get(gender[0], np.nan))
        return range_f
    except:
        return np.nan
    # else:
    #     return float(range_str)

cfactorlist = set(conversion_factors_dict_update3.keys())

def evaluate_lab_values(df, conversion_factors_dict, gender_context):
    chunk_size = 500000
    
    processed_chunks = []

    for start in tqdm(range(0, len(df), chunk_size)):
        print('------ Chunk', start)


        end = min(start + chunk_size, len(df))
        chunk = df.iloc[start:end] 
        # chunk.loc[:, 'Class'] = chunk['Class'].str.strip()
            
        chunk['cfactor'] = 1.0
        mask = chunk.Class.isin(cfactorlist)
        chunk.loc[mask, 'cfactor'] = chunk.loc[mask].apply(
            lambda row: conversion_factors_dict.get(row['Class'], {}).get(row['result_unit'], 1), axis=1
        )

        chunk['Rangelow'] = chunk['Rangelow'].fillna('').astype(str)
        chunk['Rangelow_f'] = chunk['Rangelow']
        mask = chunk['Rangelow'].str.contains('/', na=False)

        chunk.loc[mask, 'Rangelow_f'] = chunk.loc[mask].apply(
            lambda row: extract_gender_specific_range(row['Rangelow'], gender_context.get(row['patid'])), axis=1
        ) 
        chunk.drop('Rangelow', axis=1, inplace=True)
        chunk['Rangelow_f'] = pd.to_numeric(chunk['Rangelow_f'], errors='coerce')

        chunk['Rangehigh'] = chunk['Rangehigh'].fillna('').astype(str)

        chunk['Rangehigh_f'] = chunk['Rangehigh']
        mask2 = chunk['Rangehigh'].str.contains('/' , na=False)

        chunk.loc[mask2, 'Rangehigh_f'] = chunk.loc[mask2].apply(
            lambda row: extract_gender_specific_range(row['Rangehigh'], gender_context.get(row['patid'])), axis=1
        ) 
        chunk.drop('Rangehigh',axis=1, inplace=True)
        chunk['Rangehigh_f'] = pd.to_numeric(chunk['Rangehigh_f'], errors='coerce')


        chunk['cvalue'] =  np.multiply(chunk['result_num'].values,  chunk['cfactor'].values)
        chunk.dropna(subset='cvalue', inplace=True)
        # display(chunk.dtypes)
        conditions = [
            (chunk['cvalue'] < chunk['Rangelow_f']),
            (chunk['cvalue'] > chunk['Rangehigh_f']),
            ((chunk['cvalue'] >= chunk['Rangelow_f']) & (chunk['cvalue'] <= chunk['Rangehigh_f']))
        ]
        choices = ['ablow', 'abhigh', 'nor']

        chunk['flag'] = np.select(conditions, choices, default=None)

        print('\t\tafter condition matching, nan flag count: ', chunk['flag'].isna().sum())
        
        chunk.loc[chunk['Rangelow_f'].isnull() & (chunk['cvalue'] > chunk['Rangehigh_f']), 'flag'] = 'abhigh'
        chunk.loc[chunk['Rangelow_f'].isnull() & (chunk['cvalue'] <= chunk['Rangehigh_f']), 'flag'] = 'nor'
        chunk.loc[chunk['Rangehigh_f'].isnull() & (chunk['cvalue'] < chunk['Rangelow_f']), 'flag'] = 'ablow'
        chunk.loc[chunk['Rangehigh_f'].isnull() & (chunk['cvalue'] >= chunk['Rangelow_f']), 'flag'] = 'nor'
        
        print('\t\tafter adjustment, nan flag count: ', chunk['flag'].isna().sum())

        chunk = chunk.dropna(subset=['flag'])

        processed_chunks.append(chunk)
        if start % chunk_size == 30:
            print( gc.collect())
    processed_df = pd.concat(processed_chunks, ignore_index=True)
    del processed_chunks
    return processed_df



In [None]:

gender_context = dict(zip(demo_site_all_df['patid'].tolist(), demo_site_all_df['sex'].tolist()))
gender_context = {k:v for k, v in gender_context.items() if v in ['F', 'M']}
print('Female/male context for patient number: ', len(gender_context))

gender_context_ce = {k: ('Female' if v == 'F' else 'Male') for k, v in gender_context.items()}

print('Gender context for patient number: ', len(gender_context_ce), gender_context_ce)



In [None]:

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
processed_lab =  evaluate_lab_values(lab_merge, conversion_factors_dict_update3, gender_context_ce) 



In [None]:

from scipy import stats

processed_lab['z_score'] = processed_lab.groupby(['lab_loinc'])['cvalue'].transform(lambda x: stats.zscore(x, nan_policy='omit'))

processed_lab_z = processed_lab[(processed_lab['z_score'] >= -3) & (processed_lab['z_score'] <= 3)]
del processed_lab
gc.collect()
print('--after removing z-score outliers ', processed_lab_z.shape)

#processed_lab_z = processed_lab_z.drop_duplicates()


In [None]:
pickle.dump(processed_lab_z, open( './MiddleFeatures/processed_lab_flag.pkl', 'wb'))


In [14]:

part_processed_lab = processed_lab_z[['patid', 'lab_loinc', 'specimen_date', 'Class', 'cvalue', 'flag' ]]

befsize = part_processed_lab.shape[0]
part_processed_lab = part_processed_lab.drop_duplicates()
del processed_lab_z

In [15]:
pickle.dump(part_processed_lab, open( './MiddleFeatures/part_processed_lab_flag.pkl', 'wb'))




#### Vital signs

#####  BMI, SBP and SDP

In [None]:
vital_ht_z = vital_site_all_df[~vital_site_all_df['ht'].isna()]
ht_z = stats.zscore(vital_ht_z['ht'] )
vital_ht_z = vital_ht_z[(ht_z >= -3) & (ht_z <= 3)]
print('remove by zscore')

vital_wt_z = vital_site_all_df[~vital_site_all_df['wt'].isna()]
wt_z = stats.zscore(vital_wt_z['wt'] )
vital_wt_z = vital_wt_z[(wt_z >= -3) & (wt_z <= 3)]
print('remove by zscore')


vital_dis_z = vital_site_all_df[~vital_site_all_df['diastolic'].isna()]
dis_z = stats.zscore(vital_dis_z['diastolic'] )
vital_dis_z = vital_dis_z[(dis_z >= -3) & (dis_z <= 3)]
print('remove by zscore')


vital_sys_z = vital_site_all_df[~vital_site_all_df['systolic'].isna()]
sys_z = stats.zscore(vital_sys_z['systolic'] )
vital_sys_z = vital_sys_z[(sys_z >= -3) & (sys_z <= 3)]
print('remove by zscore')

vital_all = pd.concat([vital_ht_z, vital_wt_z, vital_dis_z, vital_sys_z], axis=0)
print('--Ori vital data: ', vital_site_all_df.shape)
vital_all = vital_all.drop_duplicates()

print('--after dropping duplicates and zscore: ', vital_all.shape)
print('\t--dropped duplicates: ', 1 - vital_all.shape[0] / vital_site_all_df.shape[0])

befsize = vital_all.shape[0]
vital_all = vital_all.dropna(subset=['ht', 'wt','diastolic', 'systolic' ], how='all')

print('--after dropping all-nan rows : ', vital_all.shape)
print('\t--dropped duplicates rate: ', 1 - vital_all.shape[0] / befsize)



In [158]:
pickle.dump(vital_all, open( './MiddleFeatures/processed_vital_continuous.pkl', 'wb'))


In [16]:
vital_all = pickle.load( open( './MiddleFeatures/processed_vital_continuous.pkl', 'rb'))


In [19]:
vital_all.describe()

Unnamed: 0,measure_date,ht,wt,diastolic,systolic
count,134628321,24635010.0,31878880.0,78114430.0,78114430.0
mean,2019-08-13 14:52:55.797454848,65.49759,693.5339,73.1731,127.0109
min,1996-03-26 00:00:00,-69.0,-839.0,-267.0,-171.0
25%,2018-02-14 00:00:00,62.8,152.0,65.0,113.0
50%,2020-04-27 00:00:00,65.0,185.8,73.0,125.0
75%,2021-12-14 00:00:00,68.0,255.0,81.0,140.0
max,2023-06-28 00:00:00,989.0,4537.6,9370.0,12076.0
std,,5.092819,1063.803,13.65469,20.39898
