# Generate indicator variables for the disease of interest: RA

In [3]:
import os
import pandas as pd
import numpy as np
import math

In [4]:
# Switch to the project code directory before importing the project-local
# modules below; presumably this is what makes them importable -- confirm.
os.chdir('/temp_project/all_codes')
from ukbb_ldbf import load_data_by_fid

# NOTE(review): search_des, related_vars and mm_gen_ind_list are also defined
# further down in this file; if that cell is executed afterwards, its
# definitions replace the names imported here.
from ukbb import search_des
from ukbb import related_vars
from ukbb import mm_gen_ind_list






In [49]:
# Enter the project code directory and import the field loader used by the
# functions defined in this cell.
os.chdir('/temp_project/all_codes')
from ukbb_ldbf import load_data_by_fid

# Variable-description table searched by search_des() below, which reads its
# 'Description', 'fid', 'Type', 'obs_ct' and 'DC' columns.
df_tab1_i0_comp=pd.read_csv('/temp_project/ukbb/data/i0/ukb22598_i0_comp.csv')

def chk_unique_eid(df):
    """
    Print the number of distinct eid values held by the dataframe 'df'
    """
    distinct_eids = df.eid.unique()
    print('loaded df has unique eid count: ' + str(len(distinct_eids)))

    
    
def search_des(keyword):
    """
    search 'keyword' related variable based on the variable description

    The keyword is lower-cased and matched against the lower-cased
    'Description' column of the module-level table df_tab1_i0_comp, so the
    search is case-insensitive. The keyword is handed to pandas
    Series.str.contains, which interprets it as a regular expression by
    default.

    Returns the matching rows restricted to the columns
    'fid', 'Type', 'obs_ct', 'Description' and 'DC'.
    """
    klow=str(keyword).lower()
    # Helper column with the lower-cased description; it is (re)written on the
    # shared table on every call.
    df_tab1_i0_comp['des']=df_tab1_i0_comp.Description.str.lower()
    # na=False: a row without a description counts as "no match". Without it
    # the mask contains NaN for such rows and boolean indexing raises.
    is_hit=df_tab1_i0_comp.des.str.contains(klow, na=False)
    key_des=df_tab1_i0_comp[is_hit][['fid','Type','obs_ct','Description','DC']]
    return key_des


def related_vars(key_list, dis):
    """
    return a dataframe contains all searched 'keyword' related variable in 'key_list'

    Every keyword in 'key_list' is stripped and looked up with search_des();
    the hits are concatenated and exact duplicate rows are dropped.

    Side effects: the folder <savepath1>/<DIS> ('dis' upper-cased) is created
    if needed, the working directory is changed to it, and the result is
    written there as '<dis>_related_vars_chk.csv'.
    """

    savepath1='/temp_project/ukbb/data/disease/'     ##### CHANGE path if needed
    savepath2=savepath1+str(dis).upper()

    # One call covers both "folder already exists" and "folder missing"
    # (including a missing parent folder, which os.mkdir could not create).
    os.makedirs(savepath2, exist_ok=True)
    # The CSV below is written relative to the working directory.
    os.chdir(savepath2)

    d_lst=[search_des(str(k).strip()) for k in key_list]
    d_comb=pd.concat(d_lst).drop_duplicates()

    print('Searched keyword(s): '+str(key_list)+'\n'+'save '+str(dis)+'_related_vars_chk.csv file at '+str(savepath2))
    filename=str(dis)+'_related_vars_chk.csv'
    d_comb.to_csv(filename, index=None)
    return d_comb




def lst_ind(dfa_list,ind_val):
    """
    return a list of icd code that match with 'ind_val'

    For every cell in 'dfa_list' the result holds one list with the codes of
    that cell that start with 'ind_val' (case-sensitive prefix match). A cell
    is a comma-separated string of codes; a null cell yields an empty list.

    Whitespace around a code is ignored and empty tokens (e.g. from a
    trailing or doubled comma) are skipped, so they are never reported as a
    match. Non-string cells are converted with str() before splitting.

    Raises ValueError when 'ind_val' is empty.
    """
    prefix=str(ind_val)
    if prefix=='':
        # An empty prefix would match every code; reject it explicitly.
        raise ValueError('ind_val must not be empty')

    pre0=[]
    for cell in dfa_list:
        if pd.isnull(cell):
            pre0.append([])
            continue
        codes=(token.strip() for token in str(cell).split(','))
        # prefix is non-empty, so an empty token can never pass startswith().
        pre0.append([code for code in codes if code.startswith(prefix)])
    return pre0


def mm_gen_ind_raw(fid_int,key_code,evnt, detail=False, get_ct=False, ct_only=False):
    """
    Build indicator / count columns for one 'key_code' in the data of field 'fid_int'.

    The field is loaded with load_data_by_fid and its second column is scanned
    with lst_ind. Three columns named '<kind><fid_int>_<evnt><key_code>' are added:
        'fid...'   the matched codes (NaN when nothing matched)
        'ind...'   1 when at least one code matched, else 0
        'count...' number of matched codes

    Which columns are returned:
        ct_only=True             -> 'eid' + count column
        detail=True              -> the full loaded frame with the new columns
        otherwise                -> 'eid' + indicator column
        get_ct=True              -> keep the count column in the two cases above
    Flags are compared with == True / == False; any other value falls through
    and the function returns None.
    """
    frame = load_data_by_fid(fid_int).copy()

    # The second column of the loaded frame holds the comma-separated codes.
    cells = frame[frame.columns[1]].values.tolist()
    hits = lst_ind(cells, str(key_code))

    suffix = str(fid_int) + '_' + str(evnt) + str(key_code)
    fid_col = 'fid' + suffix
    ind_col = 'ind' + suffix
    count_col = 'count' + suffix

    frame[fid_col] = hits
    frame[fid_col] = frame[fid_col].apply(lambda codes: codes if len(codes) != 0 else np.nan)

    frame[ind_col] = hits
    frame[ind_col] = frame[ind_col].apply(lambda codes: 1 if len(codes) != 0 else 0)

    frame[count_col] = hits
    frame[count_col] = frame[count_col].apply(lambda codes: len(codes) if len(codes) != 0 else 0)

    n_matched = frame[fid_col].count()
    n_rows = frame[ind_col].count()
    print(f'fid {fid_int!s} ', f'{evnt!s}{key_code!s} count: {n_matched!s} ind from {n_rows!s}')

    slim = frame[['eid', ind_col, count_col]]

    if ct_only == False:
        chosen = frame if detail == True else slim
        if get_ct == True:
            return chosen
        if get_ct == False:
            return chosen.drop([count_col], axis=1)

    if ct_only == True:
        return slim.drop([ind_col], axis=1)

    
    
        
def mm_gen_ind_list(fid_in, key_code_list, evt, detai=False, get_ct=False, ct_only=False):
    """
    return a dataframe that contains indicator variables for each specific 'key_code' in 'key_code_list'
        use 'detai= True' to get the detail matched codes info
        use 'get_ct=True' to get the count for matched codes
        use 'ct_only=True' to return counts only

    mm_gen_ind_raw is called once per code with the flags forwarded unchanged
    ('detai' is passed as its 'detail' argument); the per-code frames are
    placed side by side and repeated column names (e.g. 'eid') are kept only
    once.

    pd.concat raises ValueError when 'key_code_list' is empty, or when the
    flags are not booleans and mm_gen_ind_raw therefore returns None.
    """
    per_code=[
        mm_gen_ind_raw(fid_in, code, str(evt), detail=detai, get_ct=get_ct, ct_only=ct_only)
        for code in key_code_list
    ]
    dfcl_merge=pd.concat(per_code,axis=1)
    # drop duplicated 'eid' (and, with detai=True, raw data) columns
    return dfcl_merge.loc[:,~dfcl_merge.columns.duplicated()]

In [5]:
## 53 Date info for ukbb
# Load field 53 (per the loader message recorded below: date of attending
# assessment centre).
df53 = load_data_by_fid(53)

# Name the two loaded columns, parse the date and derive the attendance year.
df53.columns=['eid','att_date']
df53['att_date']=pd.to_datetime(df53['att_date'])
df53['att_yr']=df53['att_date'].dt.year

# Save eid, attendance date and attendance year for later use.
df53.to_csv('/temp_project/ukbb/data/dfoi/ukbb_date.csv', index=None)

fid 53 is a single-measure date variable, which is 
Date of attending assessment centre


In [None]:
### identified variables

fid	Type	obs_ct	Description
41202	Categorical (multiple)	392281	Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
41203	Categorical (multiple)	20309	Diagnoses - main ICD9Uses data-coding 87 comprises 13710 String-valued members in a hierarchical tree.
41204	Categorical (multiple)	320437	Diagnoses - secondary ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
41205	Categorical (multiple)	8716	Diagnoses - secondary ICD9Uses data-coding 87 comprises 13710 String-valued members in a hierarchical tree.
20003	Categorical (multiple)	364052	Treatment/medication codeUses data-coding 4 comprises 6745 Integer-valued members in a simple list.
20002	Categorical (multiple)	375111	Non-cancer illness code, self-reportedUses data-coding 6 comprises 474 Integer-valued members in a hierarchical tree.


In [7]:
### Per-participant ICD9/ICD10 code counts for ukbb
### Each 'ct_...' column = number of comma-separated codes in the cell (0 when the cell is missing)
### ICD 10 main (41202)
ukbb_icd10m = load_data_by_fid(fid=41202)
ukbb_icd10m['ct_vec_cam41202i0'] = ukbb_icd10m['vec_cam41202i0'].astype(str).map(
    lambda cell: len(str(cell).split(',')) if cell != 'nan' else 0)
### ICD 10 secondary (41204)
ukbb_icd10s = load_data_by_fid(fid=41204)
ukbb_icd10s['ct_vec_cam41204i0'] = ukbb_icd10s['vec_cam41204i0'].astype(str).map(
    lambda cell: len(str(cell).split(',')) if cell != 'nan' else 0)
### ICD 9 main (41203)
ukbb_icd9m = load_data_by_fid(fid=41203)
ukbb_icd9m['ct_vec_cam41203i0'] = ukbb_icd9m['vec_cam41203i0'].astype(str).map(
    lambda cell: len(str(cell).split(',')) if cell != 'nan' else 0)
### ICD 9 secondary (41205)
ukbb_icd9s = load_data_by_fid(fid=41205)
ukbb_icd9s['ct_vec_cam41205i0'] = ukbb_icd9s['vec_cam41205i0'].astype(str).map(
    lambda cell: len(str(cell).split(',')) if cell != 'nan' else 0)

### Join the four tables on eid, one at a time:
### ICD 10 main + ICD 10 secondary
ukbb_icd10ms = ukbb_icd10m.merge(ukbb_icd10s, on='eid')
### ... + ICD 9 main
ukbb_icd10ms9m = ukbb_icd10ms.merge(ukbb_icd9m, on='eid')
### ... + ICD 9 secondary
ukbb_icd10ms9ms = ukbb_icd10ms9m.merge(ukbb_icd9s, on='eid')
ukbb_icd10ms9ms.count()
ukbb_icd10ms9ms.columns

### Save the RAW per-field counts (all four fields)
ukbb_icd10ms9ms.to_csv('/temp_project/ukbb/data/i0/varOutcome/ukbb_icd10ms9ms_any_icd_raw_count.csv', index=None)


### Save the combined count over all four fields plus a 0/1 "has any ICD code" flag
ukbb_icd10ms9ms_temp = ukbb_icd10ms9ms[['eid']].copy()
ukbb_icd10ms9ms_temp['any_icd_ct'] = ukbb_icd10ms9ms.drop(
    columns=['eid', 'vec_cam41202i0', 'vec_cam41204i0', 'vec_cam41203i0', 'vec_cam41205i0']
).sum(axis=1)
ukbb_icd10ms9ms_temp['any_icd_exist_ind'] = ukbb_icd10ms9ms_temp['any_icd_ct'].map(
    lambda total: 1 if total > 0 else 0)
ukbb_icd10ms9ms_temp.to_csv('/temp_project/ukbb/data/i0/varOutcome/ukbb_icd10ms9ms_any_icd_count.csv', index=None)

fid 41202 is a multiple-measure categorical (multiple) variable
fid 41204 is a multiple-measure categorical (multiple) variable
fid 41203 is a multiple-measure categorical (multiple) variable
fid 41205 is a multiple-measure categorical (multiple) variable


In [19]:
# Tabulate participants with (1) and without (0) any ICD code.
ukbb_icd10ms9ms_temp.any_icd_exist_ind.value_counts()

1    395816
0    106786
Name: any_icd_exist_ind, dtype: int64

In [18]:
# Reload the combined any-ICD count table from the CSV written by the counting cell.
ukbb_icd10ms9ms_temp=pd.read_csv('/temp_project/ukbb/data/i0/varOutcome/ukbb_icd10ms9ms_any_icd_count.csv')

In [21]:
# Split participants by the 0/1 "has any ICD code" flag and save both halves.
with_any_icd = ukbb_icd10ms9ms_temp.loc[ukbb_icd10ms9ms_temp['any_icd_exist_ind'] == 1]
without_any_icd = ukbb_icd10ms9ms_temp.loc[ukbb_icd10ms9ms_temp['any_icd_exist_ind'] == 0]

with_any_icd.to_csv(
    '/temp_project/ukbb/data/i0/varOutcome/with_any_icd.csv', index=None)
without_any_icd.to_csv(
    '/temp_project/ukbb/data/i0/varOutcome/without_any_icd.csv', index=None)

In [6]:
### 0/1 indicator columns per ICD code prefix, one frame per field and disease
### (RA, lupus, PA)
## Field 41202: ICD10 main
df_ind41202_RA = mm_gen_ind_list(fid_in=41202, key_code_list=['M05', 'M06'], evt='RA')
df_ind41202_lupus = mm_gen_ind_list(fid_in=41202, key_code_list=['M321'], evt='lupus')
df_ind41202_PA = mm_gen_ind_list(fid_in=41202, key_code_list=['L405'], evt='PA')

## Field 41204: ICD10 secondary
df_ind41204_RA = mm_gen_ind_list(fid_in=41204, key_code_list=['M05', 'M06'], evt='RA')
df_ind41204_lupus = mm_gen_ind_list(fid_in=41204, key_code_list=['M321'], evt='lupus')
df_ind41204_PA = mm_gen_ind_list(fid_in=41204, key_code_list=['L405'], evt='PA')

## Field 41203: ICD9 main
df_ind41203_RA = mm_gen_ind_list(fid_in=41203, key_code_list=['7140', '7141', '7142'], evt='RA')
df_ind41203_lupus = mm_gen_ind_list(fid_in=41203, key_code_list=['7100'], evt='lupus')
df_ind41203_PA = mm_gen_ind_list(fid_in=41203, key_code_list=['6960'], evt='PA')

## Field 41205: ICD9 secondary
df_ind41205_RA = mm_gen_ind_list(fid_in=41205, key_code_list=['7140', '7141', '7142'], evt='RA')
df_ind41205_lupus = mm_gen_ind_list(fid_in=41205, key_code_list=['7100'], evt='lupus')
df_ind41205_PA = mm_gen_ind_list(fid_in=41205, key_code_list=['6960'], evt='PA')


fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAM05 count: 472 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAM06 count: 2048 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  lupusM321 count: 49 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  PAL405 count: 325 ind from 502602
fid 41204 is a single-measure categorical (multiple) variable, which is 
Diagnoses - secondary ICD10Uses data-c

In [7]:
#### Matched-code counts per ICD code prefix (ct_only=True keeps 'eid' + count columns)
## Field 41202: ICD10 main
ct_ind41202_RA = mm_gen_ind_list(fid_in=41202, key_code_list=['M05', 'M06'], evt='RA', ct_only=True)
ct_ind41202_lupus = mm_gen_ind_list(fid_in=41202, key_code_list=['M321'], evt='lupus', ct_only=True)
ct_ind41202_PA = mm_gen_ind_list(fid_in=41202, key_code_list=['L405'], evt='PA', ct_only=True)

## Field 41204: ICD10 secondary
ct_ind41204_RA = mm_gen_ind_list(fid_in=41204, key_code_list=['M05', 'M06'], evt='RA', ct_only=True)
ct_ind41204_lupus = mm_gen_ind_list(fid_in=41204, key_code_list=['M321'], evt='lupus', ct_only=True)
ct_ind41204_PA = mm_gen_ind_list(fid_in=41204, key_code_list=['L405'], evt='PA', ct_only=True)

## Field 41203: ICD9 main
ct_ind41203_RA = mm_gen_ind_list(fid_in=41203, key_code_list=['7140', '7141', '7142'], evt='RA', ct_only=True)
ct_ind41203_lupus = mm_gen_ind_list(fid_in=41203, key_code_list=['7100'], evt='lupus', ct_only=True)
ct_ind41203_PA = mm_gen_ind_list(fid_in=41203, key_code_list=['6960'], evt='PA', ct_only=True)

## Field 41205: ICD9 secondary
ct_ind41205_RA = mm_gen_ind_list(fid_in=41205, key_code_list=['7140', '7141', '7142'], evt='RA', ct_only=True)
ct_ind41205_lupus = mm_gen_ind_list(fid_in=41205, key_code_list=['7100'], evt='lupus', ct_only=True)
ct_ind41205_PA = mm_gen_ind_list(fid_in=41205, key_code_list=['6960'], evt='PA', ct_only=True)


fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAM05 count: 472 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAM06 count: 2048 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  lupusM321 count: 49 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  PAL405 count: 325 ind from 502602
fid 41204 is a single-measure categorical (multiple) variable, which is 
Diagnoses - secondary ICD10Uses data-c

In [8]:
### One 0/1 indicator per participant for RA, lupus and PA:
### sum every per-field / per-code indicator column, then cap positive sums at 1

df_icd_RA_list = [df_ind41202_RA, df_ind41204_RA, df_ind41203_RA, df_ind41205_RA]
df_icd_lupus_list = [df_ind41202_lupus, df_ind41204_lupus, df_ind41203_lupus, df_ind41205_lupus]
df_icd_PA_list = [df_ind41202_PA, df_ind41204_PA, df_ind41203_PA, df_ind41205_PA]

# RA: frames side by side, repeated 'eid' columns kept once
icd_RA_ind_pre = pd.concat(df_icd_RA_list, axis=1)
icd_RA_ind_pre = icd_RA_ind_pre.loc[:, ~icd_RA_ind_pre.columns.duplicated()]
icd_RA_ind = icd_RA_ind_pre[['eid']].copy()
icd_RA_ind['icd_RA_ind'] = icd_RA_ind_pre.drop(columns='eid').sum(axis=1)
icd_RA_ind['icd_RA_ind'] = icd_RA_ind['icd_RA_ind'].map(lambda total: 1 if total > 0 else total)

# lupus
icd_lupus_ind_pre = pd.concat(df_icd_lupus_list, axis=1)
icd_lupus_ind_pre = icd_lupus_ind_pre.loc[:, ~icd_lupus_ind_pre.columns.duplicated()]
icd_lupus_ind = icd_lupus_ind_pre[['eid']].copy()
icd_lupus_ind['icd_lupus_ind'] = icd_lupus_ind_pre.drop(columns='eid').sum(axis=1)
icd_lupus_ind['icd_lupus_ind'] = icd_lupus_ind['icd_lupus_ind'].map(lambda total: 1 if total > 0 else total)

# PA
icd_PA_ind_pre = pd.concat(df_icd_PA_list, axis=1)
icd_PA_ind_pre = icd_PA_ind_pre.loc[:, ~icd_PA_ind_pre.columns.duplicated()]
icd_PA_ind = icd_PA_ind_pre[['eid']].copy()
icd_PA_ind['icd_PA_ind'] = icd_PA_ind_pre.drop(columns='eid').sum(axis=1)
icd_PA_ind['icd_PA_ind'] = icd_PA_ind['icd_PA_ind'].map(lambda total: 1 if total > 0 else total)

print(f"\nRA freq in ukbb: \n{icd_RA_ind['icd_RA_ind'].value_counts()!s}")
print(f"\nlupus freq in ukbb: \n{icd_lupus_ind['icd_lupus_ind'].value_counts()!s}")
print(f"\nPA freq in ukbb: \n{icd_PA_ind['icd_PA_ind'].value_counts()!s}")



RA freq in ukbb: 
0    497087
1      5515
Name: icd_RA_ind, dtype: int64

lupus freq in ukbb: 
0    502520
1        82
Name: icd_lupus_ind, dtype: int64

PA freq in ukbb: 
0    501752
1       850
Name: icd_PA_ind, dtype: int64


In [9]:
### Total number of matched RA / lupus / PA codes per participant
### (sum of every per-field / per-code count column)

ct_icd_RA_list = [ct_ind41202_RA, ct_ind41204_RA, ct_ind41203_RA, ct_ind41205_RA]
ct_icd_lupus_list = [ct_ind41202_lupus, ct_ind41204_lupus, ct_ind41203_lupus, ct_ind41205_lupus]
ct_icd_PA_list = [ct_ind41202_PA, ct_ind41204_PA, ct_ind41203_PA, ct_ind41205_PA]

# RA: frames side by side, repeated 'eid' columns kept once
ct_icd_RA_ind_pre = pd.concat(ct_icd_RA_list, axis=1)
ct_icd_RA_ind_pre = ct_icd_RA_ind_pre.loc[:, ~ct_icd_RA_ind_pre.columns.duplicated()]
ct_icd_RA_ind = ct_icd_RA_ind_pre[['eid']].copy()
ct_icd_RA_ind['ct_icd_RA_ind'] = ct_icd_RA_ind_pre.drop(columns='eid').sum(axis=1)

# lupus
ct_icd_lupus_ind_pre = pd.concat(ct_icd_lupus_list, axis=1)
ct_icd_lupus_ind_pre = ct_icd_lupus_ind_pre.loc[:, ~ct_icd_lupus_ind_pre.columns.duplicated()]
ct_icd_lupus_ind = ct_icd_lupus_ind_pre[['eid']].copy()
ct_icd_lupus_ind['ct_icd_lupus_ind'] = ct_icd_lupus_ind_pre.drop(columns='eid').sum(axis=1)

# PA
ct_icd_PA_ind_pre = pd.concat(ct_icd_PA_list, axis=1)
ct_icd_PA_ind_pre = ct_icd_PA_ind_pre.loc[:, ~ct_icd_PA_ind_pre.columns.duplicated()]
ct_icd_PA_ind = ct_icd_PA_ind_pre[['eid']].copy()
ct_icd_PA_ind['ct_icd_PA_ind'] = ct_icd_PA_ind_pre.drop(columns='eid').sum(axis=1)

#print('\nRA ct freq in ukbb: \n'+str(ct_icd_RA_ind.ct_icd_RA_ind.value_counts()))
#print('\nlupus ct freq in ukbb: \n'+str(ct_icd_lupus_ind.ct_icd_lupus_ind.value_counts()))
#print('\nPA ct freq in ukbb: \n'+str(ct_icd_PA_ind.ct_icd_PA_ind.value_counts()))


In [10]:
# Pair each disease's 0/1 indicator with its code count (outer join on eid).
ukbb_icd_RA_ind_wct = icd_RA_ind.merge(ct_icd_RA_ind, on='eid', how='outer')
ukbb_icd_lupus_ind_wct = icd_lupus_ind.merge(ct_icd_lupus_ind, on='eid', how='outer')
ukbb_icd_PA_ind_wct = icd_PA_ind.merge(ct_icd_PA_ind, on='eid', how='outer')

In [11]:
# Combine RA, lupus and PA indicator/count tables into one frame keyed by eid.
ukbb_RA_ind_temp1 = ukbb_icd_RA_ind_wct.merge(ukbb_icd_lupus_ind_wct, on='eid', how='outer')
ukbb_RAall_ind = ukbb_RA_ind_temp1.merge(ukbb_icd_PA_ind_wct, on='eid', how='outer')



In [25]:
### Input frame for the RA score: participants with at least one RA code

df_RA_score_temp = ukbb_RAall_ind.loc[ukbb_RAall_ind['ct_icd_RA_ind'] > 0]

### Overall ICD code counts saved by the counting cell
ukbb_any_icd_ct = pd.read_csv('/temp_project/ukbb/data/i0/varOutcome/ukbb_icd10ms9ms_any_icd_count.csv')

### Attach the overall counts to the RA participants (left join keeps every RA row)
df_RA_score_temp2 = df_RA_score_temp.merge(ukbb_any_icd_ct, on='eid', how='left')
with_RA_icd = df_RA_score_temp2.copy()



In [29]:

### dataframe with ct_icd_RA_ind=0
# Participants without any RA code ...
without_RA_icd_temp = ukbb_RAall_ind[ukbb_RAall_ind.ct_icd_RA_ind==0]
# ... minus those who have no ICD code at all.
without_any_icd_eid_list = without_any_icd.eid.tolist()
# Filter the *_temp frame built above; the original line filtered
# `without_RA_icd` itself, which is not defined before this statement.
without_RA_icd=without_RA_icd_temp[~without_RA_icd_temp.eid.isin(without_any_icd_eid_list)]
without_RA_icd.eid.count()


390301

In [30]:
# Save the participants that have ICD codes but no RA code, then show the row count.
without_RA_icd.to_csv('/temp_project/ukbb/data/i0/varOutcome/RA/without_RA_icd.csv', index=None)
without_RA_icd.eid.count()

390301

In [16]:
# Save the participants with at least one RA code (RA score input).
with_RA_icd.to_csv('/temp_project/ukbb/data/i0/varOutcome/RA/with_RA_icd.csv', index=None)

In [17]:
# Preview the first rows of the RA score input frame.
df_RA_score_temp2.head()

Unnamed: 0,eid,icd_RA_ind,ct_icd_RA_ind,icd_lupus_ind,ct_icd_lupus_ind,icd_PA_ind,ct_icd_PA_ind,any_icd_ct,any_icd_exist_ind
0,1001451,1,1,0,0,0,0,57,1
1,1001485,1,1,0,0,0,0,15,1
2,1001564,1,1,0,0,0,0,16,1
3,1001846,1,3,0,0,0,0,35,1
4,1002292,1,11,0,0,0,0,32,1


In [18]:
# Reload the saved RA score input frame from disk.
with_RA_icd=pd.read_csv('/temp_project/ukbb/data/i0/varOutcome/RA/with_RA_icd.csv')

In [19]:
# Non-null count per column of the reloaded frame.
with_RA_icd.count()

eid                  5515
icd_RA_ind           5515
ct_icd_RA_ind        5515
icd_lupus_ind        5515
ct_icd_lupus_ind     5515
icd_PA_ind           5515
ct_icd_PA_ind        5515
any_icd_ct           5515
any_icd_exist_ind    5515
dtype: int64

In [20]:
# Preview the first rows of the reloaded frame.
with_RA_icd.head()

Unnamed: 0,eid,icd_RA_ind,ct_icd_RA_ind,icd_lupus_ind,ct_icd_lupus_ind,icd_PA_ind,ct_icd_PA_ind,any_icd_ct,any_icd_exist_ind
0,1001451,1,1,0,0,0,0,57,1
1,1001485,1,1,0,0,0,0,15,1
2,1001564,1,1,0,0,0,0,16,1
3,1001846,1,3,0,0,0,0,35,1
4,1002292,1,11,0,0,0,0,32,1


In [31]:
### ctrl exclude code:
# Code prefixes that disqualify a participant from the RA control group.
# They are passed to mm_gen_ind_list, whose matching is a case-sensitive
# prefix match on each code.
RAex_icd9_list = ['7200','7202','7208','7209',
                 '7250',
                 '7140','7148','7149','7142','7143',
                 '7141','390','391','392','7144',
                 '7193',
                 '6960',
                 '7100','6954',
                 '7165',
                 '7270']

# 'I00' was written as lower-case 'i00'; every other ICD10 code in this file is
# upper case, so the lower-case prefix could never match.
RAex_icd10_list = ['M45','M461','M4690','M081',
                 'M315','M353',
                 'M05','M06',
                 'M0800','M082','M033','M0840','M089','M088',
                 'M0500',
                 'I00','M120',
                 'M123',
                 'L405',
                 'M070','M073',
                 'M064','M130',
                 'M658','M659']


In [32]:
#### 0/1 indicator columns for every control-exclusion code, per field
## Field 41202: ICD10 main
df_ind41202_RAex = mm_gen_ind_list(fid_in=41202, key_code_list=RAex_icd10_list, evt='RAex')

## Field 41204: ICD10 secondary
df_ind41204_RAex = mm_gen_ind_list(fid_in=41204, key_code_list=RAex_icd10_list, evt='RAex')

## Field 41203: ICD9 main
df_ind41203_RAex = mm_gen_ind_list(fid_in=41203, key_code_list=RAex_icd9_list, evt='RAex')

## Field 41205: ICD9 secondary
df_ind41205_RAex = mm_gen_ind_list(fid_in=41205, key_code_list=RAex_icd9_list, evt='RAex')


fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAexM45 count: 198 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAexM461 count: 91 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAexM4690 count: 7 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-coding 19 comprises 19154 String-valued members in a hierarchical tree.
fid 41202  RAexM081 count: 2 ind from 502602
fid 41202 is a single-measure categorical (multiple) variable, which is 
Diagnoses - main ICD10Uses data-codi

In [33]:
# One 0/1 flag per participant: carries at least one control-exclusion code.
df_icd_RAex_list = [df_ind41202_RAex, df_ind41204_RAex, df_ind41203_RAex, df_ind41205_RAex]
icd_RAex_ind_pre = pd.concat(df_icd_RAex_list, axis=1)
# repeated 'eid' columns are kept once
icd_RAex_ind_pre = icd_RAex_ind_pre.loc[:, ~icd_RAex_ind_pre.columns.duplicated()]
icd_RAex_ind = icd_RAex_ind_pre[['eid']].copy()

# sum of all per-code indicators ...
icd_RAex_ind['icd_RAex_ind'] = icd_RAex_ind_pre.drop(columns='eid').sum(axis=1)

# ... with positive sums capped at 1
icd_RAex_ind['icd_RAex_ind'] = icd_RAex_ind['icd_RAex_ind'].map(lambda total: 1 if total > 0 else total)


In [39]:
# Tabulate participants with (1) and without (0) a control-exclusion code.
icd_RAex_ind.icd_RAex_ind.value_counts()

0    491456
1     11146
Name: icd_RAex_ind, dtype: int64

In [40]:
### eid list of subjects with RA exclusion code
# Used later to remove these subjects from the control set.
ukbb_icd_RAex_eid_list=icd_RAex_ind[icd_RAex_ind.icd_RAex_ind==1].eid.tolist()

In [41]:
### eids of subjects without any ICD code

ukbb_df_without_any_icd_eid_list = ukbb_icd10ms9ms_temp.loc[
    ukbb_icd10ms9ms_temp['any_icd_exist_ind'] == 0, 'eid'].tolist()
print(f'ukbb subjects count without any icd code: {len(ukbb_df_without_any_icd_eid_list)!s}')

### eids of subjects with at least one ICD code

ukbb_df_with_any_icd_eid_list = ukbb_icd10ms9ms_temp.loc[
    ukbb_icd10ms9ms_temp['any_icd_exist_ind'] == 1, 'eid'].tolist()
print(f'ukbb subjects count with any icd code: {len(ukbb_df_with_any_icd_eid_list)!s}')


# Keep only subjects that have at least one ICD code ...
has_no_icd = ukbb_RAall_ind['eid'].isin(ukbb_df_without_any_icd_eid_list)
ukbb_df_with_any_icd = ukbb_RAall_ind[~has_no_icd].copy()
del has_no_icd

# ... and of those, only the ones without a control-exclusion code.
ukbb_df_without_RAex = ukbb_df_with_any_icd[
    ~ukbb_df_with_any_icd['eid'].isin(ukbb_icd_RAex_eid_list)].copy()

print(f"ukbb subjects count with RA icd code: {len(ukbb_RAall_ind[ukbb_RAall_ind['icd_RA_ind'] == 1])!s}")


ukbb subjects count without any icd code: 106786
ukbb subjects count with any icd code: 395816
ukbb subjects count with RA icd code: 5515


In [42]:
# Subjects with at least one ICD code but no RA code.
ukbb_df_with_any_icd_wo_RA = ukbb_df_with_any_icd[ukbb_df_with_any_icd.icd_RA_ind==0]

# Count how many of those subjects carry a control-exclusion code.
ukbb_df_with_any_icd_wo_RA[ukbb_df_with_any_icd_wo_RA.eid.isin(ukbb_icd_RAex_eid_list)].count()
# NOTE(review): hand-typed sanity check. 384670 equals the control row count
# shown further down; 5631 is presumably the count from the line above -- confirm.
384670+5631


390301

In [46]:

### save RA control excluded dataframe

# Controls: subjects with at least one ICD code and no control-exclusion code.
RA_ctrl_eid_list= ukbb_df_without_RAex.eid.tolist()

# Subjects without an RA code that did not make it into the control set.
# (The original line referenced `ctrl_eid_list`, a name that is never defined.)
RA_ctrl_excluded=without_RA_icd[~without_RA_icd.eid.isin(RA_ctrl_eid_list)]
RA_ctrl_excluded.eid.count()

RA_ctrl_excluded.to_csv('/temp_project/ukbb/data/i0/varOutcome/RA/RA_ctrl_excluded.csv', index=None)

In [38]:
### save RA control dataframe
# Controls: subjects with at least one ICD code and no control-exclusion code.

ukbb_df_without_RAex.to_csv('/temp_project/ukbb/data/i0/varOutcome/RA/RA_ctrl.csv', index=None)

In [28]:
# Non-null count per column of the control frame.
ukbb_df_without_RAex.count()

eid                 384670
icd_RA_ind          384670
ct_icd_RA_ind       384670
icd_lupus_ind       384670
ct_icd_lupus_ind    384670
icd_PA_ind          384670
ct_icd_PA_ind       384670
dtype: int64

In [43]:
# NOTE(review): ad-hoc percentage. 502602 matches the cohort size printed by
# the indicator cells; where 9568 and 9702 come from is not shown in this file.
(9568+9702)/502602*100

3.8340476162052677

In [7]:
# Working copy of the RA score input with two constant placeholder columns,
# one set to 1 and one set to 0 for every row.
# NOTE(review): presumably these stand in for a positive / negative lab
# result that is not available here -- confirm.
df_RA_score_temp3= with_RA_icd.copy()
# Scalar assignment broadcasts to every row. The original built the column
# with np.repeat(..., len(unique eids)), which raises a length-mismatch error
# as soon as an eid appears more than once.
df_RA_score_temp3['fake_lab_rf1']=1
df_RA_score_temp3['fake_lab_rf0']=0


In [8]:
# Score terms for the "lab = 1" scenario (fk1): each term is a fixed
# coefficient times log(1 + count), except beta4 which is the coefficient
# times the placeholder lab column.
df_RA_score_temp3['fk1_beta1']=df_RA_score_temp3['ct_icd_RA_ind'].apply(lambda y: 1.937*math.log1p(y))
df_RA_score_temp3['fk1_beta2']=df_RA_score_temp3['ct_icd_lupus_ind'].apply(lambda y: -0.529*math.log1p(y))
df_RA_score_temp3['fk1_beta3']=df_RA_score_temp3['ct_icd_PA_ind'].apply(lambda y: -0.122*math.log1p(y))
df_RA_score_temp3['fk1_beta4']=df_RA_score_temp3['fake_lab_rf1'].apply(lambda y: 1.639*y)
df_RA_score_temp3['fk1_beta5']=df_RA_score_temp3['any_icd_ct'].apply(lambda y: -0.954*math.log1p(y))

# Same terms for the "lab = 0" scenario (fk0); only beta4 uses a different
# input column.
df_RA_score_temp3['fk0_beta1']=df_RA_score_temp3['ct_icd_RA_ind'].apply(lambda y: 1.937*math.log1p(y))
df_RA_score_temp3['fk0_beta2']=df_RA_score_temp3['ct_icd_lupus_ind'].apply(lambda y: -0.529*math.log1p(y))
df_RA_score_temp3['fk0_beta3']=df_RA_score_temp3['ct_icd_PA_ind'].apply(lambda y: -0.122*math.log1p(y))
df_RA_score_temp3['fk0_beta4']=df_RA_score_temp3['fake_lab_rf0'].apply(lambda y: 1.639*y)
df_RA_score_temp3['fk0_beta5']=df_RA_score_temp3['any_icd_ct'].apply(lambda y: -0.954*math.log1p(y))

# Sum of the five terms per scenario.
df_RA_score_temp3['fk1_sumbeta']=df_RA_score_temp3['fk1_beta1']+df_RA_score_temp3['fk1_beta2']+df_RA_score_temp3['fk1_beta3']+df_RA_score_temp3['fk1_beta4']+df_RA_score_temp3['fk1_beta5']
df_RA_score_temp3['fk0_sumbeta']=df_RA_score_temp3['fk0_beta1']+df_RA_score_temp3['fk0_beta2']+df_RA_score_temp3['fk0_beta3']+df_RA_score_temp3['fk0_beta4']+df_RA_score_temp3['fk0_beta5']

# Constant added to the summed terms before the logistic transform.
intercept=-1.017

# Logistic transform exp(x)/(1+exp(x)) of intercept + summed terms, giving a
# score between 0 and 1 per scenario.
df_RA_score_temp3['fk1_score']=df_RA_score_temp3['fk1_sumbeta'].apply(lambda y: math.exp(intercept+y)/(1+math.exp(intercept+y)) )
df_RA_score_temp3['fk0_score']=df_RA_score_temp3['fk0_sumbeta'].apply(lambda y: math.exp(intercept+y)/(1+math.exp(intercept+y)) )

In [9]:
# Rows whose score reaches the 0.632 cut-off, per scenario.
fk1_ukbb_RA_case = df_RA_score_temp3.loc[df_RA_score_temp3['fk1_score'] >= .632]
fk0_ukbb_RA_case = df_RA_score_temp3.loc[df_RA_score_temp3['fk0_score'] >= .632]

# Report the number of distinct participants in each selection.
chk_unique_eid(fk1_ukbb_RA_case)
chk_unique_eid(fk0_ukbb_RA_case)

loaded df has unique eid count: 1301
loaded df has unique eid count: 8


## Code stops here; below is testing code

In [168]:
### load HES data
# All three files are tab-separated and read relative to the HES folder.

os.chdir("/temp_project/HES")

#load ICD main diagnosis
hesin=pd.read_csv('ukb_hesin.tsv',delimiter='\t',encoding='utf-8')

#load ICD secondary diagnosis (ICD9 and ICD10 files)
hesin_icd9s=pd.read_csv('ukb_hesin_diag9_wide_vec.tsv',delimiter='\t',encoding='utf-8')
hesin_icd10s=pd.read_csv('ukb_hesin_diag10_wide_vec.tsv',delimiter='\t',encoding='utf-8')

In [172]:
# One row per distinct eid present in the HES main table.
hes_eid=hesin[['eid']].drop_duplicates()

In [174]:
# Number of distinct HES participants.
hes_eid.count()

eid    395859
dtype: int64

In [175]:
# Outer-join the HES participants with the RA score frame on eid.
# NOTE(review): `df_RA_score` is not defined anywhere in the visible file;
# presumably it came from an earlier notebook session -- confirm before re-running.
pd.merge(hes_eid, df_RA_score, on='eid', how='outer')

Unnamed: 0,eid,icd_RA_ind,ct_icd_RA_ind,icd_lupus_ind,ct_icd_lupus_ind,icd_PA_ind,ct_icd_PA_ind
0,1772719,,,,,,
1,1277767,,,,,,
2,2112547,,,,,,
3,2238569,,,,,,
4,2935846,,,,,,
5,3077649,,,,,,
6,3469366,,,,,,
7,2304115,,,,,,
8,4343497,,,,,,
9,5433058,,,,,,
