# HES data preprocessing

Add secondary ICD-9/ICD-10 diagnoses to the main HES data (hesin)

Add baseline date from ukbb data

Add visit count for HES data

In [7]:
import os
import pandas as pd
import numpy as np

# The working directory is switched to the project's code folder before the local
# import below; presumably this is what makes `ukbb_ldbf` importable — TODO confirm.
os.chdir('/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/all_codes/')
from ukbb_ldbf import load_data_by_fid

### Add secondary ICD9/10 to main data (hesin) 

In [8]:
os.chdir("/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/HES")

#load ICD main diagnosis
hesin=pd.read_csv('ukb_hesin.tsv',delimiter='\t',encoding='utf-8')

#load ICD secondary diagnosis
hesin_icd9s=pd.read_csv('ukb_hesin_diag9_wide_vec.tsv',delimiter='\t',encoding='utf-8')
hesin_icd10s=pd.read_csv('ukb_hesin_diag10_wide_vec.tsv',delimiter='\t',encoding='utf-8')

%time hesin.head()
print('hesin records count: '+ str(hesin.eid.count()))
print('hesin unique eid count: '+ str(len(hesin.eid.unique())))

%time hesin_icd9s.head()
print('hesin_icd9s records count: '+ str(hesin_icd9s.eid.count()))
print('hesin_icd9s unique eid count: '+ str(len(hesin_icd9s.eid.unique())))

%time hesin_icd10s.head()
print('hesin_icd10s records count: '+ str(hesin_icd10s.eid.count()))
print('hesin_icd10s unique eid count: '+ str(len(hesin_icd10s.eid.unique())))

### merge 9s / 10s to hesin
hesin_add_sec9=pd.merge(hesin, hesin_icd9s, on=['eid','record_id'], how='outer')
hesin_add_sec910=pd.merge(hesin_add_sec9, hesin_icd10s, on=['eid','record_id'], how='outer')
hesin_add_sec910.to_csv('hesin_add_sec910.tsv', index=None, sep='\t')


print('main diag count: '+str(hesin.eid.count()))
print('secondary icd 9 diag count: '+str(hesin_icd9s.eid.count()))
print('secondary icd 10 diag count: '+str(hesin_icd10s.eid.count()))
hesin_icd10s.head()
hesin_icd9s.head()
hesin_add_sec910.columns.tolist()

CPU times: user 1e+03 µs, sys: 0 ns, total: 1e+03 µs
Wall time: 303 µs
hesin records count: 2577597
hesin unique eid count: 395859
CPU times: user 1 ms, sys: 0 ns, total: 1 ms
Wall time: 247 µs
hesin_icd9s records count: 14808
hesin_icd9s unique eid count: 8716
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 146 µs
hesin_icd10s records count: 1694827
hesin_icd10s unique eid count: 320334
main diag count: 2577597
secondary icd 9 diag count: 14808
secondary icd 10 diag count: 1694827


['eid',
 'record_id',
 'admidate',
 'anagest',
 'anasdate',
 'cause_icd10',
 'cause_icd10_nb',
 'diag_icd10',
 'diag_icd10_nb',
 'diag_icd9',
 'diag_icd9_nb',
 'disdate',
 'epiend',
 'epistart',
 'matage',
 'neocare',
 'numbaby',
 'numpreg',
 'opdate',
 'oper4',
 'oper4_nb',
 'operstat',
 'diag_icd9_sec',
 'diag_icd9_nb_sec',
 'diag_icd10_sec',
 'diag_icd10_nb_sec']

### Get date info 


[x] Create date variable 'any_date' for HES: the earliest of the date fields available on each record (any_yr = the year of any_date)

[x] Add baseline date from ukbb data (att_date= the date the subject went to assessment center, att_yr= the year at att_date)

[x] Create year difference variable: diff_yr= any_yr - att_yr


In [30]:
# Show the kernel's current working directory.
pwd()

'/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/ukbb/data/i0/var_dat'

In [10]:
### Create date variable for HES

# Date columns taken from the main HES table.
hes_date_cols = ['admidate', 'anasdate', 'disdate', 'epiend', 'epistart', 'opdate']

hesin_date = hesin[['eid', 'record_id'] + hes_date_cols].copy()
for date_col in hes_date_cols:
    hesin_date[date_col] = pd.to_datetime(hesin_date[date_col])

# any_date = earliest non-missing date on the record; any_yr = its calendar year.
# A record with no date at all keeps a missing any_date (and any_yr).
hesin_date['any_date'] = hesin_date[hes_date_cols].min(axis=1)
hesin_date['any_yr'] = hesin_date['any_date'].dt.year

print('record count: ' + str(hesin_date['eid'].count()))
print('any_date count: ' + str(hesin_date['any_date'].count()))

hesin_date.to_csv('hesin_date.tsv', sep='\t', index=None)

record count: 2577597
any_date count: 2577595


In [12]:
### Add baseline date from ukbb data

# Location of the cached baseline-date table (written, then read back, below).
ukbb_date_path = '/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/ukbb/data/dfoi/ukbb_date.csv'

## 53 Date info for ukbb
df53 = load_data_by_fid(53)
df53.columns = ['eid', 'att_date']
df53['att_date'] = pd.to_datetime(df53['att_date'])
df53['att_yr'] = df53['att_date'].dt.year
df53.to_csv(ukbb_date_path, index=None)

# Read the cached table back; read_csv is not asked to parse dates here,
# so att_date is converted to datetime again.
ukbb_date = pd.read_csv(ukbb_date_path)
ukbb_date['att_date'] = pd.to_datetime(ukbb_date['att_date'])

# Left join: every HES record is kept; att_date/att_yr are missing where the eid
# has no row in ukbb_date.
hesin_ukbb_date = hesin_date.merge(ukbb_date, how='left', on='eid')
hesin_ukbb_date_merge = hesin_ukbb_date.loc[
    :, ['eid', 'record_id', 'any_date', 'any_yr', 'att_date', 'att_yr']
].copy()



Field ID (fid) 53 is a single-measure date variable:
the date of attending the assessment centre.


In [31]:
### reset working directory
# Return to the HES data directory so that later relative file names resolve there.
# NOTE(review): the recorded `pwd()` output earlier in the notebook shows a different
# directory; what changed it is not visible in this notebook — confirm.
os.chdir("/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/HES")

In [13]:
### Create year difference variable: diff_yr= any_yr - att_yr
### classified into 3 categories:
### 1= pre baseline
### 2= at baseline
### 3= post baseline
### A record whose any_yr or att_yr is missing gets NaN for diff_yr and diff_yr_cat.
### The comparison uses calendar years only, not the full dates.

hesin_ukbb_date_merge['diff_yr'] = hesin_ukbb_date_merge['any_yr'] - hesin_ukbb_date_merge['att_yr']

# Vectorized classification. Comparisons against NaN are all False, so rows with a
# missing diff_yr fall through to the NaN default; the column is always float.
_diff_yr = hesin_ukbb_date_merge['diff_yr']
hesin_ukbb_date_merge['diff_yr_cat'] = np.select(
    [_diff_yr < 0, _diff_yr == 0, _diff_yr > 0],
    [1, 2, 3],
    default=np.nan,
)


### Add ICD count info

[x] Add icd exact count variable 'icd_exact_ct_diag_icd9msicd10ms': counts how many exact ICD codes are reported per record (main + secondary, ICD-9 + ICD-10)

[x] Add icd exist indicator variable 'icd_exist_ind_diag_icd9msicd10ms': indicates if record had at least one icd code 


In [14]:
### Add icd exact count variable: count how many icd exact reported per record

def _icd_code_count(cell_text):
    """Return the number of comma-separated entries in a stringified cell.

    The literal string 'nan' (what a missing value becomes after astype(str))
    counts as 0 codes.
    """
    if cell_text == 'nan':
        return 0
    return cell_text.count(',') + 1


hesin_allicd = hesin_add_sec910.copy()

# (source diagnosis column, per-record count column), in creation order.
_icd_count_columns = [
    ('diag_icd10', 'icd_exact_ct_diag_icd10m'),
    ('diag_icd10_sec', 'icd_exact_ct_diag_icd10s'),
    ('diag_icd9', 'icd_exact_ct_diag_icd9m'),
    ('diag_icd9_sec', 'icd_exact_ct_diag_icd9s'),
]
for _src_col, _ct_col in _icd_count_columns:
    hesin_allicd[_ct_col] = hesin_allicd[_src_col].astype(str).apply(_icd_code_count)

# Total number of codes on the record across main/secondary ICD-9/ICD-10.
hesin_allicd['icd_exact_ct_diag_icd9msicd10ms'] = (
    hesin_allicd['icd_exact_ct_diag_icd10m']
    + hesin_allicd['icd_exact_ct_diag_icd10s']
    + hesin_allicd['icd_exact_ct_diag_icd9m']
    + hesin_allicd['icd_exact_ct_diag_icd9s']
)
### Add icd exist indicator variable: indicates if record had at least one icd code
hesin_allicd['icd_exist_ind_diag_icd9msicd10ms'] = (
    hesin_allicd['icd_exact_ct_diag_icd9msicd10ms'].gt(0).astype('int64')
)


In [15]:
# List the columns now present on hesin_allicd (original columns plus the ICD count columns).
hesin_allicd.columns

Index(['eid', 'record_id', 'admidate', 'anagest', 'anasdate', 'cause_icd10',
       'cause_icd10_nb', 'diag_icd10', 'diag_icd10_nb', 'diag_icd9',
       'diag_icd9_nb', 'disdate', 'epiend', 'epistart', 'matage', 'neocare',
       'numbaby', 'numpreg', 'opdate', 'oper4', 'oper4_nb', 'operstat',
       'diag_icd9_sec', 'diag_icd9_nb_sec', 'diag_icd10_sec',
       'diag_icd10_nb_sec', 'icd_exact_ct_diag_icd10m',
       'icd_exact_ct_diag_icd10s', 'icd_exact_ct_diag_icd9m',
       'icd_exact_ct_diag_icd9s', 'icd_exact_ct_diag_icd9msicd10ms',
       'icd_exist_ind_diag_icd9msicd10ms'],
      dtype='object')

In [16]:
# List the columns of the per-record date table.
hesin_ukbb_date_merge.columns

Index(['eid', 'record_id', 'any_date', 'any_yr', 'att_date', 'att_yr',
       'diff_yr', 'diff_yr_cat'],
      dtype='object')

In [17]:
### merge date and count dataframe

# Outer join on the record key (eid, record_id): rows from either side are kept.
hesin_ukbb_date_icd_count_merged = hesin_allicd.merge(
    hesin_ukbb_date_merge,
    how='outer',
    on=['eid', 'record_id'],
)

In [18]:
# Only the last expression of a cell is displayed, so the count on the next line
# is evaluated but not shown; the cell output is the column index.
hesin_ukbb_date_icd_count_merged.eid.count()
hesin_ukbb_date_icd_count_merged.columns

Index(['eid', 'record_id', 'admidate', 'anagest', 'anasdate', 'cause_icd10',
       'cause_icd10_nb', 'diag_icd10', 'diag_icd10_nb', 'diag_icd9',
       'diag_icd9_nb', 'disdate', 'epiend', 'epistart', 'matage', 'neocare',
       'numbaby', 'numpreg', 'opdate', 'oper4', 'oper4_nb', 'operstat',
       'diag_icd9_sec', 'diag_icd9_nb_sec', 'diag_icd10_sec',
       'diag_icd10_nb_sec', 'icd_exact_ct_diag_icd10m',
       'icd_exact_ct_diag_icd10s', 'icd_exact_ct_diag_icd9m',
       'icd_exact_ct_diag_icd9s', 'icd_exact_ct_diag_icd9msicd10ms',
       'icd_exist_ind_diag_icd9msicd10ms', 'any_date', 'any_yr', 'att_date',
       'att_yr', 'diff_yr', 'diff_yr_cat'],
      dtype='object')

In [19]:
# Keep the record key, ICD count and date columns, then drop every record that has
# a missing value in any of them.
_count_and_date_cols = [
    'eid', 'record_id',
    'icd_exact_ct_diag_icd9msicd10ms', 'icd_exist_ind_diag_icd9msicd10ms',
    'any_date', 'any_yr', 'att_date', 'att_yr', 'diff_yr', 'diff_yr_cat',
]
hesin_ukbb_date_icd_count_temp = (
    hesin_ukbb_date_icd_count_merged.loc[:, _count_and_date_cols]
    .dropna()
    .copy()
)

In [20]:
### Overall visit/icd count
# Each summary frame below has one row per eid: the per-record column is summed
# within eid and the eid index is turned back into a column.

### Get count of visit with icd reported in hes: hes_icd_reported_ct
hesin_ukbb_date_icd_count_temp1_to_merge = (
    hesin_ukbb_date_icd_count_temp
    .groupby('eid')['icd_exist_ind_diag_icd9msicd10ms']
    .sum()
    .reset_index()
)
hesin_ukbb_date_icd_count_temp1_to_merge.columns = ['eid', 'hes_icd_reported_ct']

### Get count of icd exact reported in hes: hes_icd_exact_ct
hesin_ukbb_date_icd_count_temp4_to_merge = (
    hesin_ukbb_date_icd_count_temp
    .groupby('eid')['icd_exact_ct_diag_icd9msicd10ms']
    .sum()
    .reset_index()
)
hesin_ukbb_date_icd_count_temp4_to_merge.columns = ['eid', 'hes_icd_exact_ct']

# Combine both per-eid summaries into a single frame keyed on eid.
hesin_ukbb_date_icd_count_temp14_to_merge = pd.merge(
    hesin_ukbb_date_icd_count_temp1_to_merge,
    hesin_ukbb_date_icd_count_temp4_to_merge,
    on=['eid'],
    how='outer',
)


In [21]:
### pre / at / post baseline visit/icd count

def _summarise_icd_counts_by_eid(records, suffix):
    """Sum the per-record ICD indicator and ICD code count within each eid.

    Parameters
    ----------
    records : pandas.DataFrame
        Must contain 'eid', 'icd_exist_ind_diag_icd9msicd10ms' and
        'icd_exact_ct_diag_icd9msicd10ms'.
    suffix : str
        Appended to the output column names (e.g. '_pre').

    Returns
    -------
    tuple of pandas.DataFrame
        (reported, exact, combined), each with one row per eid:
        reported -> ['eid', 'hes_icd_reported_ct' + suffix]
        exact    -> ['eid', 'hes_icd_exact_ct' + suffix]
        combined -> outer merge of the two on 'eid'.
    """
    reported = records.groupby('eid')['icd_exist_ind_diag_icd9msicd10ms'].sum().reset_index()
    reported.columns = ['eid', 'hes_icd_reported_ct' + suffix]

    exact = records.groupby('eid')['icd_exact_ct_diag_icd9msicd10ms'].sum().reset_index()
    exact.columns = ['eid', 'hes_icd_exact_ct' + suffix]

    combined = pd.merge(reported, exact, on=['eid'], how='outer')
    return reported, exact, combined


# Columns carried into each baseline-period subset.
_period_cols = ['eid', 'icd_exact_ct_diag_icd9msicd10ms', 'icd_exist_ind_diag_icd9msicd10ms']

### pre baseline visit/icd count (diff_yr_cat == 1)
hesin_ukbb_date_icd_count_temp0_pre = (
    hesin_ukbb_date_icd_count_temp
    .loc[hesin_ukbb_date_icd_count_temp.diff_yr_cat == 1, _period_cols]
    .sort_values(by=['eid'])
    .copy()
)
(hesin_ukbb_date_icd_count_temp1_pre_to_merge,
 hesin_ukbb_date_icd_count_temp4_pre_to_merge,
 hesin_ukbb_date_icd_count_temp14_pre_to_merge) = _summarise_icd_counts_by_eid(
    hesin_ukbb_date_icd_count_temp0_pre, '_pre')

### at baseline visit/icd count (diff_yr_cat == 2)
hesin_ukbb_date_icd_count_temp0_at = (
    hesin_ukbb_date_icd_count_temp
    .loc[hesin_ukbb_date_icd_count_temp.diff_yr_cat == 2, _period_cols]
    .sort_values(by=['eid'])
    .copy()
)
(hesin_ukbb_date_icd_count_temp1_at_to_merge,
 hesin_ukbb_date_icd_count_temp4_at_to_merge,
 hesin_ukbb_date_icd_count_temp14_at_to_merge) = _summarise_icd_counts_by_eid(
    hesin_ukbb_date_icd_count_temp0_at, '_at')

### post baseline visit/icd count (diff_yr_cat == 3)
hesin_ukbb_date_icd_count_temp0_post = (
    hesin_ukbb_date_icd_count_temp
    .loc[hesin_ukbb_date_icd_count_temp.diff_yr_cat == 3, _period_cols]
    .sort_values(by=['eid'])
    .copy()
)
(hesin_ukbb_date_icd_count_temp1_post_to_merge,
 hesin_ukbb_date_icd_count_temp4_post_to_merge,
 hesin_ukbb_date_icd_count_temp14_post_to_merge) = _summarise_icd_counts_by_eid(
    hesin_ukbb_date_icd_count_temp0_post, '_post')


In [22]:
### Add icd count detail: overall + pre/at/post baseline  icd reported/exact count
# Start from the (eid, record_id) pairs, then attach each per-eid summary in turn;
# every record of an eid therefore carries that eid's totals.
hesin_ukbb_date_icd_count_temp_merge1 = hesin_ukbb_date_icd_count_temp.loc[:, ['eid', 'record_id']].copy()

hesin_ukbb_date_icd_count_temp_merge2 = hesin_ukbb_date_icd_count_temp_merge1.merge(
    hesin_ukbb_date_icd_count_temp14_to_merge, how='outer', on=['eid'])
hesin_ukbb_date_icd_count_temp_merge3 = hesin_ukbb_date_icd_count_temp_merge2.merge(
    hesin_ukbb_date_icd_count_temp14_pre_to_merge, how='outer', on=['eid'])
hesin_ukbb_date_icd_count_temp_merge4 = hesin_ukbb_date_icd_count_temp_merge3.merge(
    hesin_ukbb_date_icd_count_temp14_at_to_merge, how='outer', on=['eid'])
hesin_ukbb_date_icd_count_temp_merge5 = hesin_ukbb_date_icd_count_temp_merge4.merge(
    hesin_ukbb_date_icd_count_temp14_post_to_merge, how='outer', on=['eid'])


In [23]:
# Attach the per-eid count summaries to the full per-record table, matching on the
# record key; records absent from the summary table keep missing count columns.
hesin_ukbb_date_icd_count_ready = hesin_ukbb_date_icd_count_merged.merge(
    hesin_ukbb_date_icd_count_temp_merge5,
    how='outer',
    on=['eid', 'record_id'],
)

In [24]:
# List the columns of the final table.
hesin_ukbb_date_icd_count_ready.columns

Index(['eid', 'record_id', 'admidate', 'anagest', 'anasdate', 'cause_icd10',
       'cause_icd10_nb', 'diag_icd10', 'diag_icd10_nb', 'diag_icd9',
       'diag_icd9_nb', 'disdate', 'epiend', 'epistart', 'matage', 'neocare',
       'numbaby', 'numpreg', 'opdate', 'oper4', 'oper4_nb', 'operstat',
       'diag_icd9_sec', 'diag_icd9_nb_sec', 'diag_icd10_sec',
       'diag_icd10_nb_sec', 'icd_exact_ct_diag_icd10m',
       'icd_exact_ct_diag_icd10s', 'icd_exact_ct_diag_icd9m',
       'icd_exact_ct_diag_icd9s', 'icd_exact_ct_diag_icd9msicd10ms',
       'icd_exist_ind_diag_icd9msicd10ms', 'any_date', 'any_yr', 'att_date',
       'att_yr', 'diff_yr', 'diff_yr_cat', 'hes_icd_reported_ct',
       'hes_icd_exact_ct', 'hes_icd_reported_ct_pre', 'hes_icd_exact_ct_pre',
       'hes_icd_reported_ct_at', 'hes_icd_exact_ct_at',
       'hes_icd_reported_ct_post', 'hes_icd_exact_ct_post'],
      dtype='object')

In [25]:
# Preview the first rows of the final table.
hesin_ukbb_date_icd_count_ready.head()

Unnamed: 0,eid,record_id,admidate,anagest,anasdate,cause_icd10,cause_icd10_nb,diag_icd10,diag_icd10_nb,diag_icd9,...,diff_yr,diff_yr_cat,hes_icd_reported_ct,hes_icd_exact_ct,hes_icd_reported_ct_pre,hes_icd_exact_ct_pre,hes_icd_reported_ct_at,hes_icd_exact_ct_at,hes_icd_reported_ct_post,hes_icd_exact_ct_post
0,1772719,1071463,2003-05-15,,,,,R198,,,...,-6.0,1.0,3.0,5.0,3.0,5.0,,,,
1,1772719,1077874,2003-06-05,,,,,R104,,,...,-6.0,1.0,3.0,5.0,3.0,5.0,,,,
2,1772719,1127881,2000-05-01,,,,,M512,,,...,-9.0,1.0,3.0,5.0,3.0,5.0,,,,
3,1277767,3208109,,,,,,M8414,,,...,-3.0,1.0,2.0,2.0,2.0,2.0,,,,
4,1277767,4218015,,,,,,M8414,,,...,-3.0,1.0,2.0,2.0,2.0,2.0,,,,


In [29]:
### save preprocessed hesin data with ukbb baseline and icd count info

# Written as a tab-separated file to an absolute path, so the current working
# directory does not matter here.
ready_tsv_path = '/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/HES/hesin_ukbb_date_icd_count_ready.tsv'
hesin_ukbb_date_icd_count_ready.to_csv(ready_tsv_path, sep='\t', index=None)

In [32]:
# Show the kernel's current working directory (the reads below use relative file names).
pwd()

'/projects/ps-janssen3/dsci-pa/yhuan162/temp_project/HES'

In [35]:
# Load the tab-separated HES source tables for a sanity check, in this order:
# chk1 = hesin, chk2 = diag9, chk3 = diag10, chk4 = oper, chk5 = birth.
chk1, chk2, chk3, chk4, chk5 = (
    pd.read_csv(tsv_name, sep='\t')
    for tsv_name in (
        'ukb_hesin.tsv',
        'ukb_hesin_diag9.tsv',
        'ukb_hesin_diag10.tsv',
        'ukb_hesin_oper.tsv',
        'ukb_hesin_birth.tsv',
    )
)

In [45]:
# Number of distinct eids in the main HES table.
len(chk1.eid.unique())

395859