In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
# Collect every CSV under ./data (searched recursively) whose name matches
# diag_201*_new.csv.
# Sorted because glob yields paths in arbitrary, filesystem-dependent order; a
# fixed order keeps the concatenated frame (and everything that depends on its
# row order) reproducible between runs and machines.
files = sorted(str(x) for x in Path("./data").glob("**/diag_201*_new.csv"))

In [5]:
# Load every matched CSV and stack the rows into a single frame with a fresh
# 0..n-1 index.
# NOTE(review): column dtypes are inferred by read_csv — confirm that Diag is
# read as text so that codes keep any leading zeros.
frames = [pd.read_csv(path) for path in files]
dg = pd.concat(frames, axis=0, ignore_index=True)

# Pre-processing.

1. In 2010, most Diag codes follow ICD-9, whereas in 2018, most follow ICD-10. Because of this difference, we add a prefix to each code to indicate which ICD version is used.

2. A *mysterious* column named 'Fst_Dt' can produce repeated records for the same (Patid, Pat_Planid, Clmid, Diag), which makes no sense, so we leave that column out and remove the duplicate rows.

In [42]:
# Keep only the claim/diagnosis columns, build a DiagId token of the form
# 'ICD_<Icd_Flag>_<Diag>' for every row, then drop rows that are exact
# duplicates across all remaining columns.
dg_sub = (
    dg.loc[:, ['Patid', 'Pat_Planid', 'Clmid', 'Icd_Flag', 'Diag', 'Diag_Position']]
    .assign(DiagId=lambda df: 'ICD_' + df['Icd_Flag'].astype(str) + '_' + df['Diag'])
    .drop_duplicates()
)

In [77]:
# Preview the first rows of the de-duplicated diagnosis table.
dg_sub.head()

Unnamed: 0,Patid,Pat_Planid,Clmid,Icd_Flag,Diag,Diag_Position,DiagId
0,560499200782112,140519822553544,JF39FJO39F,9,72252,1.0,ICD_9_72252
1,560499200782112,140519822553544,JF39FJO39F,9,7292,2.0,ICD_9_7292
2,560499200782112,140519822553544,JFN3FRR9LJ,9,72252,1.0,ICD_9_72252
3,560499200782112,140519822553544,JFN3FRR9LJ,9,7292,2.0,ICD_9_7292
4,560499200782112,140519822553544,JFO89NL3NR,9,2572,1.0,ICD_9_2572


## Combine Diag within one Clmid together

Each row is a Claim (visit).

In [74]:
# Author's note: 16843 unique Diag values in 2010.
# One entry per (Patid, Pat_Planid, Clmid): the DiagId tokens of that claim
# joined by single spaces.
claim_groups = dg_sub.groupby(['Patid', 'Pat_Planid', 'Clmid'])['DiagId']
PatDetail = claim_groups.apply(' '.join)

In [79]:
# Turn the grouped Series into a flat frame: the group keys become ordinary
# columns next to the joined DiagId string.
PatDetailDF = PatDetail.to_frame().reset_index()

In [126]:
# Preview the per-claim table: one row per (Patid, Pat_Planid, Clmid).
PatDetailDF.head()

Unnamed: 0,Patid,Pat_Planid,Clmid,DiagId
0,560499200782112,140519822553544,JF39FJO39F,ICD_9_72252 ICD_9_7292
1,560499200782112,140519822553544,JFN3FRR9LJ,ICD_9_72252 ICD_9_7292
2,560499200782112,140519822553544,JFO89NL3NR,ICD_9_2572
3,560499200782112,140519822553544,JFOR9NFRF3,ICD_9_2572
4,560499200782112,140519822553544,OV9RN3RJJV,ICD_9_7242 ICD_9_7292 ICD_9_7197 ICD_9_72252


## Combine Clmid within one Patid_Planid together

Each row is one (Patid, Pat_Planid) pair and his/her visit history, with consecutive claims separated by `[SEP]`.

In [96]:
# One entry per (Patid, Pat_Planid): that pair's per-claim strings joined with
# ' [SEP] '.
# NOTE(review): claims are joined in the row order of PatDetailDF, which comes
# from a groupby keyed on Clmid rather than on a service date — confirm this is
# the intended "history" order.
patient_groups = PatDetailDF.groupby(['Patid', 'Pat_Planid'])['DiagId']
PatRecord = patient_groups.apply(' [SEP] '.join)

In [98]:
# Flatten the grouped Series into a frame with Patid and Pat_Planid as regular
# columns alongside the joined DiagId history.
PatRecordDF = PatRecord.to_frame().reset_index()

In [125]:
# Preview the per-patient table: one row per (Patid, Pat_Planid).
PatRecordDF.head()

Unnamed: 0,Patid,Pat_Planid,DiagId
0,560499200782112,140519822553544,ICD_9_72252 ICD_9_7292 [SEP] ICD_9_72252 ICD_9...
1,560499201057366,140519815234940,ICD_9_8409
2,560499201124163,140519842268447,ICD_9_6929 [SEP] ICD_9_6929 [SEP] ICD_9_V7231 ...
3,560499201299620,140519842338011,ICD_9_V700 [SEP] ICD_9_V700 [SEP] ICD_9_V700
4,560499201462177,140519811854342,ICD_9_36803 ICD_9_36721 ICD_9_36731 [SEP] ICD_...


In [105]:
# Export the joined diagnosis strings as plain text, one (Patid, Pat_Planid)
# record per line.
claim2010 = PatRecordDF['DiagId'].to_numpy()

np.savetxt(fname='data/claim2010.txt', X=claim2010, fmt='%s', delimiter='\t')

In [101]:
# Save the grouped Series (indexed by Patid and Pat_Planid) with its joined
# DiagId strings as a CSV file.
PatRecord.to_csv('data/claimRecord_2010.csv')

# Hardcode a vocab for BERT.

See [01-training-tokenizers.ipynb](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb#scrollTo=2ed-CMFze9vc) for details.

In [119]:
# Build the token -> integer id mapping.
# Diagnosis tokens are sorted before numbering so the ids are the same on every
# run (iterating a set of strings gives a run-dependent order). Missing DiagId
# values are skipped so that only strings are sorted and stored.
diag_tokens = sorted(dg_sub.DiagId.dropna().unique())
vocab = {token: idx for idx, token in enumerate(diag_tokens)}

# Special tokens take the next free ids. Starting at len(vocab) guarantees that
# none of them shares an id with a diagnosis token, and it also works when
# there are no diagnosis tokens at all.
first_special_id = len(vocab)
for offset, token in enumerate(['[UNK]', '[SEP]', '[CLS]']):
    vocab[token] = first_special_id + offset

In [120]:
import json

# Write the token -> id mapping as JSON. The output folder is created first so
# the cell also runs on a fresh checkout where ./Real does not exist yet.
vocab_path = Path('./Real/vocab.json')
vocab_path.parent.mkdir(parents=True, exist_ok=True)
with open(vocab_path, 'w') as outfile:
    json.dump(vocab, outfile)