In [8]:
import os
import json
import random
from datetime import datetime

# Special vocabulary tokens; going by their names, the marker for unknown
# codes and the padding symbol (their use is not shown in these cells).
UNK_TOKEN = "<UNK>"
PAD_TOKEN = "<PAD>"

In [9]:
# Load the raw reference records. The encoding is stated explicitly because
# JSON text is UTF-8, whereas open() would otherwise fall back to the
# platform's locale encoding and could mis-decode non-ASCII characters.
with open("ref_example_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [10]:
# Reformat every record in `data` into the schema that is written to disk:
# birth / attendance / death dates, the split group, and a list of DIAG events.
DATE_FORMAT = "%Y-%m-%d"
DEATH_PROBABILITY = 0.3  # chance that a record is given a death date
RANDOM_SEED = 0

# A dedicated, seeded generator makes the random picks below identical on every
# "Restart & Run All" and leaves the global `random` state untouched.
rng = random.Random(RANDOM_SEED)

new_data = {}

for pid, record in data.items():
    # Parsed only to fail fast on a malformed birthdate; the parsed value
    # itself is not needed for the output.
    datetime.strptime(record["birthdate"], DATE_FORMAT)

    # Order the events chronologically by admission date.
    events = sorted(
        record["events"],
        key=lambda e: datetime.strptime(e["admdate"], DATE_FORMAT),
    )

    # A record without events cannot supply an attendance date; skip it.
    if not events:
        continue

    # The attendance date is the admission date of one randomly chosen event.
    chosen_event = rng.choice(events)
    # Synthetic death flag: drawn independently for each record.
    is_dead = rng.random() < DEATH_PROBABILITY

    new_data[pid] = {
        "birth_date": record["birthdate"],
        "split_group": record["split_group"],
        "attendance_date": chosen_event["admdate"],
        # `end_of_data` is only read when the record is flagged as dead.
        "death_date": record["end_of_data"] if is_dead else None,
        # Every event becomes a diagnosis entry dated by its admission date.
        "events": [
            {"type": "DIAG", "codes": event["codes"], "diagdate": event["admdate"]}
            for event in events
        ],
    }


In [11]:
# Serialize the reformatted records to disk as a single JSON document.
with open("sample_data_formatted.json", mode="w") as out_file:
    json.dump(new_data, fp=out_file)


{0: '<PAD>',
 1: '<UNK>',
 2: 'A278',
 3: 'A379',
 4: 'A469',
 5: 'A497',
 6: 'B171',
 7: 'B478',
 8: 'B94',
 9: 'C156',
 10: 'C174',
 11: 'C25',
 12: 'C273',
 13: 'E272',
 14: 'E283',
 15: 'E31',
 16: 'E352',
 17: 'E399',
 18: 'E418',
 19: 'E431',
 20: 'E474',
 21: 'E483',
 22: 'F21',
 23: 'F366',
 24: 'F421',
 25: 'F442',
 26: 'G204',
 27: 'G214',
 28: 'G250',
 29: 'G281',
 30: 'G342',
 31: 'G388',
 32: 'G451',
 33: 'G86',
 34: 'H101',
 35: 'H322',
 36: 'H442',
 37: 'H465',
 38: 'I371',
 39: 'I373',
 40: 'I38',
 41: 'I85',
 42: 'L107',
 43: 'L266',
 44: 'L280',
 45: 'L400',
 46: 'L425',
 47: 'L74',
 48: 'M346',
 49: 'M363',
 50: 'M433',
 51: 'M497',
 52: 'M54',
 53: 'N156',
 54: 'N177',
 55: 'N363',
 56: 'N431',
 57: 'N486',
 58: 'N84',
 59: 'N98',
 60: 'O269',
 61: 'O277',
 62: 'O319',
 63: 'O424',
 64: 'O462',
 65: 'P109',
 66: 'P128',
 67: 'P135',
 68: 'P234',
 69: 'P425',
 70: 'Q12',
 71: 'Q167',
 72: 'Q364',
 73: 'Q380',
 74: 'Q487',
 75: 'R129',
 76: 'R143',
 77: 'S108',
 78: '