In [10]:
# Make the CSV input folder the current working directory, so that relative
# paths and shell commands such as `!ls` in later cells resolve against it.
import os
os.chdir('/kaggle/input/dataset/csv')

In [None]:
# Download dataset

!wget https://synthea-open-data.s3.amazonaws.com/coherent/coherent-11-07-2022.zip -O "dataset.zip"

In [None]:
# List the contents of the current working directory.
!ls 

allergies.csv	encounters.csv	     observations.csv	payer_transitions.csv
careplans.csv	imaging_studies.csv  organizations.csv	procedures.csv
conditions.csv	immunizations.csv    patients.csv	providers.csv
devices.csv	medications.csv      payers.csv		supplies.csv


In [None]:
# Create the extraction target folder. `exist_ok=True` makes the cell safe to
# re-run and removes the separate exists-then-create check.
os.makedirs('/kaggle/working/dataset', exist_ok=True)

In [None]:
import zipfile

In [None]:
# Unpack the downloaded archive into the dataset folder.
with zipfile.ZipFile('/kaggle/working/dataset.zip') as zf:
  # Name of the first entry in the archive; it is not read again by any cell
  # shown in this notebook section.
  zip_dir = zf.namelist()[0]
  zf.extractall('/kaggle/working/dataset')

In [None]:
# Remove the extracted 'fhir' folder together with everything inside it.
# A bare `rmdir` depends on IPython automagic and can only delete an empty
# directory; shutil.rmtree removes the whole tree. The isdir guard keeps the
# cell from raising when the folder is already gone (e.g. on a re-run).
import shutil

fhir_dir = '/kaggle/working/dataset/fhir'
if os.path.isdir(fhir_dir):
    shutil.rmtree(fhir_dir)

In [6]:
# Delete the downloaded archive from the working directory.
# os.remove raises FileNotFoundError if the file is already gone.
os.remove('/kaggle/working/dataset.zip')

In [None]:
import os

# Folder whose top-level files are removed; nested folders are left alone.
dir_path = r"/kaggle/working/dataset/dicom"

# Walk over a snapshot of the directory listing and delete regular files only.
for filename in os.listdir(dir_path):
    file_path = os.path.join(dir_path, filename)

    if not os.path.isfile(file_path):
        # Not a file (e.g. a subdirectory): skip it.
        continue

    os.remove(file_path)
    print(f"Deleted file: {filename}")

In [12]:
import pandas as pd

# Load the patients table into a DataFrame.
df = pd.read_csv('/kaggle/input/dataset/csv/patients.csv')

# Preview the first rows. Leaving the expression bare (instead of print) lets
# the notebook render the frame as a table rather than as wrapped plain text.
df.head()

                                     Id   BIRTHDATE   DEATHDATE          SSN  \
0  8b0484cd-3dbd-8b8d-1b72-a32f74a5a846  1957-04-07         NaN  999-32-2242   
1  b8eb8d31-1031-fb5b-e207-b9815f80744c  1975-08-16         NaN  999-70-2742   
2  ce9bd436-6b59-0452-86a4-61f3642736bc  1945-05-11  2015-04-09  999-54-1330   
3  6fc3e360-ae68-c411-e091-4734df51eb18  1947-12-30         NaN  999-59-9652   
4  ce4ce4d8-d4e2-aca2-5a92-8ce703c5077a  1993-02-05         NaN  999-11-2438   

     DRIVERS    PASSPORT PREFIX        FIRST          LAST SUFFIX  ...  \
0  S99944366  X13210523X    Mr.     Gregg522  Cummerata161    NaN  ...   
1  S99952609  X70704838X    Mr.    Lemuel304   Rodriguez71    NaN  ...   
2  S99979547  X86639992X    Mr.  Leonardo412      Klein929    NaN  ...   
3  S99949959  X34069329X   Mrs.     Adelle43     Dooley940    NaN  ...   
4  S99932651  X80439009X   Mrs.      Veta780    Spencer878    NaN  ...   

                         BIRTHPLACE                   ADDRESS         CITY

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [13]:
import csv
import json

def csv_to_json(csv_file_path, json_file_path):
    """Convert a CSV file into a JSON array of row objects.

    The header line of the CSV supplies the keys; every following line becomes
    one object whose values are the raw strings read from the file.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read.
        json_file_path: Path of the JSON file to write (overwritten if present).
    """
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as source:
        # DictReader yields one dict per data line, keyed by the header names.
        records = list(csv.DictReader(source))

    with open(json_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(records, sink, indent=4)

# Example usage: dump the patients table as a JSON array.
csv_to_json(
    csv_file_path='/kaggle/input/dataset/csv/patients.csv',
    json_file_path='/kaggle/working/output.json',
)

In [14]:
import csv
import json

def csv_to_json_with_id_as_key(csv_file_path, json_file_path):
    """Convert a CSV file into a JSON object keyed by each row's 'Id' value.

    The 'Id' column is removed from every row and used as that row's key; the
    remaining columns form the value. When two rows share an 'Id', the later
    row replaces the earlier one.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read.
        json_file_path: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If a row has no 'Id' field.
    """
    by_id = {}
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as source:
        rows = csv.DictReader(source)
        for record in rows:
            # pop() both strips 'Id' from the record and returns it as the key.
            by_id[record.pop('Id')] = record

    with open(json_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(by_id, sink, indent=4)

# Example usage: key the patients table by patient Id.
# NOTE: this writes to the same '/kaggle/working/output.json' file as the
# earlier csv_to_json example call, so that file is overwritten here.
csv_to_json_with_id_as_key(
    csv_file_path='/kaggle/input/dataset/csv/patients.csv',
    json_file_path='/kaggle/working/output.json',
)


In [16]:
import csv
import json

def csv_to_json_with_patient_details(csv_file_path, json_file_path):
    """Write a JSON object mapping each 'Id' to ``{"patient_details": row}``.

    The 'Id' column is removed from every row and becomes the top-level key;
    the rest of the row is nested under "patient_details". A later row with the
    same 'Id' replaces an earlier one.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read.
        json_file_path: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If a row has no 'Id' field.
    """
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as source:
        # The key expression pops 'Id' out of the very dict that is stored as
        # the value, so the nested details never contain the Id column.
        wrapped = {
            record.pop('Id'): {"patient_details": record}
            for record in csv.DictReader(source)
        }

    with open(json_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(wrapped, sink, indent=4)

# Example usage: nest every patient's columns under "patient_details".
csv_to_json_with_patient_details(
    csv_file_path='/kaggle/input/dataset/csv/patients.csv',
    json_file_path='/kaggle/working/patient_details_output.json',
)


In [1]:
import csv
import json

def combine_patient_and_allergies(patient_csv, allergies_csv, output_json):
    """Merge patients with their allergy rows and write the result as JSON.

    The output maps each patient 'Id' to a dict with two entries:
    "patient_details" (the patient row without its 'Id' column) and
    "allergies" (a list of allergy rows without their 'PATIENT' column, in
    file order). Allergy rows that reference an unknown patient are ignored.

    Args:
        patient_csv: Path of the patients CSV (needs an 'Id' column).
        allergies_csv: Path of the allergies CSV (needs a 'PATIENT' column).
        output_json: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If a patient row lacks 'Id' or an allergy row lacks 'PATIENT'.
    """
    combined = {}

    # Pass 1: one entry per patient, with an empty allergy list to fill later.
    with open(patient_csv, mode='r', newline='', encoding='utf-8') as patient_file:
        for details in csv.DictReader(patient_file):
            key = details.pop('Id')
            combined[key] = {"patient_details": details, "allergies": []}

    # Pass 2: hang every allergy row on the patient it refers to.
    with open(allergies_csv, mode='r', newline='', encoding='utf-8') as allergy_file:
        for allergy in csv.DictReader(allergy_file):
            owner = combined.get(allergy.pop('PATIENT'))
            if owner is None:
                # Allergy points at a patient that is not in the patients file.
                continue
            owner["allergies"].append(allergy)

    with open(output_json, mode='w', encoding='utf-8') as sink:
        json.dump(combined, sink, indent=4)

# Example usage
# The output path is absolute: the notebook's first cell moves the working
# directory into the CSV input folder, so a bare file name would resolve there
# instead of next to the other generated files in /kaggle/working.
combine_patient_and_allergies(
    '/kaggle/input/dataset/csv/patients.csv',
    '/kaggle/input/dataset/csv/allergies.csv',
    '/kaggle/working/patient_allergies.json',
)


In [2]:
import csv
import json

def combine_patient_allergies_careplans(patient_csv, allergies_csv, careplans_csv, output_json):
    """Merge patients with their allergies and care plans into one JSON file.

    The output maps each patient 'Id' to a dict with "patient_details" (the
    full patient row, 'Id' included), "allergies" and "careplans" (lists of the
    matching rows, 'PATIENT' column included, in file order). Rows that
    reference an unknown patient are ignored.

    Args:
        patient_csv: Path of the patients CSV (needs an 'Id' column).
        allergies_csv: Path of the allergies CSV (needs a 'PATIENT' column).
        careplans_csv: Path of the care plans CSV (needs a 'PATIENT' column).
        output_json: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If a required 'Id' / 'PATIENT' field is missing from a row.
    """
    def iter_rows(path):
        # Stream the rows of one CSV file as dicts keyed by its header.
        with open(path, mode='r', newline='', encoding='utf-8') as handle:
            yield from csv.DictReader(handle)

    combined = {}
    for details in iter_rows(patient_csv):
        # Rows are stored untouched, so 'Id' stays inside patient_details.
        combined[details['Id']] = {
            "patient_details": details,
            "allergies": [],
            "careplans": [],
        }

    # Same attach step for both child tables, processed in this order.
    for section, path in (("allergies", allergies_csv), ("careplans", careplans_csv)):
        for record in iter_rows(path):
            owner = combined.get(record['PATIENT'])
            if owner is not None:
                owner[section].append(record)

    with open(output_json, mode='w', encoding='utf-8') as sink:
        json.dump(combined, sink, indent=4)

# Example usage
# The output path is absolute: the notebook's first cell moves the working
# directory into the CSV input folder, so a bare file name would resolve there
# instead of next to the other generated files in /kaggle/working.
combine_patient_allergies_careplans(
    '/kaggle/input/dataset/csv/patients.csv',
    '/kaggle/input/dataset/csv/allergies.csv',
    '/kaggle/input/dataset/csv/careplans.csv',
    '/kaggle/working/patients_allergies_careplans.json',
)


In [2]:
import csv
import json

# Build one JSON document that nests every per-patient table under its patient.
#
# Output shape: {patient Id: {"patient_details": <patients row>,
#                             "<section>": [<rows of that table>, ...], ...}}
# Rows are attached through their 'PATIENT' column; rows that reference a
# patient missing from patients.csv are ignored.

# Input tables.
patient_csv = '/kaggle/input/dataset/csv/patients.csv'
allergies_csv = '/kaggle/input/dataset/csv/allergies.csv'
careplans_csv = '/kaggle/input/dataset/csv/careplans.csv'
conditions_csv = '/kaggle/input/dataset/csv/conditions.csv'
devices_csv = '/kaggle/input/dataset/csv/devices.csv'
encounters_csv = '/kaggle/input/dataset/csv/encounters.csv'
imaging_studies_csv = '/kaggle/input/dataset/csv/imaging_studies.csv'
immunizations_csv = '/kaggle/input/dataset/csv/immunizations.csv'
medications_csv = '/kaggle/input/dataset/csv/medications.csv'
observations_csv = '/kaggle/input/dataset/csv/observations.csv'
#organizations_csv = '/kaggle/input/dataset/csv/organizations.csv'
payer_transitions_csv = '/kaggle/input/dataset/csv/payer_transitions.csv'
#payers_csv = '/kaggle/input/dataset/csv/payers.csv'
procedures_csv = '/kaggle/input/dataset/csv/procedures.csv'
#providers_csv = '/kaggle/input/dataset/csv/providers.csv'
#supplies_csv = '/kaggle/input/dataset/csv/supplies.csv'

# Absolute path: the notebook's first cell moves the working directory into the
# CSV input folder, so a bare file name would not land in /kaggle/working.
output_json = '/kaggle/working/final_output.json'

# Output section name -> CSV table that feeds it. The insertion order fixes
# both the key order inside each patient entry and the order the tables are read.
patient_tables = {
    "allergies": allergies_csv,
    "careplans": careplans_csv,
    "conditions": conditions_csv,
    "devices": devices_csv,
    "encounters": encounters_csv,
    "imaging_studies": imaging_studies_csv,
    "immunizations": immunizations_csv,
    "medications": medications_csv,
    "observations": observations_csv,
    "payer_transitions": payer_transitions_csv,
    "procedures": procedures_csv,
}

patients = {}

# One entry per patient, with an empty list for every section.
with open(patient_csv, mode='r', newline='', encoding='utf-8') as pfile:
    for row in csv.DictReader(pfile):
        patient_id = row['Id']
        patients[patient_id] = {
            "patient_details": row,
            **{section: [] for section in patient_tables},
        }

# Attach the rows of each table to the patient they belong to.
for section, table_csv in patient_tables.items():
    with open(table_csv, mode='r', newline='', encoding='utf-8') as table_file:
        for row in csv.DictReader(table_file):
            patient_id = row['PATIENT']
            if patient_id in patients:
                patients[patient_id][section].append(row)

with open(output_json, mode='w', encoding='utf-8') as outfile:
    json.dump(patients, outfile, indent=4)