### Importing CMS data into a database: an example of longitudinal data with multiple child tables

In [1]:
! pip install --upgrade pip
! pip install fuzzy_sql-2.0.0b0-py3-none-any.whl

Processing ./fuzzy_sql-2.0.0b0-py3-none-any.whl
fuzzy-sql is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [2]:
import sqlite3
import json
import os
from pathlib import Path

from fuzzy_sql.load import prep_data_for_db, get_vars_to_index, make_table

# Short dataset key: the per-dataset data folders and the database file are named after it.
DATASET_NAME = 'cms'

In [3]:
# Top-level folders, both resolved against the current working directory.
DATA_DIR, DB_DIR = (
    os.path.join(os.getcwd(), folder) for folder in ('data', 'databases')
)

# Per-dataset input folders: <DATA_DIR>/<DATASET_NAME>/{metadata,real,syn}
metadata_dir, real_dir, syn_dir = (
    os.path.join(DATA_DIR, DATASET_NAME, folder)
    for folder in ('metadata', 'real', 'syn')
)

# SQLite database file that will hold all imported tables.
db_path = os.path.join(DB_DIR, f'{DATASET_NAME}.db')

In [4]:
# Input files. The four lists below are parallel: position i in each list
# describes the same table (real CSV, its metadata JSON, synthetic CSV, row count).

# Real data CSVs (Sample 1)
real_csvs = [
    "DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2009_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2010_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_1A.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_1B.csv",
    "DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv",
    "DE1_0_2008_to_2010_Outpatient_Claims_Sample_1.csv",
    "DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_1.csv",
]

# Metadata JSON files; the file stem doubles as the table name of the real data
meta_jsons = [
    's1_ben_sum_2008.json',
    's1_ben_sum_2009.json',
    's1_ben_sum_2010.json',
    's1_carrier_1a.json',
    's1_carrier_1b.json',
    's1_inpatient.json',
    's1_outpatient.json',
    's1_prescrp.json',
]

# Synthetic data CSVs (Sample 2)
syn_csvs = [
    "DE1_0_2008_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2009_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2010_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_2A.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_2B.csv",
    "DE1_0_2008_to_2010_Inpatient_Claims_Sample_2.csv",
    "DE1_0_2008_to_2010_Outpatient_Claims_Sample_2.csv",
    "DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv",
]

# Number of rows to extract from each corresponding table
rows = [10000, 10000, 10000, 20000, 20000, 15000, 15000, 15000]

### Prepare data and import it into the database with indexing


In [5]:
# sqlite3.connect() creates the database file but not its parent folder,
# so make sure the folder exists (e.g. on a fresh checkout).
os.makedirs(DB_DIR, exist_ok=True)

conn = sqlite3.connect(db_path)
try:
    # `with conn` only scopes a transaction (commit on success, rollback on error);
    # it does not close the connection, which is why it is closed in `finally` below.
    with conn:
        for real_csv, meta_json, syn_csv, rows_i in zip(real_csvs, meta_jsons, syn_csvs, rows):
            # Import real
            # Note: prep_data_for_db can also be used to quickly generate a metadata
            # template (its second return value is ignored here); this is not shown.
            data, _ = prep_data_for_db(os.path.join(real_dir, real_csv), nrows=rows_i)
            with open(os.path.join(metadata_dir, meta_json), 'r') as f:
                metadata = json.load(f)
            candidate_db_idx = get_vars_to_index(metadata, data)
            table_name = Path(meta_json).stem
            make_table(table_name, data, conn, indx_vars=candidate_db_idx)

            # Import syn
            # The synthetic table reuses the index variables derived from the real data
            # and is named after the metadata file with the leading "s1" swapped for "s2".
            data, _ = prep_data_for_db(os.path.join(syn_dir, syn_csv), nrows=rows_i)
            table_name = Path(meta_json.replace("s1", "s2", 1)).stem
            make_table(table_name, data, conn, indx_vars=candidate_db_idx)
finally:
    # Release the database handle; reopen with sqlite3.connect(db_path) if it is needed again.
    conn.close()

Table s1_ben_sum_2008 already exists in the database
Table s2_ben_sum_2008 already exists in the database
Table s1_ben_sum_2009 already exists in the database
Table s2_ben_sum_2009 already exists in the database
Table s1_ben_sum_2010 already exists in the database
Table s2_ben_sum_2010 already exists in the database
Table s1_carrier_1a already exists in the database
Table s2_carrier_1a already exists in the database
Table s1_carrier_1b already exists in the database
Table s2_carrier_1b already exists in the database
Table s1_inpatient already exists in the database
Table s2_inpatient already exists in the database
Table s1_outpatient already exists in the database
Table s2_outpatient already exists in the database
Table s1_prescrp already exists in the database
Table s2_prescrp already exists in the database
