In [1]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1
import pandas as pd
import os
# Use the current user's gcloud application-default credentials rather than one
# specific home directory, and leave an already-exported
# GOOGLE_APPLICATION_CREDENTIALS untouched.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    os.path.join(
        os.path.expanduser('~'), '.config', 'gcloud', 'application_default_credentials.json'
    ),
)
# Project id exposed to the Google Cloud client libraries through the environment.
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-nigam-starr'

In [2]:
# Substitutions shared by every SQL template below: the OMOP schema, the schema and
# table holding the cohort, the cohort column used as the index date, an optional
# LIMIT clause for quick test runs, and the cohort's row identifier column.
param_dict = dict(
    schema='starr_omop_cdm5_deid_20200404',
    rs_schema='plp_cohort_tables',
    cohort_name='admission_rollup_20200404_with_labels_sampled',
    index_date='cohort_start_date',
    limit_str='LIMIT 1000',
    row_id='prediction_id',
)

In [3]:
# One entry per source table:
# (source table, concept-id column, date column, analysis label).
_table_specs = [
    ('condition_occurrence', 'condition_concept_id', 'condition_start_date', 'condition_occurrence'),
    ('drug_exposure', 'drug_concept_id', 'drug_exposure_start_date', 'drug_exposure'),
    ('device_exposure', 'device_concept_id', 'device_exposure_start_date', 'device_exposure'),
    ('measurement', 'measurement_concept_id', 'measurement_date', 'measurement'),
    ('procedure_occurrence', 'procedure_concept_id', 'procedure_date', 'procedure_occurrence'),
    ('visit_occurrence', 'visit_concept_id', 'visit_start_date', 'visit_occurrence'),
    ('note', 'note_type_concept_id', 'note_date', 'note_type'),
    ('observation', 'observation_concept_id', 'observation_date', 'observation'),
]

# Per-table substitutions for the SQL templates, keyed by source table name.
table_keys_dict = {
    table_name: {
        'concept_id': concept_column,
        'concept_date': date_column,
        'source_table': table_name,
        'analysis_id': analysis_label,
    }
    for table_name, concept_column, date_column, analysis_label in _table_specs
}

In [86]:
# SQL template: every standard-concept occurrence in a source table that is dated
# strictly before the cohort row's index date.

concept_query = """
    SELECT 
        {row_id},
        person_id,
        index_date,
        concept_id,
        concept_date,
        analysis_id
    FROM (
        SELECT 
            {row_id},
            t1.person_id,
            CAST(t2.{index_date} AS DATE) as index_date, 
            {concept_id} AS concept_id, 
            CAST({concept_date} AS DATE) AS concept_date,
            '{analysis_id}' AS analysis_id
        FROM {schema}.{source_table} t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.{concept_id} = t3.concept_id
        WHERE 
            CAST({concept_date} AS DATE) < CAST(t2.{index_date} AS DATE) 
            AND standard_concept = 'S'
        {limit_str}
    )
"""

# One formatted query per source table; table-specific keys override shared ones.
concept_query_dict = dict(
    (table_name, concept_query.format_map({**param_dict, **table_keys}))
    for table_name, table_keys in table_keys_dict.items()
)

In [87]:
# Smoke test on the note table. The formatted all-occurrence queries live in
# `concept_query_dict`; `query_dict` is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
query = concept_query_dict['note']
df = pd.read_gbq(query, dialect='standard')

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2528.94rows/s]


In [None]:
# Run the all-occurrence query for every source table. Iterates
# `concept_query_dict`; the previously referenced `query_dict` does not exist.
result_dict = {
    key: pd.read_gbq(query, dialect='standard')
    for key, query in concept_query_dict.items()
}

In [91]:
# SQL template: the distinct (row id, person, concept) combinations observed before
# the index date, i.e. presence of a concept rather than each occurrence.
distinct_concept_query = """
    SELECT 
    {row_id},
    {concept_id} AS concept_id,
    '{analysis_id}' AS analysis_id
    FROM (
        SELECT 
            DISTINCT 
                {row_id},
                t1.person_id,
                {concept_id}
        FROM {schema}.{source_table} t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.{concept_id} = t3.concept_id
        WHERE 
            CAST({concept_date} AS DATE) < CAST(t2.{index_date} AS DATE) 
            AND standard_concept = 'S'
        {limit_str}
    )
"""

# One formatted query per source table.
distinct_concept_query_dict = dict(
    (table_name, distinct_concept_query.format_map({**param_dict, **table_keys}))
    for table_name, table_keys in table_keys_dict.items()
)

In [92]:
# Smoke test: run the distinct-concept query for the note table only.
query = distinct_concept_query_dict['note']
df = pd.read_gbq(query=query, dialect='standard')

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2664.90rows/s]


In [94]:
# Download the distinct-concept result for every source table, keyed by table.
distinct_result_dict = dict(
    (table_name, pd.read_gbq(query=sql, dialect='standard'))
    for table_name, sql in distinct_concept_query_dict.items()
)

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 5605.30rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2672.20rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 5078.24rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 5406.19rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 3301.15rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 5514.66rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2612.67rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2846.00rows/s]


In [108]:
# SQL template: standard-concept occurrences whose date falls inside the inclusive
# window [index_date + bin_left days, index_date + bin_right days].

binned_concept_query = """
    SELECT 
        {row_id},
        person_id,
        index_date,
        concept_id,
        concept_date,
        analysis_id,
        CONCAT('bin_', {bin_left}, '_', {bin_right}) AS time_bin
    FROM (
        SELECT 
            {row_id},
            t1.person_id,
            CAST(t2.{index_date} AS DATE) as index_date, 
            {concept_id} AS concept_id, 
            CAST({concept_date} AS DATE) AS concept_date,
            '{analysis_id}' AS analysis_id
        FROM {schema}.{source_table} t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.{concept_id} = t3.concept_id
        WHERE 
            CAST({concept_date} AS DATE) BETWEEN 
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_left} DAY) AND
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_right} DAY)
            AND standard_concept = 'S'
        {limit_str}
    )
"""

# Format once per source table, using the 30-day window that ends the day before
# the index date.
binned_concept_query_dict = dict(
    (
        table_name,
        binned_concept_query.format_map(
            dict(param_dict, **table_keys, bin_left='-30', bin_right='-1')
        ),
    )
    for table_name, table_keys in table_keys_dict.items()
)

In [110]:
# Smoke test: run the binned query for visit_occurrence only.
query = binned_concept_query_dict['visit_occurrence']
df = pd.read_gbq(query=query, dialect='standard')

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 3814.63rows/s]


In [111]:
# Show the current contents of `df`.
df

Unnamed: 0,prediction_id,person_id,index_date,concept_id,concept_date,analysis_id,time_bin
0,365837,31932428,2018-12-08,9203,2018-11-08,visit_occurrence,bin_-30_-1
1,482888,32575784,2018-05-14,9203,2018-05-13,visit_occurrence,bin_-30_-1
2,68028,30298744,2015-12-13,9203,2015-11-13,visit_occurrence,bin_-30_-1
3,325539,31720581,2015-12-11,9203,2015-12-10,visit_occurrence,bin_-30_-1
4,325539,31720581,2015-12-11,9203,2015-11-24,visit_occurrence,bin_-30_-1
...,...,...,...,...,...,...,...
995,157999,30790406,2018-11-29,9201,2018-10-30,visit_occurrence,bin_-30_-1
996,227948,31183775,2015-05-24,9201,2015-05-03,visit_occurrence,bin_-30_-1
997,448647,32392678,2011-02-09,9201,2011-02-03,visit_occurrence,bin_-30_-1
998,16341,30021950,2012-06-19,9201,2012-06-18,visit_occurrence,bin_-30_-1


In [112]:
# Download the binned result for every source table, keyed by table.
binned_result_dict = dict(
    (table_name, pd.read_gbq(query=sql, dialect='standard'))
    for table_name, sql in binned_concept_query_dict.items()
)

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1073.80rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1368.93rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1111.44rows/s]
Downloading: 100%|██████████| 1000/1000 [00:01<00:00, 875.93rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1391.49rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1002.65rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1296.77rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 1307.39rows/s]


In [116]:
# SQL template: distinct (row id, person, concept) combinations observed inside the
# inclusive window [index_date + bin_left days, index_date + bin_right days].
binned_distinct_concept_query = """
    SELECT 
    {row_id},
    {concept_id} AS concept_id,
    '{analysis_id}' AS analysis_id,
    CONCAT('bin_', {bin_left}, '_', {bin_right}) AS time_bin
    FROM (
        SELECT 
            DISTINCT 
                {row_id},
                t1.person_id,
                {concept_id}
        FROM {schema}.{source_table} t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.{concept_id} = t3.concept_id
        WHERE 
            CAST({concept_date} AS DATE) BETWEEN 
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_left} DAY) AND
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_right} DAY)
            AND standard_concept = 'S'
        {limit_str}
    )
"""

# Format once per source table, using the 30-day window that ends the day before
# the index date.
binned_distinct_concept_query_dict = dict(
    (
        table_name,
        binned_distinct_concept_query.format_map(
            dict(param_dict, **table_keys, bin_left='-30', bin_right='-1')
        ),
    )
    for table_name, table_keys in table_keys_dict.items()
)

In [118]:
# Download the binned distinct-concept result for every source table.
binned_distinct_result_dict = dict(
    (table_name, pd.read_gbq(query=sql, dialect='standard'))
    for table_name, sql in binned_distinct_concept_query_dict.items()
)

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2267.02rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 4808.06rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 6202.20rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2586.46rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2573.66rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2771.98rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 4051.70rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 5551.96rows/s]


In [120]:
# Get counts
# SQL template: number of standard-concept occurrences per
# (row id, person, index date, concept, analysis, time bin).
#
# The outer SELECT lists its columns explicitly. `SELECT *` would also expand to
# `index_date` and `concept_date`, which are neither grouped nor aggregated, and
# BigQuery rejects such a query. `index_date` is added to the GROUP BY, matching
# the grouping used by `measurement_range_query`.
count_query = """
    SELECT
        {row_id},
        person_id,
        index_date,
        concept_id,
        analysis_id,
        time_bin,
        COUNT(*) AS concept_count
    FROM (
        SELECT
            {row_id},
            t1.person_id,
            CAST(t2.{index_date} AS DATE) as index_date,
            {concept_id} AS concept_id,
            CAST({concept_date} AS DATE) AS concept_date,
            '{analysis_id}' AS analysis_id,
            CONCAT('bin_', {bin_left}, '_', {bin_right}) AS time_bin
        FROM {schema}.{source_table} t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.{concept_id} = t3.concept_id
        WHERE
            CAST({concept_date} AS DATE) BETWEEN
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_left} DAY) AND
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_right} DAY)
            AND standard_concept = 'S'
        {limit_str}
    )
    GROUP BY {row_id}, person_id, index_date, concept_id, analysis_id, time_bin
"""
# One formatted query per source table for the 30-day window before the index date.
count_query_dict = {
    key: count_query.format_map(
        {**param_dict, **value, 'bin_left': str(-30), 'bin_right': str(-1)})
    for key, value in table_keys_dict.items()
}

In [122]:
# Download the per-concept counts for every source table.
count_query_result_dict = dict(
    (table_name, pd.read_gbq(query=sql, dialect='standard'))
    for table_name, sql in count_query_dict.items()
)

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2397.38rows/s]
Downloading: 100%|██████████| 997/997 [00:00<00:00, 5004.08rows/s]
Downloading: 100%|██████████| 816/816 [00:00<00:00, 2005.81rows/s]
Downloading: 100%|██████████| 994/994 [00:00<00:00, 2582.62rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 4779.89rows/s]
Downloading: 100%|██████████| 991/991 [00:00<00:00, 6025.67rows/s]
Downloading: 100%|██████████| 998/998 [00:00<00:00, 1968.27rows/s]
Downloading: 100%|██████████| 998/998 [00:00<00:00, 5228.43rows/s]


In [141]:
import itertools

In [4]:
# run a full test extraction
# (bin_left, bin_right) day offsets, one pair per bin; the last pair spans
# 100 * 365 days and therefore overlaps the others.
_bin_edges = [
    (-30, -1),
    (-90, -31),
    (-180, -91),
    (-365, -181),
    (-365 * 100, -1),
]
# One {'bin_left': ..., 'bin_right': ...} record per bin, in the order above.
time_bins = [
    {'bin_left': left_edge, 'bin_right': right_edge}
    for left_edge, right_edge in _bin_edges
]

In [5]:
# Show the list of bin records built above.
time_bins

[{'bin_left': -30, 'bin_right': -1},
 {'bin_left': -90, 'bin_right': -31},
 {'bin_left': -180, 'bin_right': -91},
 {'bin_left': -365, 'bin_right': -181},
 {'bin_left': -36500, 'bin_right': -1}]

In [153]:
# One count query per (time bin, source table) pair. Time bins vary slowest, so the
# list holds every table for the first bin, then every table for the second, etc.
queries = [
    count_query.format_map({**param_dict, **time_bin, **table_keys})
    for time_bin in time_bins
    for table_keys in table_keys_dict.values()
]

In [155]:
# Run every query in order, keyed by its position in `queries`. The dict is filled
# inside the loop so results fetched before an interruption are retained.
result_dict = dict()
for i, query in enumerate(queries):
    result_dict[i] = pd.read_gbq(query=query, dialect='standard')

Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2533.01rows/s]
Downloading: 100%|██████████| 997/997 [00:00<00:00, 4275.33rows/s]
Downloading: 100%|██████████| 816/816 [00:00<00:00, 5053.80rows/s]
Downloading: 100%|██████████| 994/994 [00:00<00:00, 2331.60rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 4876.94rows/s]
Downloading: 100%|██████████| 991/991 [00:00<00:00, 5805.85rows/s]
Downloading: 100%|██████████| 998/998 [00:00<00:00, 2630.43rows/s]
Downloading: 100%|██████████| 998/998 [00:00<00:00, 2779.87rows/s]
Downloading: 100%|██████████| 999/999 [00:00<00:00, 5267.89rows/s]
Downloading: 100%|██████████| 996/996 [00:00<00:00, 2599.28rows/s]
Downloading: 100%|██████████| 765/765 [00:00<00:00, 2126.06rows/s]
Downloading: 100%|██████████| 991/991 [00:00<00:00, 4839.22rows/s]
Downloading: 100%|██████████| 1000/1000 [00:00<00:00, 2620.60rows/s]
Downloading: 100%|██████████| 984/984 [00:00<00:00, 2897.15rows/s]
Downloading: 100%|██████████| 993/993 [00:00<00:00, 2630

KeyboardInterrupt: 

Getting demographics

In [21]:


# SQL template: one categorical demographic concept per cohort row, read from the
# person table. Kept under its own name so that formatting it does not destroy it.
demographics_query_template = """
    SELECT 
        {row_id}, 
        t1.person_id, 
        {concept_id} AS concept_id, 
        {index_date} AS index_date,
        '{analysis_id}' as analysis_id
    FROM {schema}.person t1
    INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
        t1.person_id = t2.person_id
"""

# Per-demographic substitutions: the person-table column and the analysis label.
demographics_keys_dict = {
    'gender': {
        'concept_id': 'gender_concept_id',
        'analysis_id': 'gender'
    },
    'race': {
        'concept_id': 'race_concept_id',
        'analysis_id': 'race'
    },
    'ethnicity': {
        'concept_id': 'ethnicity_concept_id',
        'analysis_id': 'ethnicity'
    }
}

# Format the template for every demographic (gender, race, ethnicity), following
# the `*_query_dict` pattern used for the other templates in this notebook.
demographics_query_dict = {
    key: demographics_query_template.format_map({**param_dict, **value})
    for key, value in demographics_keys_dict.items()
}

# Backward compatibility: `demographics_query` still ends up holding the formatted
# gender query, exactly as before.
demographics_query = demographics_query_dict['gender']

# SQL template: age at the index date as a DATE_DIFF in YEAR units, bucketed into
# groups of `age_bin_size` years and emitted as 'age_group_<bucket>'.
age_query = """
    SELECT *,
    CONCAT('age_group_', CAST(FLOOR(SAFE_DIVIDE(age_in_years, CAST({age_bin_size} AS INT64))) AS STRING)) AS concept_id
    FROM (
        SELECT {row_id}, t1.person_id, '{analysis_id}' as analysis_id, {index_date} AS index_date,
        DATE_DIFF(CAST({index_date} AS DATE), CAST(t1.birth_datetime AS DATE), YEAR) AS age_in_years
        FROM {schema}.person t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
    )
"""

age_keys_dict = {
    'age_group': {
        'analysis_id': 'age_group',
        'age_bin_size': 5
    }
}
# `query` is what the next cell downloads.
query = age_query.format_map({**param_dict, **age_keys_dict['age_group']})

In [22]:
# Download the result of the query currently held in `query`.
df = pd.read_gbq(query=query, dialect='standard')

Downloading: 100%|██████████| 271460/271460 [00:15<00:00, 17413.73rows/s]


In [34]:
### Measurement range query
# SQL template: counts of measurements per cohort row and time bin, with each
# measurement concept suffixed by whether the value is above range_high, below
# range_low, or neither. Rows lacking a value or either range bound are excluded.
measurement_range_query = """
    SELECT 
        {row_id}, person_id, index_date, concept_id, analysis_id, time_bin, 
        COUNT(*) AS concept_count
    FROM (
        SELECT 
            {row_id},
            t1.person_id,
            CAST(t2.{index_date} AS DATE) as index_date, 
            CAST(measurement_date AS DATE) AS concept_date,
            '{analysis_id}' AS analysis_id,
            CONCAT('bin_', {bin_left}, '_', {bin_right}) AS time_bin,
            CONCAT(
                CAST(measurement_concept_id AS STRING),
                '_',
                CASE
                    WHEN value_as_number > range_high THEN 'abnormal_high'
                    WHEN value_as_number < range_low THEN 'abnormal_low'
                    ELSE 'normal'
                END
            ) AS concept_id
        FROM {schema}.measurement t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.concept AS t3 ON
            t1.measurement_concept_id = t3.concept_id
        WHERE 
            CAST(measurement_date AS DATE) BETWEEN 
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_left} DAY) AND
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_right} DAY)
            AND standard_concept = 'S'
            AND value_as_number is NOT NULL
            AND range_low is NOT NULL
            AND range_high is NOT NULL
        {limit_str}
    )
    GROUP BY {row_id}, person_id, index_date, concept_id, analysis_id, time_bin
"""
# Format for the 30-day window that ends the day before the index date, then download.
query = measurement_range_query.format_map(
    dict(param_dict, analysis_id='measurement_range', bin_left=-30, bin_right=-1)
)
df = pd.read_gbq(query=query, dialect='standard')

Downloading: 100%|██████████| 992/992 [00:00<00:00, 2271.10rows/s]


In [35]:
# Show the current contents of `df`.
df

Unnamed: 0,prediction_id,person_id,index_date,concept_id,analysis_id,time_bin,concept_count
0,81430,30380552,2012-02-11,3018677_measurement_normal,measurement_range_group,bin_-30_-1,1
1,54675,30237770,2016-05-29,3034426__abnormal_high,measurement_range_group,bin_-30_-1,1
2,192195,30980336,2008-11-07,3018677_measurement_normal,measurement_range_group,bin_-30_-1,1
3,233192,31219337,2011-05-03,3018677_measurement_normal,measurement_range_group,bin_-30_-1,1
4,276112,31448981,2010-12-04,3034426_measurement_normal,measurement_range_group,bin_-30_-1,1
...,...,...,...,...,...,...,...
987,485078,32582026,2012-12-11,3022217_measurement_normal,measurement_range_group,bin_-30_-1,1
988,22365,30051726,2014-08-27,3034426_measurement_normal,measurement_range_group,bin_-30_-1,1
989,254014,31324831,2012-06-21,3022217_measurement_normal,measurement_range_group,bin_-30_-1,1
990,292415,31529007,2015-09-18,3034426__abnormal_high,measurement_range_group,bin_-30_-1,1


In [58]:
## Note NLP
# SQL template: counts of note_nlp concepts (suffixed with term_exists) per cohort
# row, restricted to notes dated inside the time bin. feature_id concatenates the
# concept, the time bin and the analysis label.
note_nlp_query = """
    SELECT 
        *, 
        CONCAT(concept_id, '_', time_bin, '_', analysis_id) AS feature_id,
        COUNT(*) AS concept_count
    FROM (
        SELECT 
            {row_id},
            t2.person_id,
            CAST(t2.{index_date} AS DATE) as index_date, 
            CONCAT(CAST(note_nlp_concept_id AS STRING), '_', term_exists) AS concept_id,
            '{analysis_id}' AS analysis_id,
            CONCAT('bin_', {bin_left}, '_', {bin_right}) AS time_bin
        FROM {schema}.note t1
        INNER JOIN {rs_schema}.{cohort_name} AS t2 ON
            t1.person_id = t2.person_id
        INNER JOIN {schema}.note_nlp AS t3 ON
            t1.note_id = t3.note_id
        INNER JOIN {schema}.concept AS t4 ON
            t3.note_nlp_concept_id = t4.concept_id       
        WHERE 
            CAST(note_date AS DATE) BETWEEN 
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_left} DAY) AND
                DATE_ADD(CAST(t2.{index_date} AS DATE), INTERVAL {bin_right} DAY)
            AND standard_concept = 'S'
        {limit_str}
    )
    GROUP BY {row_id}, person_id, index_date, concept_id, analysis_id, time_bin
"""
# Format for the 30-day window that ends the day before the index date.
query = note_nlp_query.format_map(
    dict(param_dict, analysis_id='note_nlp', bin_left=-30, bin_right=-1)
)

In [59]:
# Download the query held in `query`, using the BigQuery Storage API.
df = pd.read_gbq(
    query=query,
    dialect='standard',
    use_bqstorage_api=True,
)

Downloading: 100%|██████████| 740/740 [00:00<00:00, 983.22rows/s]


In [60]:
# Show the current contents of `df`.
df

Unnamed: 0,prediction_id,person_id,index_date,concept_id,analysis_id,time_bin,feature_id,concept_count
0,120308,30587677,2017-02-17,36310692_Y,note_nlp,bin_-30_-1,36310692_Y_bin_-30_-1_note_nlp,1
1,120308,30587677,2017-02-17,4118465_Y,note_nlp,bin_-30_-1,4118465_Y_bin_-30_-1_note_nlp,1
2,120308,30587677,2017-02-17,4116858_Y,note_nlp,bin_-30_-1,4116858_Y_bin_-30_-1_note_nlp,2
3,120308,30587677,2017-02-17,4123549_Y,note_nlp,bin_-30_-1,4123549_Y_bin_-30_-1_note_nlp,1
4,120308,30587677,2017-02-17,4021813_Y,note_nlp,bin_-30_-1,4021813_Y_bin_-30_-1_note_nlp,1
...,...,...,...,...,...,...,...,...
735,140624,30696609,2017-04-16,4116872_Y,note_nlp,bin_-30_-1,4116872_Y_bin_-30_-1_note_nlp,1
736,140624,30696609,2017-04-16,4090419_Y,note_nlp,bin_-30_-1,4090419_Y_bin_-30_-1_note_nlp,1
737,140624,30696609,2017-04-16,4263689_Y,note_nlp,bin_-30_-1,4263689_Y_bin_-30_-1_note_nlp,1
738,140624,30696609,2017-04-16,4300360_Y,note_nlp,bin_-30_-1,4300360_Y_bin_-30_-1_note_nlp,1


In [8]:
import pandas as pd
import dask
import dask.dataframe as dd
dask.config.set(scheduler='threads')
import os
import glob

In [3]:
# Get a vocabulary
# Root directory that is searched recursively for '*.parquet' feature files by the
# cells below. NOTE(review): absolute cluster path — adjust when running elsewhere.
the_path = '/share/pi/nigam/spfohl/cohorts/admissions/starr_20200404/features_by_analysis'
# paths = glob.glob(os.path.join(the_path, '**', '*.parquet'), recursive=True)

In [36]:
def get_vocab(the_path):
    """Build the feature vocabulary from the parquet files under ``the_path``.

    Every ``*.parquet`` file found recursively below ``the_path`` is read for its
    ``feature_id`` column only. The unique feature ids are numbered consecutively
    from 0 in order of first appearance.

    Args:
        the_path: Directory to search recursively for parquet files.

    Returns:
        pandas.DataFrame with columns ``col_id`` (0..n-1) and ``feature_id``.

    Raises:
        ValueError: If no parquet file is found under ``the_path``.
    """
    # Sort the paths so the col_id assignment does not depend on the order in which
    # the filesystem happens to list the files.
    paths = sorted(glob.glob(os.path.join(the_path, '**', '*.parquet'), recursive=True))
    if not paths:
        raise ValueError('No parquet files found under {!r}'.format(the_path))
    # A plain list is enough: ignore_index=True discards any per-file keys.
    per_file_ids = [
        pd.read_parquet(filename, columns=['feature_id']).drop_duplicates()
        for filename in paths
    ]
    vocab = (
        pd.concat(per_file_ids, ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
        .rename_axis('col_id')
        .reset_index()
    )
    return vocab

def get_vocab2(the_path):
    """Dask variant of the vocabulary builder.

    Reads the ``feature_id`` column of every ``*.parquet`` file found recursively
    below ``the_path`` with dask, de-duplicates, computes the result to pandas and
    numbers the unique feature ids consecutively from 0.

    Args:
        the_path: Directory to search recursively for parquet files.

    Returns:
        pandas.DataFrame with columns ``col_id`` (0..n-1) and ``feature_id``.

    Raises:
        ValueError: If no parquet file is found under ``the_path``.
    """
    # Sort the paths so the col_id assignment does not depend on the order in which
    # the filesystem happens to list the files.
    paths = sorted(glob.glob(os.path.join(the_path, '**', '*.parquet'), recursive=True))
    if not paths:
        raise ValueError('No parquet files found under {!r}'.format(the_path))
    per_file_ids = [
        dd.read_parquet(filename, columns=['feature_id']).drop_duplicates()
        for filename in paths
    ]
    # compute() comes before the index is reset, so col_id is assigned on the final
    # pandas frame.
    vocab = (
        dd.concat(per_file_ids)
        .drop_duplicates()
        .compute()
        .reset_index(drop=True)
        .rename_axis('col_id')
        .reset_index()
    )
    return vocab

In [37]:
# Build the vocabulary with both implementations so their outputs can be compared.
vocab = get_vocab(the_path=the_path)
vocab2 = get_vocab2(the_path=the_path)

In [39]:
# Show the vocabulary returned by get_vocab.
vocab

Unnamed: 0,col_id,feature_id
0,0,0_gender
1,1,8532_gender
2,2,2000001568_bin_-36500_-1_device_exposure
3,3,2000001019_bin_-36500_-1_device_exposure
4,4,2000002535_bin_-36500_-1_device_exposure
...,...,...
41864,41864,3016723_abnormal_high_bin_-180_-91_measurement...
41865,41865,3017501_abnormal_high_bin_-180_-91_measurement...
41866,41866,3022217_abnormal_high_bin_-180_-91_measurement...
41867,41867,3034426_abnormal_high_bin_-180_-91_measurement...


In [38]:
# Show the vocabulary returned by get_vocab2.
vocab2

Unnamed: 0,col_id,feature_id
0,0,0_gender
1,1,8532_gender
2,2,2000001568_bin_-36500_-1_device_exposure
3,3,2000001019_bin_-36500_-1_device_exposure
4,4,2000002535_bin_-36500_-1_device_exposure
...,...,...
41864,41864,3016723_abnormal_high_bin_-180_-91_measurement...
41865,41865,3017501_abnormal_high_bin_-180_-91_measurement...
41866,41866,3022217_abnormal_high_bin_-180_-91_measurement...
41867,41867,3034426_abnormal_high_bin_-180_-91_measurement...


In [24]:
# Read the count columns of every parquet file under `the_path` with dask,
# concatenate them, and attach each row's `col_id` by merging with `vocab`.
paths = glob.glob(os.path.join(the_path, '**', '*.parquet'), recursive=True)
table_df = dd.concat(
    [
        dd.read_parquet(
            path,
            columns=['prediction_id', 'person_id', 'feature_id', 'concept_count'],
        )
        for path in paths
    ],
    interleave_partitions=True,
).merge(vocab)

In [22]:
# Map every distinct prediction_id to a consecutive row number for the feature
# matrix.
#
# The ids are read from the dask frame `table_df`. The `df` referenced previously
# is, in top-to-bottom order, a pandas result of an earlier BigQuery download and
# has no `.compute()`. The frame is computed to pandas *before* the index is reset,
# because resetting the index on a dask frame restarts the numbering in every
# partition.
features_row_id_map = (
    table_df[['prediction_id']]
    .drop_duplicates()
    .compute()
    .reset_index(drop=True)
    .rename_axis('features_row_id')
    .reset_index()
)

In [23]:
# Show the prediction_id -> features_row_id mapping.
features_row_id_map

Unnamed: 0,features_row_id,prediction_id
0,0,323016
1,1,323019
2,2,201476
3,3,323015
4,4,29451
...,...,...
115929,115929,70427
115930,115930,416588
115931,115931,80706
115932,115932,218084


In [25]:
# Attach the row numbers; the merge keys are the columns the two frames share.
table_df = table_df.merge(features_row_id_map, how='inner')

In [27]:
# Index the table by prediction_id.
# NOTE(review): the sparse-matrix construction further down reads only the
# concept_count, features_row_id and col_id columns — confirm this index is needed.
table_df = table_df.set_index('prediction_id')

In [28]:
import scipy.sparse as sp

In [29]:
# Evaluate the dask frame and keep the computed result under the same name.
table_df = table_df.compute()

In [32]:
# Assemble the sparse feature matrix: one row per features_row_id, one column per
# col_id, with concept_count as the stored value.
#
# The shape is pinned to the row map and the vocabulary instead of being inferred
# from the largest index present, so the columns always line up with `vocab` even
# if the highest-numbered features happen to be absent from `table_df`.
features = sp.csr_matrix(
    (table_df.concept_count, (table_df['features_row_id'], table_df['col_id'])),
    shape=(len(features_row_id_map), len(vocab)),
)

In [33]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
features

<115934x41869 sparse matrix of type '<class 'numpy.int64'>'
	with 476253 stored elements in Compressed Sparse Row format>

In [None]:
# df_dict = pd.concat({
#     filename: pd.read_parquet(filename, columns = ['feature_id']).drop_duplicates()
#     for filename in paths
# }, ignore_index=True).drop_duplicates().reset_index(drop=True).rename_axis('col_id').reset_index()

In [81]:
# vocabulary = pd.concat(df_dict, ignore_index=True).drop_duplicates().reset_index(drop=True).rename_axis('col_id').reset_index()

In [82]:
# vocabulary

Unnamed: 0,col_id,feature_id
0,0,0_gender
1,1,8532_gender
2,2,8507_gender
3,3,2000002537_bin_-36500_-1_device_exposure
4,4,2000002535_bin_-36500_-1_device_exposure
...,...,...
393512,393512,3032370_abnormal_high_bin_-180_-91_measurement...
393513,393513,3007461_normal_bin_-180_-91_measurement_range
393514,393514,3009201_normal_bin_-180_-91_measurement_range
393515,393515,3009744_normal_bin_-180_-91_measurement_range
