In [8]:
import os

import pandas as pd
import psycopg2 as pg

# Database setup
# Connection settings for the PostgreSQL database queried by this notebook.
host = "localhost"
database = "cdm"
user = "postgres"
# Read the password from the PGPASSWORD environment variable using plain
# Python (no IPython magic); an unset variable raises KeyError right here.
password = os.environ["PGPASSWORD"]

# Let psycopg2 assemble the DSN so that values containing spaces, quotes or
# backslashes are escaped correctly instead of corrupting the string.
connection_string = pg.extensions.make_dsn(
    host=host, dbname=database, user=user, password=password
)

db = pg.connect(connection_string)

# Basic Row Counts

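Get approximate row counts from the `pg_stat_user_tables` statistics view. Its `n_live_tup` column is an estimate of the number of live rows, so the figures are not 100% accurate.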

In [19]:
# Estimated live-row count for every non-empty user table, largest first.
row_counts_sql = (
    'select relname as "Table", n_live_tup "Row Count" '
    'from pg_stat_user_tables '
    'where n_live_tup > 0 '
    'order by n_live_tup desc'
)
df = pd.read_sql(row_counts_sql, con=db)
print(df)

                   Table  Row Count
0         procedure_cost   31821624
1   condition_occurrence   14455883
2   procedure_occurrence   13926500
3   concept_relationship    9254447
4       concept_ancestor    8612675
5          drug_exposure    6303448
6       visit_occurrence    5579533
7              drug_cost    5552376
8            measurement    3704927
9        concept_synonym    3391372
10               concept    2065005
11           observation    1876834
12              provider     635456
13     payer_plan_period     389231
14             care_site     239158
15       device_exposure     224505
16         drug_strength     195736
17                person     116352
18    observation_period     104891
19                 death       5461
20              location       3088
21          relationship        442
22         concept_class        315
23                domain         40
24            vocabulary         27


# Identify Diabetes Patients

Try to figure out who has had diabetes. Start by trying to find the concepts that represent diabetes.

In [20]:
# Count every concept whose name contains "Diabetes" or "diabetes".
diabetes_name_sql = (
    "SELECT count(*) FROM concept "
    "WHERE concept_name LIKE '%Diabetes%' OR concept_name LIKE '%diabetes%'"
)
diabetes_name_counts = pd.read_sql(diabetes_name_sql, con=db)
print(diabetes_name_counts)

   count
0   1121


We are actually only looking for conditions that are clinical findings, so we can do better:

In [21]:
# Count the concepts whose name mentions diabetes and that are clinical
# findings in the Condition domain. An ungrouped count(*) always returns a
# single row, so the query carries no LIMIT clause (it would have no effect
# and would wrongly suggest that the counted rows are capped).
diabetes_concepts_query = """
SELECT
  count(*)
FROM
  concept
WHERE
  (
    concept_name LIKE '%Diabetes%'
    OR concept_name LIKE '%diabetes%'
  )
  AND domain_id = 'Condition'
  AND concept_class_id = 'Clinical Finding';"""

print(pd.read_sql(diabetes_concepts_query, con=db))

   count
0    605


With this limited subset of concepts, we can count the number of people that have ever had a condition matching one of those codes as follows:

In [22]:
# Count the distinct people with at least one condition occurrence whose
# concept is a diabetes-related clinical finding.
# The subquery is deliberately not limited: a LIMIT without ORDER BY would
# keep an arbitrary subset of the matching concepts once their number exceeds
# the limit, silently undercounting patients.
diabetes_patient_count = """
SELECT
  COUNT( DISTINCT person_id )
FROM
  condition_occurrence
WHERE
  condition_concept_id IN(
    SELECT
      concept_id
    FROM
      concept
    WHERE
      (
        concept_name LIKE '%Diabetes%'
        OR concept_name LIKE '%diabetes%'
      )
      AND domain_id = 'Condition'
      AND concept_class_id = 'Clinical Finding'
  );"""

print(pd.read_sql(diabetes_patient_count, con=db))

   count
0  81051


Using this simple count query, we can still discover if a person (e.g. the person with `person_id = 2`) has diabetes or not:

In [24]:
# Same distinct-person count as the full diabetes cohort, but with
# person_id 2 excluded, so the two counts can be compared.
# The subquery is deliberately not limited: a LIMIT without ORDER BY would
# keep an arbitrary subset of the matching concepts once their number exceeds
# the limit, which could change the count for reasons unrelated to person 2.
diabetes_patient_count = """
SELECT
  COUNT( DISTINCT person_id )
FROM
  condition_occurrence
WHERE
  condition_concept_id IN(
    SELECT
      concept_id
    FROM
      concept
    WHERE
      (
        concept_name LIKE '%Diabetes%'
        OR concept_name LIKE '%diabetes%'
      )
      AND domain_id = 'Condition'
      AND concept_class_id = 'Clinical Finding'
  ) AND person_id != 2;"""

print(pd.read_sql(diabetes_patient_count, con=db))

   count
0  81050


Since the number retured is one fewer, we can deduce that the person with `person_id = 2` does have diabe