In [4]:
import psycopg2 as pg
import pandas as pd

# Database setup
host = "localhost"
database = "cdm"
user = "postgres"
password = %env PGPASSWORD
connection_string = "host={} dbname={} user={} password={}".format(host, database, user, password)

db = pg.connect(connection_string)

# Basic Row Counts

Get approximate row counts from the `pg_stat_user_tables` view (`n_live_tup` is an estimate of live rows, so the numbers are not 100% accurate).

In [5]:
# Estimated live-row count per user table, largest first; tables reporting zero rows are left out.
row_counts_sql = 'select relname as "Table", n_live_tup "Row Count" from pg_stat_user_tables where n_live_tup > 0 order by n_live_tup desc'
pd.read_sql(row_counts_sql, con=db)

Unnamed: 0,Table,Row Count
0,drug_labels,4538
1,heracles_analysis,185
2,sec_role_permission,110
3,sec_permission,65
4,schema_version,46
5,concept_of_interest,42
6,sec_role,6
7,cohort_study,6
8,source_daimon,3
9,batch_job_execution_params,2


# Identify Diabetes Patients

Try to figure out who has had diabetes. Start by finding the concepts that represent diabetes.

In [4]:
# Count every concept whose name contains "Diabetes" or "diabetes", regardless of domain.
diabetes_name_count_sql = "SELECT count(*) FROM concept WHERE concept_name LIKE '%Diabetes%' OR concept_name LIKE '%diabetes%'"
pd.read_sql(diabetes_name_count_sql, con=db)

Unnamed: 0,count
0,1121


We are actually only looking for conditions that are clinical findings, so we can do better:

In [5]:
# Count diabetes concepts restricted to conditions classified as clinical findings.
# The query returns a single aggregate row, so it carries no LIMIT clause
# (a LIMIT on a one-row count(*) result has no effect and only misleads readers).
diabetes_concepts_query = """
SELECT
  count(*)
FROM
  concept
WHERE
  (
    concept_name LIKE '%Diabetes%'
    OR concept_name LIKE '%diabetes%'
  )
  AND domain_id = 'Condition'
  AND concept_class_id = 'Clinical Finding';"""

pd.read_sql(diabetes_concepts_query, con=db)

Unnamed: 0,count
0,605


With this limited subset of concepts, we can count the number of people that have ever had a condition matching one of those codes as follows:

In [6]:
# Count distinct people with at least one condition occurrence whose concept is
# a diabetes clinical finding.
# The concept subquery is deliberately unbounded: a LIMIT without ORDER BY would
# keep an arbitrary subset of concepts once more than that many matched, and the
# patient count would silently be too low.
diabetes_patient_count = """
SELECT
  COUNT( DISTINCT person_id )
FROM
  condition_occurrence
WHERE
  condition_concept_id IN(
    SELECT
      concept_id
    FROM
      concept
    WHERE
      (
        concept_name LIKE '%Diabetes%'
        OR concept_name LIKE '%diabetes%'
      )
      AND domain_id = 'Condition'
      AND concept_class_id = 'Clinical Finding'
  );"""

pd.read_sql(diabetes_patient_count, con=db)

Unnamed: 0,count
0,81051


Using this simple count query, we can still discover if a person (e.g. the person with `person_id = 2`) has diabetes or not:

In [7]:
# Count distinct people with a diabetes clinical-finding condition, excluding
# person_id = 2. Comparing this figure with the unrestricted count shows whether
# that one person belongs to the diabetes group.
# The concept subquery is deliberately unbounded: a LIMIT without ORDER BY could
# select a different arbitrary subset of concepts on each run, which would make
# the comparison between the two counts unreliable.
diabetes_patient_count = """
SELECT
  COUNT( DISTINCT person_id )
FROM
  condition_occurrence
WHERE
  condition_concept_id IN(
    SELECT
      concept_id
    FROM
      concept
    WHERE
      (
        concept_name LIKE '%Diabetes%'
        OR concept_name LIKE '%diabetes%'
      )
      AND domain_id = 'Condition'
      AND concept_class_id = 'Clinical Finding'
  ) AND person_id != 2;"""

pd.read_sql(diabetes_patient_count, con=db)

Unnamed: 0,count
0,81050


Since the number returned is one fewer, we can deduce that the person with `person_id = 2` does have diabetes!