# Contingency Tables (still not sure what that means)

#### Connect to the Database

In [1]:
import psycopg2 as pg
import pandas as pd

# Database setup
host = "localhost"
database = "cdm"
user = "postgres"
password = %env PGPASSWORD
connection_string = "host={} dbname={} user={} password={}".format(host, database, user, password)

db = pg.connect(connection_string)

## GROUP BY queries

### Heart Failure by Gender and Birth Decade

In [2]:
query = """
    SELECT
        extract('year' FROM date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth))) AS birth_decade,
        CASE p.gender_concept_id WHEN 8532 THEN 'Female' ELSE 'Male' END AS gender,
        COUNT(*)
    FROM person p, condition_occurrence con_oc, concept con
    WHERE p.person_id = con_oc.person_id 
        AND con_oc.condition_concept_id = con.concept_id
        AND (con.concept_name LIKE '%Heart Failure%' OR
            con.concept_name LIKE '%heart failure%')
        AND con.domain_id = 'Condition'
        AND con.concept_class_id='Clinical Finding'
    GROUP BY
        date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth)),
        p.gender_concept_id
    ORDER BY
        birth_decade,
        gender;"""
results = pd.read_sql(query, con=db)
df = results.pivot(index='gender', columns='birth_decade', values='count')

df.columns.names = ['Birth Decade']
df.index.names = ['Gender']

heart_failure = df
heart_failure

Birth Decade,1900.0,1910.0,1920.0,1930.0,1940.0,1950.0,1960.0,1970.0,1980.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,2988,33350,99322,113796,59256,18212,11294,4540,1392
Male,1156,14692,66048,93864,53006,18256,10334,4942,1152


### HIV by Gender and Birth Decade

In [3]:
query = """
SELECT
  extract('year' FROM date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth))) AS birth_decade,
  CASE p.gender_concept_id WHEN 8532 THEN 'Female' ELSE 'Male' END AS gender,
  COUNT(*)
FROM
  condition_occurrence co,
  person p
WHERE
  co.person_id = p.person_id AND
  condition_concept_id = '4241530'
GROUP BY 
  date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth)),
  p.gender_concept_id
ORDER BY
  birth_decade,
  gender;"""
results = pd.read_sql(query, con=db)
df = results.pivot(index='gender', columns='birth_decade', values='count')
df.columns.names = ['Birth Decade']
df.index.names = ['Gender']

HIV = df
HIV

Birth Decade,1900.0,1910.0,1920.0,1930.0,1940.0,1950.0,1960.0,1970.0,1980.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,12.0,164.0,442.0,604.0,374.0,110.0,128.0,38.0,16.0
Male,,70.0,262.0,476.0,266.0,164.0,92.0,48.0,14.0


### Diabetes by Gender and Age

In [4]:
query = """
    SELECT
        extract('year' FROM date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth))) AS birth_decade,
        CASE p.gender_concept_id WHEN 8532 THEN 'Female' ELSE 'Male' END AS gender,
        COUNT(*)
    FROM person p, condition_occurrence con_oc, concept con
    WHERE p.person_id = con_oc.person_id 
        AND con_oc.condition_concept_id = con.concept_id
        AND con.concept_class_id='Clinical Finding'
        AND con_oc.condition_concept_id=con.concept_id
        AND (con.concept_name LIKE '%Diabetes%' OR
            con.concept_name LIKE '%diabetes%')
        AND con.domain_id = 'Condition'
    GROUP BY
        date_trunc('decade', make_date(p.year_of_birth, p.month_of_birth, p.day_of_birth)),
        p.gender_concept_id
    ORDER BY
        birth_decade,
        gender;"""
results = pd.read_sql(query, con=db)
df = results.pivot(index='gender', columns='birth_decade', values='count')

df.columns.names = ['Birth Decade']
df.index.names = ['Gender']

diabetes = df
diabetes

Birth Decade,1900.0,1910.0,1920.0,1930.0,1940.0,1950.0,1960.0,1970.0,1980.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,7450,83022,259308,336062,181384,50166,29618,14730,4270
Male,2316,32368,163328,260810,154894,48180,29416,13820,3994


In [33]:
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

# This is to use ployly offline
init_notebook_mode(connected=True)

iframe = None

years = list(diabetes.columns)
female_diabetics = diabetes.T['Female']
male_diabetics = diabetes.T['Male']


def age_array(df):
    temp = []
    for age,count in zip(df['Birth Decade'], df['count']):
        for i in range(int(count)):
            temp.append(age)
    return temp




#age_array(diabetes.T['Male'])
query = """
#Original data

female = go.Bar(x=years, y=female_diabetics, name="Female Diabetics", marker=dict(
        color='#FFA500'), opacity=0.75)
male = go.Bar(x=years, y=male_diabetics, name="Male Diabetics", marker=dict(
        color='#0000FF'), opacity=0.75)

layout = go.Layout(
    title='Age Distribution of Male Diabetics vs Female Diabetics', 
    xaxis={'title':'Age', 'tickangle': 300, 'exponentformat': 'none'}, 
    yaxis={'title':'Occurences'},
    showlegend=True,
    bargap=0.1,
    barmode='overlay')

data_all = [male, female]

fig = go.Figure(data=data_all, layout=layout)

iplot(fig, filename='overlaid histogram')"""

[2316, 32368, 163328, 260810, 154894, 48180, 29416, 13820, 3994]

### Drugs by Gender

In [19]:
query = """


SELECT
  c.concept_name,
  CASE p.gender_concept_id WHEN 8532 THEN 'Female' ELSE 'Male' END AS gender,
  count(*) AS total
FROM
  person p,
  drug_exposure de,
  top_drugs td,
  concept c
WHERE
  c.concept_id = de.drug_concept_id AND
  de.drug_concept_id = td.concept_id AND
  de.person_id = p.person_id
GROUP BY
  c.concept_name,
  gender
ORDER BY 
  c.concept_name,
  gender;
"""

#results = pd.read_sql(query, con=db)
#results