# Pharmacy Groups
Group the raven pharmacy data into usable groups based on standard reference tables

**Script**
* [scripts/cld/pharmacy_groups.ipynb](./scripts/cld/pharmacy_groups.ipynb)

**Prior Script(s)**
* [scripts/dc/raven_pharmacy_dedup.ipynb](./scripts/dc/raven_pharmacy_dedup.ipynb)

**Parameters**
* `in/cld/pharmacy_groups.xlsx[ref]`

**Input**
* `dc_rxdedup_final`
  
**Output**
* `cld_phar_grp`

**Review**
* [scripts/cld/pharmacy_groups.html](./scripts/cld/pharmacy_groups.html)
* `out/cld/pld_phar_grp_cnt.xlsx`: Row, patient, and claim counts for every NDC code in the reference table

In [17]:
#Import libraries for this notebook
import pandas as pd  
from drg_connect import Snowflake
import numpy as np
import pickle
from workbook_writer import make_xlsx
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#Load connection variables to connect_dict
with open('../../out/conn/connect_dict.pickle', 'rb') as handle:
    connect_dict = pickle.load(handle)

#Create Eegine to connect to snowflake
snow = Snowflake(role=connect_dict['role'],
                 warehouse=connect_dict['warehouse'],
                 database=connect_dict['database'],
                 schema=connect_dict['schema'])

#Finish engine setup
engine = snow.engine
%load_ext sql_magic
%config SQL.conn_name = 'engine'  #Set the sql_magic connection engine
%config SQL.output_result = True  #Enable output to std out
%config SQL.notify_result = False #disable browser notifications

The sql_magic extension is already loaded. To reload it, use:
  %reload_ext sql_magic


# Reference
Upload a reference table from Excel to Snowflake

**Input**  
* `in/cld/pharmacy_groups.xlsx[ref]`

**Output**  
* `cld_phar_grp_ref`

In [12]:
#Upload reference table from excel to snowflake and review snowflake output
#Every column is read as text (dtype=str); the first 4 rows of the 'ref' sheet are skipped
df = pd.read_excel('../../in/cld/pharmacy_groups.xlsx', sheet_name='ref', skiprows=4, dtype=str)

#Strip white space and make referrable columns uppercase
#Text columns are detected from the dtype with is_string_dtype rather than `dtype == "object"`,
#so the cleanup still runs when pandas stores text in its dedicated string dtype instead of object
df = df.apply(lambda col: col.str.strip().str.upper()
              if pd.api.types.is_string_dtype(col.dtype) else col)

#Upload to snowflake: drop any previous copy first, then show a few rows to confirm the load
snow.drop_table("cld_phar_grp_ref")
snow.upload_dataframe(df,"cld_phar_grp_ref")
snow.select("SELECT * FROM cld_phar_grp_ref LIMIT 5")

DROP TABLE IF EXISTS ref_db.semi_custom.cld_phar_grp_ref;
Table ref_db.semi_custom.cld_phar_grp_ref dropped! (╯°□°）╯︵ ┻━┻
Upload into ref_db.semi_custom.cld_phar_grp_ref successful! ┬──┬◡ﾉ(°-°ﾉ)


Unnamed: 0,class,generic_name,product_name,ndc
0,ANTIDEPRESSANTS,CLOMIPRAMINE HYDROCHLORIDE,CLOMIPRAMINE,63629275601
1,ANTIDEPRESSANTS,CLOMIPRAMINE HYDROCHLORIDE,CLOMIPRAMINE HCL,93095601
2,ANTIDEPRESSANTS,CLOMIPRAMINE HYDROCHLORIDE,CLOMIPRAMINE HCL,93095801
3,ANTIDEPRESSANTS,CLOMIPRAMINE HYDROCHLORIDE,CLOMIPRAMINE HCL,93096001
4,ANTIDEPRESSANTS,CLOMIPRAMINE HYDROCHLORIDE,CLOMIPRAMINE HCL,378302501


# Create Table
Create a small table of cleaned-up pharmacy data, limited to claims whose NDC code appears in the reference table

In [23]:
%%read_sql
--Rebuild cld_phar_grp: deduplicated pharmacy claims tagged with the reference attributes of their NDC
DROP TABLE IF EXISTS cld_phar_grp;
CREATE TRANSIENT TABLE cld_phar_grp AS
    SELECT rx.patient_id,
           rx.claim_id,
           rx.date_of_service,
           ref.class,
           ref.generic_name,
           ref.product_name,
           ref.ndc
      FROM dc_rxdedup_final AS rx
     INNER JOIN cld_phar_grp_ref AS ref
        ON ref.ndc = rx.ndc

Query started at 02:41:57 PM Eastern Daylight Time; Query executed in 0.04 mQuery started at 02:42:00 PM Eastern Daylight Time; Query executed in 0.07 m

Unnamed: 0,status
0,Table CLD_PHAR_GRP successfully created.


In [24]:
%%read_sql
--Basic counts to confirm things look right: total rows plus distinct values per column
SELECT COUNT(*)                        AS row_cnt,
       COUNT(DISTINCT patient_id)      AS pt_cnt,
       COUNT(DISTINCT claim_id)        AS claim_cnt,
       COUNT(DISTINCT date_of_service) AS dt_cnt,
       COUNT(DISTINCT class)           AS class_cnt,
       COUNT(DISTINCT generic_name)    AS generic_cnt,
       COUNT(DISTINCT product_name)    AS product_cnt,
       COUNT(DISTINCT ndc)             AS ndc_cnt
  FROM cld_phar_grp

Query started at 02:42:05 PM Eastern Daylight Time; Query executed in 0.04 m

Unnamed: 0,row_cnt,pt_cnt,claim_cnt,dt_cnt,class_cnt,generic_cnt,product_cnt,ndc_cnt
0,525841,37892,525841,1461,4,20,41,611


In [26]:
%%read_sql
--Review counts by NDC codes
--The LEFT JOIN starts from the reference table so NDC codes with no matching claims are kept with zero counts
--row_cnt counts the join key rx.ndc instead of rx.*: Count(alias.*) skips every row that has a NULL in any
--rx column, which would undercount claims with a missing value; rx.ndc is only NULL for unmatched NDC codes
DROP TABLE IF EXISTS cld_phar_grp_cnt;
CREATE TRANSIENT TABLE cld_phar_grp_cnt AS
SELECT ref.class,
       ref.generic_name,
       ref.product_name,
       ref.ndc,
       Count(rx.ndc) AS row_cnt,
       Count(Distinct rx.patient_id) AS pt_cnt,
       Count(Distinct rx.claim_id) AS claim_cnt
  FROM cld_phar_grp_ref ref
       LEFT JOIN cld_phar_grp rx
              ON ref.ndc = rx.ndc
 GROUP BY ref.class,
          ref.generic_name,
          ref.product_name,
          ref.ndc
 ORDER BY row_cnt desc;

Query started at 02:42:28 PM Eastern Daylight Time; Query executed in 0.03 mQuery started at 02:42:29 PM Eastern Daylight Time; Query executed in 0.05 m

Unnamed: 0,status
0,Table CLD_PHAR_GRP_CNT successfully created.


In [27]:
#Pull the per-NDC counts (highest row count first) and write them to the review workbook
df = snow.select("SELECT * FROM cld_phar_grp_cnt ORDER BY row_cnt DESC")
make_xlsx(
    [df],
    "../../out/cld/pld_phar_grp_cnt.xlsx",
    "Pharmacy Group Counts by NDC",
    ["pld_phar_grp_cnt"],
    ["Pharmacy Counts"],
    ["Counts by rows, patients and claims"],
)