In [None]:
# prompt: load my google drive
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)
pd.options.mode.copy_on_write = True

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pharmacy_file_path = '/content/drive/MyDrive/Data/All FFS Claims/pde.csv'
pharmacy = pd.read_csv(pharmacy_file_path, sep="|")
# How were columns read in?
col_pharmacy = pd.DataFrame(pharmacy.dtypes, columns=['type'])
col_pharmacy

  pharmacy = pd.read_csv(pharmacy_file_path, sep="|")


Unnamed: 0,type
PDE_ID,int64
BENE_ID,int64
SRVC_DT,object
PD_DT,object
PRSCRBR_ID_QLFYR_CD,int64
PRSCRBR_ID,int64
RX_SRVC_RFRNC_NUM,int64
PROD_SRVC_ID,int64
PLAN_CNTRCT_REC_ID,object
PLAN_PBP_REC_NUM,int64


In [None]:
pharmacy.head()

Unnamed: 0,PDE_ID,BENE_ID,SRVC_DT,PD_DT,PRSCRBR_ID_QLFYR_CD,PRSCRBR_ID,RX_SRVC_RFRNC_NUM,PROD_SRVC_ID,PLAN_CNTRCT_REC_ID,PLAN_PBP_REC_NUM,CMPND_CD,DAW_PROD_SLCTN_CD,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,FILL_NUM,DSPNSNG_STUS_CD,DRUG_CVRG_STUS_CD,ADJSTMT_DLTN_CD,NSTD_FRMT_CD,PRCNG_EXCPTN_CD,CTSTRPHC_CVRG_CD,GDC_BLW_OOPT_AMT,GDC_ABV_OOPT_AMT,PTNT_PAY_AMT,OTHR_TROOP_AMT,LICS_AMT,PLRO_AMT,CVRD_D_PLAN_PD_AMT,NCVRD_PLAN_PD_AMT,TOT_RX_CST_AMT,RX_ORGN_CD,RPTD_GAP_DSCNT_NUM,BRND_GNRC_CD,PHRMCY_SRVC_TYPE_CD,PTNT_RSDNC_CD,SUBMSN_CLR_CD
0,-10602140347,-10000010254618,25-Mar-2015,25-Mar-2015,1,9999987089,-10602140347,68115025030,Z0009,999,0,7,63,63,1,,C,,,,,0.0,0.0,0.0,0,0,55.04,220.15,0.0,275.19,4,0,G,5,1,
1,-10602140348,-10000010254618,27-May-2016,27-May-2016,1,9999999569,-10602140348,53978010903,Z0005,999,0,1,7,7,1,,C,,,,,0.0,0.0,0.0,0,0,484.13,1936.54,0.0,2420.67,0,0,B,1,1,
2,-10602140349,-10000010254618,03-Oct-2016,03-Oct-2016,1,9999997109,-10602140349,55154010000,Z0005,999,0,2,90,90,1,,C,,,,,0.0,0.0,0.0,0,0,240.98,963.93,0.0,1204.91,0,0,B,2,1,
3,-10602140350,-10000010254618,20-Sep-2017,20-Sep-2017,1,9999999569,-10602140350,13107021199,Z0005,999,0,4,40,10,1,,C,,,,,0.0,0.0,0.0,0,0,27.3,109.18,0.0,136.48,3,0,B,1,1,
4,-10602140351,-10000010254618,30-Sep-2017,30-Sep-2017,1,9999999569,-10602140351,13107021199,Z0005,999,0,3,40,10,2,,C,,,,,0.0,0.0,0.0,0,0,27.3,109.18,0.0,136.48,4,0,G,4,1,


Pharmacy data is point of sale data, so available significantly faster than other claims data.  Important fields:
1.  PDE_ID: ID for prescription
2.  BENE_ID: Beneficiary_ID
2.  PRSCRBR_ID: Prescriber NPI after 2013
3.  SRVC_DT: Date prescription was filled
3.  PROD_SRVC_ID: NDC Code in 11 digit format (CMS) FDA: 10 digit
4.  QTY_DSPNSD_NUM: Quantity of the prescription
5.  DAYS_SUPLY_NUM: How many days of the prescription were supplied?   
6.  FILL_NUM: Number of Drug Fills
7.  TOT_RX_CST_AMT: Total Part D Cost (Sum of Several Cost Variables).
8.  PHRMCY_SRVC_TYPE_CD: Type of Pharmacy 01 = Community/Retail




In [None]:
# prompt: filter pharmacy for the columns mentioned in the above text cell

cols_to_keep = ['PDE_ID', 'BENE_ID', 'PRSCRBR_ID', 'SRVC_DT', 'PROD_SRVC_ID', 'QTY_DSPNSD_NUM', 'DAYS_SUPLY_NUM', 'FILL_NUM', 'TOT_RX_CST_AMT', 'PHRMCY_SRVC_TYPE_CD']

# Filter the pharmacy DataFrame
pharmacy2 = pharmacy[cols_to_keep]

pharmacy2['BENE_ID'] = pharmacy2['BENE_ID'].astype(str)
pharmacy2['PDE_ID'] = pharmacy2['PDE_ID'].astype(str)
pharmacy2['PROD_SRVC_ID'] = pharmacy2['PROD_SRVC_ID'].astype(str)
pharmacy2['SRVC_DT'] = pd.to_datetime(pharmacy2['SRVC_DT'], format='%d-%b-%Y')
pharmacy2['PHRMCY_SRVC_TYPE_CD'] = pharmacy2['PHRMCY_SRVC_TYPE_CD'].astype(str)

pharmacy2['YR'] = pharmacy2['SRVC_DT'].dt.year

In [None]:
# prompt: Are there PDE_ID that occur more than once?

pde_counts = pharmacy['PDE_ID'].value_counts()
duplicate_pde_ids = pde_counts[pde_counts > 1]

if duplicate_pde_ids.empty:
  print("No PDE_ID values occur more than once.")
else:
  print("PDE_ID values that occur more than once:")
duplicate_pde_ids

No PDE_ID values occur more than once.


Unnamed: 0_level_0,count
PDE_ID,Unnamed: 1_level_1


In [None]:
# prompt: show me the frequency distribution of the length of PROD_SRVC_ID
pharmacy2['PROD_SRVC_ID'].str.len().value_counts()


Unnamed: 0_level_0,count
PROD_SRVC_ID,Unnamed: 1_level_1
11,342362
9,128887
7,24831
8,14105
5,3767
6,1371
10,197


In [None]:
# prompt: remove records from pharmacy2 where the length of PROD_SRVC_ID is less than 11

print(len(pharmacy2))
pharmacy2 = pharmacy2[pharmacy2['PROD_SRVC_ID'].str.len() >= 11]
print(len(pharmacy2))


515520
342362


In [None]:
pharmacy2['PHRMCY_SRVC_TYPE_CD'].value_counts()

Unnamed: 0_level_0,count
PHRMCY_SRVC_TYPE_CD,Unnamed: 1_level_1
1,49048
4,49010
3,49004
7,49003
2,48920
5,48730
6,48647


01 = Community/retail pharmacy
02 = Compounding pharmacy
03 = Home infusion therapy provider
04 = Institutional pharmacy
05 = Long-term care pharmacy
06 = Mail order pharmacy
07 = Managed care organization (MCO) pharmacy


HUGE Simplying decision.  Only include PHRMCY_SRVC_TYPE_CD	= 1

In [None]:
# prompt: filter pharmacy2 where PHRMCY_SRVC_TYPE_CD	= 1
print(len(pharmacy2))
pharmacy2 = pharmacy2[pharmacy2['PHRMCY_SRVC_TYPE_CD'] == '1']
print(len(pharmacy2))

342362
49048


In [None]:
# prompt: Create a dataframe called prescriptions (BENE_ID, YR, NUM_RX) NUM_RX is the distinct count of PROD_SRVC_ID by BENE_ID, YR

prescriptions = pharmacy2.groupby(['BENE_ID', 'YR'])['PROD_SRVC_ID'].nunique().reset_index()
prescriptions.rename(columns={'PROD_SRVC_ID': 'NUM_RX'}, inplace=True)



In [None]:
# prompt: what is the maximum of NUM_RX

# Find the maximum value of NUM_RX
max_num_rx = prescriptions['NUM_RX'].max()
print(f"The maximum value of NUM_RX is: {max_num_rx}")

The maximum value of NUM_RX is: 42


In [None]:
prescriptions.to_csv('/content/drive/MyDrive/Data/Output Data/prescriptions.csv', index=False)