# Predict the Number of Days' Supply of Atorvastatin Prescribed by an HCP in 2019

References:
- https://www.udemy.com/course/statistics-for-data-science-and-business-analysis/learn/lecture/8742394#reviews
- https://www.cms.gov/newsroom/data
- https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug/data
- https://data.cms.gov/resources/medicare-part-d-prescribers-by-provider-and-drug-data-dictionary
- https://clincalc.com/DrugStats/Top300Drugs.aspx

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.iv import IV2SLS
from sklearn.preprocessing import OneHotEncoder

In [2]:
def read_file_filter_therapy(input_csv,drug_name_str):
    """
        Return a dataframe limited to the therapy under study.

        Parameters:
            input_csv: path or file-like object accepted by pandas.read_csv;
                the data must contain a 'Gnrc_Name' column.
            drug_name_str: pattern searched for inside 'Gnrc_Name'. It is
                interpreted as a regular expression (the str.contains
                default) and matched case-insensitively.

        Returns:
            A new dataframe holding only the matching rows, with
            'Gnrc_Name' stripped of surrounding whitespace and lower-cased.
    """
    df = pd.read_csv(input_csv)
    df['Gnrc_Name'] = df['Gnrc_Name'].str.strip().str.lower()

    # case=False: the column was lower-cased above, so a mixed-case search
    # term such as "Atorvastatin" must still match.
    # na=False: rows without a generic name count as non-matches instead of
    # putting NaN into the boolean mask, which pandas refuses to index with.
    matches = df['Gnrc_Name'].str.contains(drug_name_str, case=False, na=False)

    # Return an independent copy so later steps can modify it safely.
    return df[matches].copy()

def pre_process(df):
    """
        Return a new, cleaned dataframe; the input dataframe is not modified.

        The result contains only rows:
            - Without NaN's in any column
            - Whose 'Prscrbr_NPI' occurs exactly once (every row of an NPI
              that appears more than once is removed)
            - With 'GE65_Tot_Benes' greater than 0
            - Whose 'Prscrbr_State_Abrvtn' is not one of the codes 'XX'/'ZZ'

        Parameters:
            df: dataframe with at least the columns 'Prscrbr_NPI',
                'GE65_Tot_Benes' and 'Prscrbr_State_Abrvtn'.

        Returns:
            The filtered dataframe, as an independent copy.
    """
    # Non in-place calls: the caller's dataframe must stay untouched.
    cleaned = df.dropna()

    # keep=False discards *all* rows of a repeated NPI rather than keeping
    # the first one, so only prescribers with a single record survive.
    cleaned = cleaned.drop_duplicates(subset=['Prscrbr_NPI'], keep=False)

    cleaned = cleaned[cleaned['GE65_Tot_Benes'] > 0]
    cleaned = cleaned[~cleaned['Prscrbr_State_Abrvtn'].isin(['XX','ZZ'])]

    return cleaned.copy()

def spec_prim_care_pys(df):
    """
        Return a copy of the dataframe in which 'Prscrbr_Type' is set to
        'Primary Care Physician' for every row whose specialty is one of:
        'Family Practice', 'Internal Medicine', 'General Practice'.

        All other rows and columns are returned unchanged, and the input
        dataframe itself is not modified.
    """
    primary_care_specialties = ['Family Practice','Internal Medicine','General Practice']

    # Work on a copy: the input may be a filtered slice of another frame, and
    # assigning into it would modify the caller's data and raise
    # chained-assignment warnings.
    relabelled = df.copy()
    is_primary_care = relabelled['Prscrbr_Type'].isin(primary_care_specialties)
    relabelled.loc[is_primary_care, 'Prscrbr_Type'] = 'Primary Care Physician'

    return relabelled

def select_df_for_analysis(df):
    """
        Return the subset of the dataframe used in the analysis:
            - rows whose 'Prscrbr_Type' is 'Primary Care Physician',
              'Cardiology' or 'Geriatric Medicine'
            - only the state, prescriber type, claims, drug cost,
              beneficiary counts and day-supply columns
    """
    specialties_in_scope = ['Primary Care Physician','Cardiology','Geriatric Medicine']
    analysis_columns = [
        'Prscrbr_State_Abrvtn',
        'Prscrbr_Type',
        'Tot_Clms',
        'Tot_Drug_Cst',
        'Tot_Benes',
        'GE65_Tot_Benes',
        'Tot_Day_Suply',
    ]

    # Row filter and column projection done in a single selection.
    in_scope = df['Prscrbr_Type'].isin(specialties_in_scope)
    return df.loc[in_scope, analysis_columns]

def main(input_filename,drug_name):
    """
        Run the pre-processing pipeline for one drug and return the
        dataframe prepared for analysis.

        Steps, in order: read the CSV and keep the rows of the requested
        drug, clean the rows, group the primary-care specialties under one
        label, then keep only the specialties and columns under study.

        Parameters:
            input_filename: CSV file handed to read_file_filter_therapy.
            drug_name: drug name pattern handed to read_file_filter_therapy.

        Returns:
            The dataframe produced by select_df_for_analysis.
    """

    #Pre-process
    df_drug_selection=read_file_filter_therapy(input_filename,drug_name)
    df_pre_proc=pre_process(df_drug_selection)
    df_spec=spec_prim_care_pys(df_pre_proc)
    df_analysis=select_df_for_analysis(df_spec)

    # Hand the result back so that running the pipeline has a usable outcome
    # instead of discarding the prepared data.
    return df_analysis

In [4]:
if __name__ == "__main__":
    # Run the pipeline on the 2019 Medicare Part D prescriber file for atorvastatin.
    main(
        "input/Medicare_Part_D_Prescribers_by_Provider_and_Drug_2019.csv",
        "atorvastatin",
    )