## Exploratory Data Analysis and Visualization
Here, we do some preliminary data exploration. First, we import packages and do a train/test split on the records with a PA form. We stratify on the results of the PA form (approved/denied).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.axes
from sklearn.model_selection import train_test_split

In [None]:
cmm = pd.read_csv("Data/CMM.csv")

In [None]:
cmm_pa = cmm[cmm['dim_pa_id'].notna()]
cmm_pa_train, cmm_pa_test = train_test_split(cmm_pa, test_size = 0.2, 
                                             random_state = 10475, shuffle = True,
                                            stratify = cmm_pa.pa_approved)

We should only look at cmm_pa_train from this point.

We also compute how is_approved correlates with some other discrete variables. 

## pa_approved vs correct_diagnosis
Let's compute the correlation of pa_approved with correct diagnosis. 

In [None]:
#An auxilliary function to find the percentage of df[column1 == val1] given that df[column2 == val2 ]. 
def percentage_given(df, column1, val1, column2, val2):
    intersection = np.sum(np.logical_and(df[column1] == val1,df[column2] == val2));
    total = np.sum(df[column2] == val2);
    return np.round(100*intersection/total,3);

In [None]:
plt.figure()
sns.histplot(cmm_pa_train, x='correct_diagnosis', hue = 'pa_approved',multiple = 'stack',discrete=True)
plt.xticks(ticks=[0,1], labels=["no","yes"],fontsize=18)
plt.title("Correct Diagnosis and Approval",fontsize = 30)
plt.xlabel("Correct Diagnosis",fontsize=25)
plt.ylabel("Count",fontsize = 25)
plt.legend(["Yes", "No"],title="PA Approved",fontsize=13,title_fontsize=15)


plt.show()

In [None]:
print("The percentage of people whose PA is approved given that their diagnosis is correct is: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'correct_diagnosis',1))

print("The percentage of people whose PA is approved given that their diagnosis is incorrect is: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'correct_diagnosis',0))

## pa_approved vs contraindication
Now we compute the correlation between pa_approved and contraindication

In [None]:
plt.figure()
sns.histplot(cmm_pa_train, x='contraindication', hue = 'pa_approved',multiple = 'stack',discrete=True)
plt.xticks(ticks=[0,1], labels=["no","yes"],fontsize=18)
plt.title("Contraindication and Approval",fontsize = 30)
plt.xlabel("Contraindication",fontsize=25)
plt.ylabel("Count",fontsize = 25)
plt.legend(["Yes", "No"],title="PA Approved",fontsize=13,title_fontsize=15)

plt.show()


In [None]:
print("The percentage of people whose PA is approved given contraindication: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'contraindication',1))

print("The percentage of people whose PA is approved given no contraindication: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'contraindication',0))

## pa_approved vs tried_and_failed


In [None]:
plt.figure(figsize=(10,8))
sns.histplot(cmm_pa_train, x='tried_and_failed', hue = 'pa_approved',multiple = 'stack',discrete=True)
plt.xticks(ticks=[0,1], labels=["no","yes"],fontsize=18)
plt.title("Tried and Failed vs Approval",fontsize = 30)
plt.xlabel("Tried and Failed",fontsize=25)
plt.xlim([-1,2.5])
plt.ylabel("Count",fontsize = 25)
plt.legend(["Yes", "No"],title="PA Approved",fontsize=13,title_fontsize=15)



plt.show()

In [None]:
print("The percentage of people whose PA is approved given that the patient has tried and failed generic alternatives: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'tried_and_failed',1))

print("The percentage of people whose PA is approved given that the patiend has not tried and failed the generic alternative: "
      , percentage_given(cmm_pa_train,'pa_approved',1,'tried_and_failed',0))

## pa_approved vs drug 

In [None]:
plt.figure()
sns.histplot(cmm_pa_train, x='drug', hue = 'pa_approved',multiple = 'stack')
plt.xticks(ticks=[0,1,2], labels=["A","B","C"],fontsize=18)
plt.title("Drug Type and Approval",fontsize = 30)
plt.xlabel("Drug",fontsize=25)
plt.ylabel("Count",fontsize = 25)
plt.legend(["Yes", "No"],title="PA Approved",fontsize=13,title_fontsize=15)


plt.show()

In [None]:
drugs = ['A', 'B' , 'C']
for drug in drugs:
    print("The percentage of people whose PA is approved given that they use drug ", drug, "is : "
      , percentage_given(cmm_pa_train,'pa_approved',1,'drug',drug))

## pa_approved vs bin

In [None]:
#Takes a long time to run based on payer BIN, because it was a numpy int64. We change type to string,
#as there are only 4 payer BINs.

cmm_pa_train.loc[:,'binS']=cmm_pa_train.loc[:,'bin'].astype(str).copy()

In [None]:
#Commented because it takes very long to run
plt.figure()
sns.histplot(cmm_pa_train, x='binS', hue = 'pa_approved',multiple = 'stack')
plt.xticks(fontsize=15)
plt.title("Payer BIN and Approval",fontsize = 30)
plt.xlabel("Payer BIN",fontsize=25)
plt.ylabel("Count",fontsize = 25)
plt.legend(["Yes", "No"],title="PA Approved",fontsize=13,title_fontsize=15)



plt.show()

In [None]:
bins = cmm_pa_train.bin.unique()
for bin in bins:
    print("The percentage of people whose PA is approved given that they use the payer with BIN ", bin, "is : "
      , percentage_given(cmm_pa_train,'pa_approved',1,'bin',bin))