# Patient engagement prediction
Distribution of variables taken from [this study](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6243417/)

In [2]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score


In [11]:
#First time patient (self.first)
#0 = F
#1 = T
#Translator Needed (self.translator)
#0 = F
#1 = T
#Race (self.race)
#0 = Native American/Alaskan Native
#1 = Asian
#2 = Black
#3 = Multiple
#4 = Native Hawaiian
#5 = White
#Ethnicity (self.ethnicity)
#0 = Hispanic/Latino
#1 = Not Hispanic/Latino
#2 = Unspecified
#Gender (self.gender)
#0 = Male
#1 = Female
#Marital Status (self.marital_s)
#0 = Divorced
#1 = Legally Separated
#2 = Married
#3 = Partner
#4 = Single
#5 = Widow
#Cell phone ownership (self.cell)
#0 = No
#1 = Yes
#Email availability (self.email)
#0 = No
#1 = Yes
#Using patient portal (self.portal)
#0 = No
#1 = Yes
#Employment status (self.employment)
#0 = Employed full-time
#1 = Part time
#2 = Not Employed
#3 = Retired
#4 = Self employed
#Insurance (self.insurance)
#0 = Commercial
#1 = Marketplace
#2 = Medicaid
#3 = Medicare
#4 = Self pay
#Smoking status (self.smoking)
#0 = Every day
#1 = Some days
#2 = Former
#3 = Never
#Came to appointment (self.appointment)
#0 = no
#1 = yes

In [59]:
class Patient_Attended:
    """Randomly generated patient record labelled as having attended.

    Each categorical attribute is sampled independently with
    ``random.choices`` and is therefore stored as a one-element list
    (read the code with ``patient.<attribute>[0]``). ``appointment`` is
    the plain integer label 1.
    """

    # Sampling weights per attribute; position i is the weight of category code i.
    # Dict order is the order in which the attributes are drawn and assigned.
    _WEIGHTS = {
        'first': (97.9, 2.1),
        'translator': (84.8, 15.2),
        'race': (.1, 4.2, 30.3, 3.9, 1.1, 60.4),
        'ethnicity': (19.6, 75, 5.4),
        'gender': (38.6, 61.4),
        'marital_s': (3.3, 1.3, 12.8, .4, 80.8, 1.2),
        'cell': (18.2, 81.8),
        'email': (70.6, 29.4),
        'portal': (78.2, 21.8),
        'employment': (13, 5.1, 79.6, 1.5, .5),
        'insurance': (14.8, .6, 66.8, 5.6, 12.2),
        'smoking': (22.8, 2.8, 13, 61.3),
    }

    def __init__(self):
        for attribute, weights in self._WEIGHTS.items():
            codes = list(range(len(weights)))
            setattr(self, attribute, random.choices(codes, weights=weights))
        self.appointment = 1
        
class Patient_Missed:
    """Randomly generated patient record labelled as having missed.

    Each categorical attribute is sampled independently with
    ``random.choices`` and is therefore stored as a one-element list
    (read the code with ``patient.<attribute>[0]``). ``appointment`` is
    the plain integer label 0.
    """

    # Sampling weights per attribute; position i is the weight of category code i.
    # Dict order is the order in which the attributes are drawn and assigned.
    _WEIGHTS = {
        'first': (97.6, 2.4),
        'translator': (92, 8),
        'race': (.1, 2, 37.7, 3.7, .07, 55.7),
        'ethnicity': (11.9, 80.2, 7.9),
        'gender': (35.2, 64.8),
        'marital_s': (3.1, 1.7, 9.5, .3, 83.4, .8),
        'cell': (26.4, 73.6),
        'email': (74.5, 25.5),
        'portal': (83.5, 16.5),
        'employment': (10.8, 5.5, 82.4, .4, .3),
        'insurance': (8.4, .3, 69, 3.6, 18.7),
        'smoking': (35.5, 3.4, 12, 49.1),
    }

    def __init__(self):
        for attribute, weights in self._WEIGHTS.items():
            codes = list(range(len(weights)))
            setattr(self, attribute, random.choices(codes, weights=weights))
        self.appointment = 0
        
        

In [76]:
# Build the synthetic dataset: 100000 attended + 100000 missed patients,
# appended alternately (attended first, then missed) so the classes are balanced.

# Feature columns, in DataFrame column order. Each one is also the name of the
# patient attribute that holds the sampled value.
feature_columns = ['first',
                   'translator',
                   'race',
                   'ethnicity',
                   'gender',
                   'marital_s',
                   'cell',
                   'email',
                   'portal',
                   'employment',
                   'insurance',
                   'smoking']

# Seed the generator so re-running the notebook regenerates the same patients
# (and therefore the same metrics further down).
random.seed(42)

df_dict = {column: [] for column in feature_columns}
df_dict['attended_appointment'] = []

for _ in range(100000):
    # Both patients are created before anything is appended.
    for patient in (Patient_Attended(), Patient_Missed()):
        for column in feature_columns:
            # random.choices returns a one-element list; keep only the code.
            df_dict[column].append(getattr(patient, column)[0])
        df_dict['attended_appointment'].append(patient.appointment)


full_df = pd.DataFrame(df_dict)

In [77]:
#Shuffle data
# A fixed random_state makes the shuffled row order identical on every run.
full_df = full_df.sample(frac = 1, random_state = 42)



In [78]:
# Label: the attended_appointment column.
y_data = full_df["attended_appointment"]
# Features: every column except the label.
x_data = full_df.loc[:, full_df.columns != "attended_appointment"]

In [79]:
# Display the feature frame.
x_data

Unnamed: 0,first,translator,race,ethnicity,gender,marital_s,cell,email,portal,employment,insurance,smoking
160860,0,0,5,1,0,4,1,0,1,2,2,3
118606,0,0,5,1,1,4,0,1,1,2,3,0
89575,0,0,2,1,1,4,0,1,0,2,2,3
130200,0,0,5,1,0,4,1,0,0,2,2,2
135258,0,0,2,1,1,4,1,0,0,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...
35106,0,0,5,1,1,0,0,1,1,3,0,3
162151,0,0,5,1,1,2,1,0,0,2,2,3
132233,0,0,5,1,0,4,0,1,0,2,4,3
138057,0,0,2,0,0,4,1,0,1,2,2,3


In [80]:
# Display the label series.
y_data

160860    1
118606    1
89575     0
130200    1
135258    1
         ..
35106     1
162151    0
132233    0
138057    0
186754    1
Name: attended_appointment, Length: 200000, dtype: int64

In [81]:
# Hold out 33% of the rows for testing; random_state fixes which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.33, random_state=42)

In [82]:
# handle_unknown="ignore": a category that was not present when the encoder was
# fitted is encoded as all zeros at transform time instead of raising an error.
ohe = OneHotEncoder(handle_unknown="ignore")

In [83]:
#One hot encoding because all the variables are categorical
# Learn the category -> column mapping from the training split only, then apply
# that same fitted mapping to the test split. Re-fitting on X_test would derive
# the columns from the test data, which may not line up with the columns the
# classifier is trained on.
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

In [84]:
# Random forest with trees capped at depth 10; random_state fixes the forest's own randomness.
clf = RandomForestClassifier(max_depth=10, random_state=0)
# Train on the training features and labels (labels passed as a NumPy array).
clf.fit(X_train, y_train.to_numpy())

In [85]:
# Only the last expression of a cell is rendered, so the output is X_test's repr;
# the bare X_train line on its own displays nothing.
X_train
X_test

<66000x41 sparse matrix of type '<class 'numpy.float64'>'
	with 792000 stored elements in Compressed Sparse Row format>

In [86]:
# Predict the attendance label for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [89]:
# Heading on the first line, then the confusion matrix of true vs. predicted labels.
print("Confusion Matrix", confusion_matrix(y_test, y_pred), sep="\n")

Confusion Matrix
[[20598 12452]
 [12542 20408]]


In [90]:
# Heading on the first line, then the fraction of test rows predicted correctly.
print("Accuracy", accuracy_score(y_test, y_pred), sep="\n")

Accuracy
0.6213030303030304
