In [1]:
import pandas as pd
import numpy as np
# Show every column when displaying wide DataFrames (no column truncation)
pd.set_option('display.max_columns', None)

# 1. Read in data set, explore

In [2]:
# Read the processed data set from the working directory
med_csv_path = 'med_data1.csv'
df_med = pd.read_csv(med_csv_path)

In [3]:
# Preview the first three rows of the loaded data
df_med.head(3)

Unnamed: 0,ID,Customer_id,Interaction,UID,zip,Lat,Lng,City,State,Population,County,Area,Timezone,Job,Children,Age,Education,Employment,Income,Marital,fav_color,Credit_card,ReAdmin,Gender,SeniorCitizen,Initial_Admin,HighBlood,Stroke,Complication_Risk,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_Days,MonthlyCharge,TotalCharge,item1,item2,item3,item4,item5,item6,item7,item8
0,1,C412403,cd86e3b5-bccb-4c38-b406-a70859af6424,1b827d78c02cdf967d83280a316fbab7,19610,40.34008,-75.97606,Reading,PA,15439,Berks,Suburban,America/New_York,"Production assistant, radio",1,79,Regular High School Diploma,Student,83147.66,Divorced,Snow,Discover,No,Female,No,Emergency Admission,Yes,No,Medium,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Blood Work,10.58577,95.018175,1034.377162,0,1,0,4,1,2,2,4
1,2,Z919181,7058d104-99d4-41ae-ad17-def063f2024c,e3d31d932fe4468908664f2b054fa31c,66080,38.40441,-95.25956,Richmond,KS,922,Franklin,Urban,America/Chicago,Retail buyer,2,31,"Some College, 1 or More Years, No Degree",Full Time,54612.56,Divorced,PaleGoldenRod,Mastercard,Yes,Male,No,Emergency Admission,Yes,No,High,Yes,No,No,No,No,No,No,Yes,No,Intravenous,15.129562,97.442069,1497.764653,3,4,3,3,2,4,1,3
2,3,F995323,4c1a465a-2a2b-4f38-bbe6-036fac234eaa,2644b097b08b5548313266f100a6e22c,45653,38.89156,-82.8231,Minford,OH,3782,Scioto,Urban,America/New_York,Forensic scientist,7,49,"Some College, 1 or More Years, No Degree",Student,7726.42,Married,WhiteSmoke,VISA 16 digit,Yes,Male,No,Elective Admission,Yes,No,Medium,Yes,No,Yes,No,No,No,No,No,No,Blood Work,4.772177,82.496547,395.125825,3,1,2,3,2,2,3,4


In [6]:
# Optional exploration, left disabled: summary statistics and column dtypes
#df_med.describe()
#df_med.dtypes

In [4]:
# Work on an independent (deep) copy so later edits to X leave df_med untouched
X = df_med.copy(deep=True)
print(X.shape)

(10000, 50)


# 2. Feature Engineering

## Variable transformation 

In [5]:
# Treat zip as a categorical (string) feature rather than a number.
# NOTE(review): if the CSV parser read zip as an integer, any leading zeros were
# already dropped before this cast - confirm whether that matters here.
zip_as_text = X['zip'].astype('str')
X['zip'] = zip_as_text

## Variable selection

### Continuous variables

In [6]:
# Continuous columns (to be normalized for KNN); MonthlyCharge is left out
cols_cont = [
    'Population', 'Children', 'Lat', 'Lng',
    'Income', 'Age', 'Initial_Days', 'TotalCharge',
]
X_cont = X.loc[:, cols_cont]
X_cont.head(3)

Unnamed: 0,Population,Children,Lat,Lng,Income,Age,Initial_Days,TotalCharge
0,15439,1,40.34008,-75.97606,83147.66,79,10.58577,1034.377162
1,922,2,38.40441,-95.25956,54612.56,31,15.129562,1497.764653
2,3782,7,38.89156,-82.8231,7726.42,49,4.772177,395.125825


### Categorical variables

In [7]:
# Categorical columns - full list
#
# Collect every object-dtype column, then leave out the three customer-ID
# related columns. `np.object` was deprecated in NumPy 1.20 and removed in 1.24,
# so the dtype selection goes through pandas' select_dtypes instead.
# The ID columns are excluded by name rather than by position ([3:]) so the
# result does not depend on the column order of the input file.
id_cols = ['Customer_id', 'Interaction', 'UID']
cols_cat_all = [
    col for col in X.select_dtypes(include=['object']).columns
    if col not in id_cols
]


In [8]:
# Keep only categorical columns with a limited number of unique values;
# otherwise one-hot encoding would create too many dummy columns.
max_unique = 20  # exclusive cutoff on unique values; change as desired
t = X[cols_cat_all].nunique().sort_values(ascending=False)
t = t[t < max_unique]
cols_cat_all_red = t.index.tolist()
X[cols_cat_all_red].nunique().to_frame('tot unique')  # unique counts for the reduced list


Unnamed: 0,tot unique
Education,12
Credit_card,10
Employment,5
Marital,5
Services,4
Area,3
Complication_Risk,3
Initial_Admin,3
HighBlood,2
Hyperlipidemia,2


In [9]:
# 1. One-hot encode the reduced categorical column list.
# The first level of each column is dropped because it is redundant given the others.
# NOTE(review): the reduced list includes ReAdmin; if ReAdmin is the prediction
# target it must not remain among the features - confirm before modeling.
cat_subset = X[cols_cat_all_red]
X_cat = pd.get_dummies(cat_subset, drop_first=True)
print('number of cat cols chosen: ', cat_subset.shape[1])
print('total cat cols after one hot encoding: ', X_cat.shape[1])
X_cat.head(3)

number of cat cols chosen:  22
total cat cols after one hot encoding:  51


Unnamed: 0,Education_Associate's Degree,Education_Bachelor's Degree,Education_Doctorate Degree,Education_GED or Alternative Credential,Education_Master's Degree,Education_No Schooling Completed,Education_Nursery School to 8th Grade,Education_Professional School Degree,Education_Regular High School Diploma,"Education_Some College, 1 or More Years, No Degree","Education_Some College, Less than 1 Year",Credit_card_Diners Club / Carte Blanche,Credit_card_Discover,Credit_card_JCB 15 digit,Credit_card_JCB 16 digit,Credit_card_Maestro,Credit_card_Mastercard,Credit_card_VISA 13 digit,Credit_card_VISA 16 digit,Credit_card_Voyager,Employment_Part Time,Employment_Retired,Employment_Student,Employment_Unemployed,Marital_Married,Marital_Never Married,Marital_Separated,Marital_Widowed,Services_CT Scan,Services_Intravenous,Services_MRI,Area_Suburban,Area_Urban,Complication_Risk_Low,Complication_Risk_Medium,Initial_Admin_Emergency Admission,Initial_Admin_Observation Admission,HighBlood_Yes,Hyperlipidemia_Yes,Reflux_esophagitis_Yes,Allergic_rhinitis_Yes,Anxiety_Yes,BackPain_Yes,Arthritis_Yes,Diabetes_Yes,Overweight_Yes,Stroke_Yes,ReAdmin_Yes,Gender_Male,Asthma_Yes,SeniorCitizen_Yes
0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,1,1,1,1,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,1,0,0


In [10]:
# 2. For Naive Bayes, choose all cat columns and label encode

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# The encoder is refit on each column in turn, so every column gets its own
# integer codes and the number of columns stays the same.
X_cat_nb = X[cols_cat_all].apply(le.fit_transform)
print('number of cat cols in NB: ', X[cols_cat_all].shape[1])
# Message fixed: this cell label-encodes, it does not one-hot encode
print('total cat cols in NB after label encoding: ', X_cat_nb.shape[1])
X_cat_nb.head(3)

number of cat cols in NB:  29
total cat cols in NB after one hot encoding:  29


Unnamed: 0,zip,City,State,County,Area,Timezone,Job,Education,Employment,Marital,fav_color,Credit_card,ReAdmin,Gender,SeniorCitizen,Initial_Admin,HighBlood,Stroke,Complication_Risk,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services
0,988,4505,38,122,1,15,459,9,3,0,126,2,0,0,0,1,1,0,2,0,1,1,0,1,1,1,0,1,0
1,5768,4564,16,524,2,2,514,10,0,0,102,6,1,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,2
2,3471,3490,35,1313,2,15,259,10,3,1,137,8,1,1,0,0,1,0,2,1,0,1,0,0,0,0,0,0,0


In [32]:
# 2. Basket set: the yes/no condition columns used for market basket analysis

cols_MB = ['HighBlood', 'Stroke', 'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma']

# Label encode each column (to transform yes/no to 0/1); `le` is the
# LabelEncoder instance created in the Naive Bayes encoding cell.
X_MB = X[cols_MB].apply(le.fit_transform)
print('number of cat cols in MB: ', X[cols_MB].shape[1])
# Message fixed: this cell label-encodes, it does not one-hot encode
print('total cat cols in MB after label encoding: ', X_MB.shape[1])
X_MB.head(3)

number of cat cols in MB:  11
total cat cols in MB after one hot encoding:  11


Unnamed: 0,HighBlood,Stroke,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma
0,1,0,0,1,1,0,1,1,1,0,1
1,1,0,1,0,0,0,0,0,0,1,0
2,1,0,1,0,1,0,0,0,0,0,0


In [11]:
# Combine continuous and categorical features column-wise:
#   X_all    -> continuous + one-hot encoded categoricals
#   X_all_nb -> continuous + label-encoded categoricals (Naive Bayes)
X_all = pd.concat((X_cont, X_cat), axis='columns')
X_all_nb = pd.concat((X_cont, X_cat_nb), axis='columns')

print('total cols for final df: ', X_all.shape[1])
print('total cols for final df, Naive Bayes: ', X_all_nb.shape[1])
X_all_nb.head()

total cols for final df:  59
total cols for final df, Naive Bayes:  37


Unnamed: 0,Population,Children,Lat,Lng,Income,Age,Initial_Days,TotalCharge,zip,City,State,County,Area,Timezone,Job,Education,Employment,Marital,fav_color,Credit_card,ReAdmin,Gender,SeniorCitizen,Initial_Admin,HighBlood,Stroke,Complication_Risk,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services
0,15439,1,40.34008,-75.97606,83147.66,79,10.58577,1034.377162,988,4505,38,122,1,15,459,9,3,0,126,2,0,0,0,1,1,0,2,0,1,1,0,1,1,1,0,1,0
1,922,2,38.40441,-95.25956,54612.56,31,15.129562,1497.764653,5768,4564,16,524,2,2,514,10,0,0,102,6,1,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,2
2,3782,7,38.89156,-82.8231,7726.42,49,4.772177,395.125825,3471,3490,35,1313,2,15,259,10,3,1,137,8,1,1,0,0,1,0,2,1,0,1,0,0,0,0,0,0,0
3,1115,0,39.88561,-90.39943,57779.93,31,1.714879,114.424164,5373,164,14,239,0,2,506,10,3,2,55,9,0,0,0,0,0,1,2,0,1,0,0,0,0,0,1,1,0
4,17445,0,40.6371,-103.23146,70179.69,50,1.254807,95.720457,7302,5211,5,854,1,3,550,7,0,1,9,2,1,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1


# Unsupervised Modeling Techniques

# 1. Clustering (k-means)

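This section covers k-means clustering. **Note:** the cells below currently fit a supervised K-Nearest Neighbors classifier (`KNeighborsClassifier`), not k-means.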

In [20]:
# Load the K-Nearest Neighbors class
from sklearn.neighbors import KNeighborsClassifier

# Load the metrics libraries (used for all methods)
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix, classification_report

In [21]:
# Standardize the continuous columns (cols_cont) for KNN.
# NOTE(review): X_train / X_test are not created in any cell visible above -
# confirm the train/test split cell runs before this one.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # alternative: MinMaxScaler()

# Work on copies so the unscaled train/test frames stay available
X_train_std, X_test_std = X_train.copy(), X_test.copy()

# Learn the scaling on the training rows only, then apply it to both sets;
# only the continuous columns are transformed
scaler.fit(X_train_std[cols_cont])
X_train_std[cols_cont] = scaler.transform(X_train_std[cols_cont])
X_test_std[cols_cont] = scaler.transform(X_test_std[cols_cont])

X_train_std.head(3)

Unnamed: 0,Population,Children,Lat,Lng,Income,Age,Initial_Days,TotalCharge,Education_Associate's Degree,Education_Bachelor's Degree,Education_Doctorate Degree,Education_GED or Alternative Credential,Education_Master's Degree,Education_No Schooling Completed,Education_Nursery School to 8th Grade,Education_Professional School Degree,Education_Regular High School Diploma,"Education_Some College, 1 or More Years, No Degree","Education_Some College, Less than 1 Year",Credit_card_Diners Club / Carte Blanche,Credit_card_Discover,Credit_card_JCB 15 digit,Credit_card_JCB 16 digit,Credit_card_Maestro,Credit_card_Mastercard,Credit_card_VISA 13 digit,Credit_card_VISA 16 digit,Credit_card_Voyager,Employment_Part Time,Employment_Retired,Employment_Student,Employment_Unemployed,Marital_Married,Marital_Never Married,Marital_Separated,Marital_Widowed,Services_CT Scan,Services_Intravenous,Services_MRI,Initial_Admin_Emergency Admission,Initial_Admin_Observation Admission,Complication_Risk_Low,Complication_Risk_Medium,Area_Suburban,Area_Urban,HighBlood_Yes,Asthma_Yes,Gender_Male,Stroke_Yes,Overweight_Yes,Arthritis_Yes,Diabetes_Yes,Hyperlipidemia_Yes,BackPain_Yes,Anxiety_Yes,Allergic_rhinitis_Yes,Reflux_esophagitis_Yes,SeniorCitizen_Yes
2228,-0.072058,1.320128,-0.912823,-0.197392,-1.194997,1.092357,-1.010361,-1.010111,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,1,1
5910,0.822813,0.3959,-0.46153,-1.631181,-0.56145,1.236218,1.123644,1.052497,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,1,0,0,1,1,0,1,0,0
1950,-0.574062,0.3959,0.450261,0.120702,-0.864709,1.188264,-0.911589,-0.920343,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,0,0,0,1,1,0,1


In [22]:
# Fit a K-Nearest Neighbors classifier on the standardized training features
k_neighbors = 6
knn = KNeighborsClassifier(n_neighbors=k_neighbors)
knn.fit(X_train_std, y_train)  # only the features are standardized, not y_train


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

In [23]:
# Print overall test-set accuracy, precision, recall, auc for the KNN model
y_pred = knn.predict(X_test_std)
y_prob = knn.predict_proba(X_test_std)[:, 1]  # probability column for the second class in knn.classes_

# Accuracy, precision and recall are scaled to percentages before printing
acc = accuracy_score(y_test, y_pred)*100
precision = precision_score(y_test, y_pred)*100
recall = recall_score(y_test, y_pred)*100
print('total accuracy = {:.1f}%'.format(acc))
print('total precision = {:.1f}%'.format(precision))
print('total recall = {:.1f}%'.format(recall))

# roc_auc_score returns a value between 0 and 1, not a percentage, so it is
# printed without a '%' suffix (previously shown as e.g. "0.67%")
auc = roc_auc_score(y_test, y_prob)
print('auc = {:.2f}'.format(auc))
#confusion_matrix(y_test, y_pred)


total accuracy = 76.8%
total precision = 82.8%
total recall = 89.5%
auc = 0.67%


# 2. PCA

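This section covers Principal Component Analysis (PCA). **Note:** the cells below currently fit a Gaussian Naive Bayes classifier (`GaussianNB`), not PCA.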

In [24]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier, trained on the Naive Bayes training split
model = GaussianNB()
model.fit(X_train_nb, y_train_nb)

# fit() returns the estimator itself, so nb and model are the same fitted object
nb = model

In [25]:
# Print overall test-set accuracy, precision, recall, auc for the Naive Bayes model
y_pred = nb.predict(X_test_nb)
y_prob = nb.predict_proba(X_test_nb)[:, 1]  # probability column for the second class in nb.classes_

# Accuracy, precision and recall are scaled to percentages before printing
acc = accuracy_score(y_test_nb, y_pred)*100
precision = precision_score(y_test_nb, y_pred)*100
recall = recall_score(y_test_nb, y_pred)*100
print('total accuracy = {:.1f}%'.format(acc))
print('total precision = {:.1f}%'.format(precision))
print('total recall = {:.1f}%'.format(recall))

# roc_auc_score returns a value between 0 and 1, not a percentage, so it is
# printed without a '%' suffix (previously shown as e.g. "0.75%")
auc = roc_auc_score(y_test_nb, y_prob)
print('auc = {:.2f}'.format(auc))
#confusion_matrix(y_test, y_pred)

total accuracy = 74.9%
total precision = 87.5%
total recall = 79.9%
auc = 0.75%


# 3. Market Basket

This section covers Market Basket Analysis

In [15]:
from mlxtend.frequent_patterns import apriori, association_rules


In [31]:
# Preview the first three rows of the market basket input frame
X_MB.head(3)
# NOTE(review): Complication_Risk is not one of the cols_MB columns, so the
# disabled check below would fail if re-enabled - confirm or remove.
#X_MB.Complication_Risk.unique()

Unnamed: 0,ID,HighBlood,Stroke,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma
0,0,1,0,0,1,1,0,1,1,1,0,1
1,1,1,0,1,0,0,0,0,0,0,1,0
2,2,1,0,1,0,1,0,0,0,0,0,0


In [33]:
# Apriori algorithm: find itemsets with support of at least 0.07

# apriori expects a boolean (True/False) frame. X_MB holds 0/1 integers, so it
# is cast explicitly; this yields the same itemsets while avoiding mlxtend's
# warning about non-bool input.
frequent_itemsets = apriori(X_MB.astype(bool), min_support=0.07, use_colnames=True)

In [39]:
# Frequent item sets: preview the first seven rows (support and itemsets columns)

frequent_itemsets.head(7)

Unnamed: 0,support,itemsets
0,0.409,(HighBlood)
1,0.1993,(Stroke)
2,0.7094,(Overweight)
3,0.3574,(Arthritis)
4,0.2738,(Diabetes)
5,0.3372,(Hyperlipidemia)
6,0.4114,(BackPain)


In [42]:
# Association rules kept when their confidence is at least 0.6
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.6,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(HighBlood),(Overweight),0.409,0.7094,0.296,0.723716,1.020181,0.005855,1.051818
1,(Stroke),(Overweight),0.1993,0.7094,0.1412,0.70848,0.998703,-0.000183,0.996843
2,(Arthritis),(Overweight),0.3574,0.7094,0.2544,0.711807,1.003394,0.00086,1.008354
3,(Diabetes),(Overweight),0.2738,0.7094,0.1927,0.703798,0.992104,-0.001534,0.981089
4,(Hyperlipidemia),(Overweight),0.3372,0.7094,0.2379,0.705516,0.994525,-0.00131,0.986811


In [43]:
# Get rules set: keep rules whose lift is at least 1.
# This rebinds `rules`, replacing the confidence-based rule set built earlier.

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(HighBlood),(Stroke),0.4090,0.1993,0.0830,0.202934,1.018234,0.001486,1.004559
1,(Stroke),(HighBlood),0.1993,0.4090,0.0830,0.416458,1.018234,0.001486,1.012780
2,(HighBlood),(Overweight),0.4090,0.7094,0.2960,0.723716,1.020181,0.005855,1.051818
3,(Overweight),(HighBlood),0.7094,0.4090,0.2960,0.417254,1.020181,0.005855,1.014164
4,(HighBlood),(Arthritis),0.4090,0.3574,0.1479,0.361614,1.011790,0.001723,1.006601
...,...,...,...,...,...,...,...,...,...
159,(Allergic_rhinitis),"(Anxiety, Overweight)",0.3941,0.2257,0.0921,0.233697,1.035432,0.003152,1.010436
160,"(Asthma, Reflux_esophagitis)",(Overweight),0.1193,0.7094,0.0847,0.709975,1.000810,0.000069,1.001982
161,"(Overweight, Reflux_esophagitis)",(Asthma),0.2906,0.2893,0.0847,0.291466,1.007487,0.000629,1.003057
162,(Asthma),"(Overweight, Reflux_esophagitis)",0.2893,0.2906,0.0847,0.292776,1.007487,0.000629,1.003076


In [35]:
# Filter the rule set down to rules that clear both thresholds
high_lift = rules['lift'].ge(6)
high_confidence = rules['confidence'].ge(0.8)
rules.loc[high_lift & high_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
