In [None]:
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [2]:
# read in health indicator CSV file:
df = pd.read_csv("../data/heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [121]:
#unhealthy = df[df["HeartDisease"] == "Yes"]
#num_success = unhealthy.shape[0]
#smaller_healthy = df[df["HeartDisease"] == "No"].sample(n=num_success, random_state=1)
#df = unhealthy.append(smaller_healthy, ignore_index=True)

### Continuous Variables:
- BMI, PhysicalHealth, MentalHealth, SleepTime

In [3]:
df.describe() 

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


### Categorical Variables:
- Smoking: 1= At least 100cigarettes have been smoked in entire life, 0= Otherwise
- AlcoholDrinking: 1: More than 14 drinks per week (Male), More than 7 drinks per week (Female), 0: Otherwise
- Stroke: 1= Had a stroke, 0= Otherwise
- DiffWalking: 1= Have difficulty walking, 0: Otherwise
- Sex: 1 = 
- AgeCategory
- Race
- Diabetic
- PhysicalActivity

In [6]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

### Creating Dummy Variables:
- For variables with multiple classes.

In [7]:
# Dummy variables for Age, Race, General health, Diabetic

df_1 = df[["AgeCategory","Race","GenHealth","Diabetic"]]
df_dummy = pd.get_dummies(df_1)

# Removing Age, Race, General health, Diabetic from df

df = df.drop(["AgeCategory","Race","GenHealth","Diabetic"], axis=1)

# Joining df_dummy with df
for colname in df_dummy:
    df[colname] = df_dummy[colname]

In [8]:
df_dummy

Unnamed: 0,AgeCategory_18-24,AgeCategory_25-29,AgeCategory_30-34,AgeCategory_35-39,AgeCategory_40-44,AgeCategory_45-49,AgeCategory_50-54,AgeCategory_55-59,AgeCategory_60-64,AgeCategory_65-69,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
319791,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
319792,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
319793,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [9]:
bin_cols = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
mult_cols = ["Age", "Race", "GenHealth", "Diabetic"]
for colname in bin_cols:
    if("Sex" != colname):
        print(df[colname])
        df[colname] = df[colname].str.replace("No",  "0", regex=False)
        df[colname] = df[colname].str.replace("Yes", "1", regex=False)
    else:
        df[colname] = df[colname].str.replace("Female",  "0", regex=False)
        df[colname] = df[colname].str.replace("Male", "1", regex=False)
    df[colname] = df[colname].astype("int")
print(df.head(3))

0          No
1          No
2          No
3          No
4          No
         ... 
319790    Yes
319791     No
319792     No
319793     No
319794     No
Name: HeartDisease, Length: 319795, dtype: object
0         Yes
1          No
2         Yes
3          No
4          No
         ... 
319790    Yes
319791    Yes
319792     No
319793     No
319794     No
Name: Smoking, Length: 319795, dtype: object
0         No
1         No
2         No
3         No
4         No
          ..
319790    No
319791    No
319792    No
319793    No
319794    No
Name: AlcoholDrinking, Length: 319795, dtype: object
0          No
1         Yes
2          No
3          No
4          No
         ... 
319790     No
319791     No
319792     No
319793     No
319794     No
Name: Stroke, Length: 319795, dtype: object
0          No
1          No
2          No
3          No
4         Yes
         ... 
319790    Yes
319791     No
319792     No
319793     No
319794     No
Name: DiffWalking, Length: 319795, dtype: object


In [10]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,0,1,...,1,0,0,0,0,1,0,0,1,0
1,0,20.34,0,0,1,0.0,0.0,0,0,1,...,1,0,0,0,0,1,1,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,1,...,1,0,1,0,0,0,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
4,0,23.71,0,0,0,28.0,0.0,1,0,1,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,1,0,...,0,0,1,0,0,0,0,0,1,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,1,...,0,0,0,0,0,1,1,0,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [11]:
for colname in df:
    df[colname] = df[colname].astype(float)
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0.0,16.60,1.0,0.0,0.0,3.0,30.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,20.34,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,26.58,1.0,0.0,0.0,20.0,30.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,24.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,23.71,0.0,0.0,0.0,28.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1.0,27.41,1.0,0.0,0.0,7.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
319791,0.0,29.84,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
319792,0.0,24.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
319793,0.0,32.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]
print(np.shape(X))
print(np.shape(y))

test_prop = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_prop, random_state=1) # 70% training and 30% test

(319795, 41)
(319795,)


### kNN Classifer 
- 1: has heart disease, 0: no heart disease

In [227]:
# Training
X_train = np.array(X_train)
y_train = np.array(y_train).reshape(223856,)
neigh = KNeighborsClassifier(n_neighbors=5,p=1)
neigh.fit(X_train, y_train)

X_test = np.array(X_test)
y_test = np.array(y_test).reshape(95939,)

# Predict on Test Set
neigh.score(X_test, y_test, sample_weight=None)

#(k=4,p=1): 68%
#(k=5,p=1): 70.43%
#(k=6,p=1): 69%
#(k=4,p=2): 66%
#(k=5,p=2): 68.9%
#(k=6,p=2): 68.1%

0.9081603935834228

In [229]:
# WEBSITE INTERFACE
# Predict Individual Observation 

kevin_heartdisease = np.array([[27.3,1,0,0,3,4,0,1,1,8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0]])

neigh.predict(kevin_heartdisease) # prediction: no heart disease

neigh.predict_proba(kevin_heartdisease) # 20% probability of having heart disease

array([[1., 0.]])

### Manifold TNSE

In [154]:
# Fit X into an embedded space

X_embedded = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(X_train)
X_embedded.shape

(38322, 2)

### Support Vector Machines

In [None]:
clf = svm.SVC(kernel='linear',C=1).fit(X_train,y_train)
classifier_predictions = clf.predict(X_test)
classifier_predictions
#clf.accuracy_score(y_test,classifier_predictions)*100)

In [176]:
# Prediction Accuracy of linear C-Support Vector Classification:
accurate = 0
for i in range(len(y_test)):
    if classifier_predictions[i] == y_test[i]:
        accurate += 1

accuracy = accurate / len(y_test)
accuracy

0.761629322942036

In [None]:
#################### PICKLE CODE####################

# Save model: 
filehandler = open("LogRegr_pickle.obj", 'wb') 
pickle.dump(logisticRegr, filehandler)

# Load model, under a different name: 
filehandler = open("LogRegr_pickle.obj", 'rb') 
model_ = pickle.load(filehandler)

# Test model on novel data:
model_.predict_proba(kevin_heartdisease.reshape(1,-1))

In [186]:
clf = svm.SVC(kernel='rbf',C=1).fit(X_train,y_train)
classifier_predictions = clf.predict(X_test)
classifier_predictions

array([1., 0., 0., ..., 0., 0., 1.])

In [197]:
# Prediction Accuracy of linear C-Support Vector Classification:
accurate = 0
for i in range(len(y_test)):
    if classifier_predictions[i] == y_test[i]:
        accurate += 1

accuracy = accurate / len(y_test)
accuracy

0.7534096444227959

### Quadratic Discriminant Analysis

In [209]:
clf = QDA()
clf.fit(X_train, y_train)
classifier_predictions = clf.predict(X_test)



In [210]:
# Prediction Accuracy of linear C-Support Vector Classification:
accurate = 0
for i in range(len(y_test)):
    if classifier_predictions[i] == y_test[i]:
        accurate += 1

accuracy = accurate / len(y_test)
accuracy

0.726619581100828