In [322]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [323]:
heart_df = pd.read_csv("heart.csv")
o2_sat_df = pd.read_csv("o2saturation.csv")

# not going to use the o2 sat df at this time as the data isn't given an id so I can't associate it with patients
# there isn't even an equal number of columns so I can't assume it goes with the same order as in heart_df

In [324]:
heart_df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# Clean up data
This shouldn't need much cleaning at all - according to the accompanied information at https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset it seems that most features are numeric where appropriate and categorical where appropriate. I will need to give cp and restecg columns one-hot encoding as they are categorical with more than 2 categories. L-1 columns are needed for L categories for logit or there is over-prediction, so will need to drop_first.

In [325]:
# restecg should probably be one-hot encoded as it appears to be categorical with values 0, 1, 2
# It could be the case that the labeling is intentional however and actually in fact numeric..

In [326]:
outputs = heart_df['output']
heart_df = heart_df.drop(columns=['output'])

In [327]:
new_restecg = pd.get_dummies(heart_df['restecg'], drop_first = True, prefix = 'restecg')
new_cp = pd.get_dummies(heart_df['cp'], drop_first = True, prefix = 'cp')

In [328]:
heart_df = heart_df.drop(columns=['cp','restecg'])

In [329]:
heart_df=heart_df.join(new_restecg)
heart_df=heart_df.join(new_cp)

In [330]:
heart_df

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,slp,caa,thall,restecg_1,restecg_2,cp_1,cp_2,cp_3
0,63,1,145,233,1,150,0,2.3,0,0,1,0,0,0,0,1
1,37,1,130,250,0,187,0,3.5,0,0,2,1,0,0,1,0
2,41,0,130,204,0,172,0,1.4,2,0,2,0,0,1,0,0
3,56,1,120,236,0,178,0,0.8,2,0,2,1,0,1,0,0
4,57,0,120,354,0,163,1,0.6,2,0,2,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,1,0,3,1,0,0,0,0
299,45,1,110,264,0,132,0,1.2,1,0,3,1,0,0,0,1
300,68,1,144,193,1,141,0,3.4,1,2,3,1,0,0,0,0
301,57,1,130,131,0,115,1,1.2,1,1,3,1,0,0,0,0


# Pre-process data

In [331]:
patient_numerical = []
patient_categorical = []
indices = heart_df.index.values
train_part = round(0.8*len(indices))
for i in range(len(indices)):
    patient_num = np.asarray(heart_df.loc[heart_df.index == i].values[0][:-5])
    patient_cat = np.asarray(heart_df.loc[heart_df.index == i].values[0][-5:])
    patient_numerical.append(patient_num)
    patient_categorical.append(patient_cat)
    
    

In [332]:
X_num = patient_numerical

len(X_num)

303

In [333]:
# Trying first standard scaling

scaler = preprocessing.StandardScaler().fit(X_num)
X_num = scaler.transform(X_num)

test_x = X_num[train_part:]

# Can just try scaling to a range

In [334]:
# Reinserting categorical data back

train_x = []
test_x =[]
for i in range(train_part):
    current_list = list(X_num[i])
    for j in range(len(patient_categorical[i])):
        current_list.append(patient_categorical[i][j])
    train_x.append(current_list)

for i in np.arange(train_part, len(X_num)):
    current_list = list(X_num[i])
    for j in range(len(patient_categorical[i])):
        current_list.append(patient_categorical[i][j])
    test_x.append(current_list)


In [335]:
logit = LogisticRegression(random_state = 0, max_iter = 1000)

In [336]:
logit.fit(train_x, train_y)

LogisticRegression(max_iter=1000, random_state=0)

In [337]:
predictions = logit.predict(test_x)

In [338]:
accuracy_score(test_y, predictions)

0.5737704918032787

In [339]:
predictions

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1])

In [340]:
outputs.values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,