In [601]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [602]:
heart_df = pd.read_csv("heart.csv")
o2_sat_df = pd.read_csv("o2saturation.csv")

# not going to use the o2 sat df at this time as the data isn't given an id so I can't associate it with patients
# there isn't even an equal number of columns so I can't assume it goes with the same order as in heart_df

# No description of slp, caa or thall. Going to assume they will also need one-hot encoding

In [603]:
# Raw output data is separated by all 1's and all 0's. 
# Going to shuffle this to prevent highly correlated training data

heart_df=heart_df.sample(frac=1)

In [604]:
heart_df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
205,52,1,0,128,255,0,1,161,1,0.0,2,1,3,0
235,51,1,0,140,299,0,1,173,1,1.6,2,0,3,0
158,58,1,1,125,220,0,1,144,0,0.4,1,4,3,1
43,53,0,0,130,264,0,0,143,0,0.4,1,0,2,1
84,42,0,0,102,265,0,0,122,0,0.6,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2,0
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2,1
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
45,52,1,1,120,325,0,1,172,0,0.2,2,0,2,1


# Clean up data
This shouldn't need much cleaning at all - according to the accompanied information at https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset it seems that most features are numeric where appropriate and categorical where appropriate. I will need to give cp and restecg columns one-hot encoding as they are categorical with more than 2 categories. L-1 columns are needed for L categories for logit or there is over-prediction, so will need to drop_first.

I'm also going to normalize the data by dividing it by its max value. There are no negative values here so no need to use minmax scaling.

In [605]:
# restecg should probably be one-hot encoded as it appears to be categorical with values 0, 1, 2
# It could be the case that the labeling is intentional however and actually in fact numeric..

In [606]:
outputs = heart_df['output']
heart_df = heart_df.drop(columns=['output'])

In [607]:
new_restecg = pd.get_dummies(heart_df['restecg'], drop_first = True, prefix = 'restecg')
new_cp = pd.get_dummies(heart_df['cp'], drop_first = True, prefix = 'cp')
new_caa = pd.get_dummies(heart_df['caa'], drop_first = True, prefix = 'caa')
new_slp = pd.get_dummies(heart_df['slp'], drop_first = True, prefix = 'slp')
new_thall = pd.get_dummies(heart_df['thall'], drop_first = True, prefix = 'thall')

In [608]:
heart_df = heart_df.drop(columns=['cp','restecg','caa','slp','thall'])

In [609]:
heart_df=heart_df.join(new_restecg)
heart_df=heart_df.join(new_cp)
heart_df=heart_df.join(new_caa)
heart_df=heart_df.join(new_slp)
heart_df=heart_df.join(new_thall)

In [610]:
heart_df

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,restecg_1,restecg_2,...,cp_3,caa_1,caa_2,caa_3,caa_4,slp_1,slp_2,thall_1,thall_2,thall_3
205,52,1,128,255,0,161,1,0.0,1,0,...,0,1,0,0,0,0,1,0,0,1
235,51,1,140,299,0,173,1,1.6,1,0,...,0,0,0,0,0,0,1,0,0,1
158,58,1,125,220,0,144,0,0.4,1,0,...,0,0,0,0,1,1,0,0,0,1
43,53,0,130,264,0,143,0,0.4,0,0,...,0,0,0,0,0,1,0,0,1,0
84,42,0,102,265,0,122,0,0.6,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,46,1,150,231,0,147,0,3.6,1,0,...,0,0,0,0,0,1,0,0,1,0
131,49,0,134,271,0,162,0,0.0,1,0,...,0,0,0,0,0,1,0,0,1,0
271,61,1,134,234,0,145,0,2.6,1,0,...,1,0,1,0,0,1,0,0,1,0
45,52,1,120,325,0,172,0,0.2,1,0,...,0,0,0,0,0,0,1,0,1,0


In [611]:
def max_scaler(df):
    for column in df.columns:
        max_val = df[column].max()
        df[column] = df[column] / max_val
    return df

In [612]:
heart_df = max_scaler(heart_df)

In [613]:
heart_df

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,restecg_1,restecg_2,...,cp_3,caa_1,caa_2,caa_3,caa_4,slp_1,slp_2,thall_1,thall_2,thall_3
205,0.675325,1.0,0.640,0.452128,0.0,0.797030,1.0,0.000000,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
235,0.662338,1.0,0.700,0.530142,0.0,0.856436,1.0,0.258065,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
158,0.753247,1.0,0.625,0.390071,0.0,0.712871,0.0,0.064516,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
43,0.688312,0.0,0.650,0.468085,0.0,0.707921,0.0,0.064516,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
84,0.545455,0.0,0.510,0.469858,0.0,0.603960,0.0,0.096774,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.597403,1.0,0.750,0.409574,0.0,0.727723,0.0,0.580645,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
131,0.636364,0.0,0.670,0.480496,0.0,0.801980,0.0,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
271,0.792208,1.0,0.670,0.414894,0.0,0.717822,0.0,0.419355,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
45,0.675325,1.0,0.600,0.576241,0.0,0.851485,0.0,0.032258,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# Assembling the test and training data and fitting the model

In [614]:
patient_info = []
indices = heart_df.index.values
train_part = round(0.8*len(indices))
for vals in heart_df.index.values:
     patient = np.asarray(heart_df.loc[heart_df.index == vals].values[0])
     patient_info.append(patient)
    
    

In [615]:
train_x = patient_info[:train_part]
test_x = patient_info[train_part:]
train_y = outputs[:train_part]
len(train_x) == len(train_y)
test_y = outputs[train_part:]

In [616]:
logit = LogisticRegression(random_state = 0, max_iter = 1000)

In [617]:
logit.fit(train_x, train_y)

LogisticRegression(max_iter=1000, random_state=0)

In [618]:
predictions = logit.predict(test_x)

In [619]:
accuracy_score(test_y, predictions)

0.8524590163934426

In [620]:
predictions

array([0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [621]:
test_y.values

array([0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1])