In [49]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import random
import sys
import warnings
warnings.filterwarnings(action="ignore")

In [23]:
heart_data = pd.read_csv('https://raw.githubusercontent.com/scherala/LearningML/master/Data/heart.csv')
# Separate features
nominal_features = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
nominal_feature_index = [1,3,5,9,10]
target_column = ['DEATH_EVENT']
numeric_features = set(heart_data.columns) - set(nominal_features) - set(target_column)
heart_data_raw = heart_data.copy()
heart_data_raw.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


### Adding sys.maxsize in random positions in the place of missing values

In [24]:
heart_data.shape[1]-1

12

In [25]:
index = [(row, col) for row in range(heart_data.shape[0]) for col in range(heart_data.shape[1]-1)]
for row, col in random.sample(index, int(round(.1*len(index)))):
    if col not in nominal_feature_index:
        heart_data.iat[row, col] = sys.maxsize

In [26]:
heart_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,9223372036854775807,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


### Sklearn SimpleImputer

In [27]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=sys.maxsize, strategy='mean')
transformed_values = imputer.fit_transform(heart_data)

count = 0
for row in transformed_values:
    for value in row:
        if value == sys.maxsize:
            count += 1
print("Missing Value Count: " + str(count))

Missing Value Count: 0


In [28]:
transformed_df = pd.DataFrame(transformed_values, columns = heart_data.columns)

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

### Decision tree classification data without missing values

In [40]:
X = heart_data_raw.iloc[:,:-2]
y = heart_data_raw.iloc[:,-1].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [41]:
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7666666666666667

### Decision tree classification using imputed values

In [43]:
X = transformed_df.iloc[:,:-2]
y = transformed_df.iloc[:,-1].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7333333333333333

### KNN based classification without missing values

In [53]:
X = heart_data_raw.iloc[:,:-2]
y = heart_data_raw.iloc[:,-1].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
clf = KNeighborsClassifier(n_neighbors=3)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6333333333333333

### KNN based classification without missing values

In [54]:
X = transformed_df.iloc[:,:-2]
y = transformed_df.iloc[:,-1].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
clf = KNeighborsClassifier(n_neighbors=3)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.5