In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as pre # need OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA # Use to check feature importance?
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
#sklearn.metrics.f1_score
#sklearn.metrics.f1_score

In [2]:
df = pd.read_csv('cleveland.csv').replace('?','0.0').astype(float, errors='ignore')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


In [3]:
# categories: sex (1,0), cp (1,2,3,4), fbs (1,0), exang (1,0), thal (3,6,7), num (0,{1,2,3,4})

In [4]:
# Change num to either 0 or 1.

df['num'] = [0.0 if df['num'][i] == 0.0 else 1.0 for i in range(df.shape[0])]

In [5]:
# Prepare to manually one-hot encode.

sex0list = [1.0 if df['sex'][i]==0.0 else 0.0 for i in range(df.shape[0])]
sex1list = [1.0 if df['sex'][i]==1.0 else 0.0 for i in range(df.shape[0])]

cp1list = [1.0 if df['cp'][i]==1.0 else 0.0 for i in range(df.shape[0])]
cp2list = [1.0 if df['cp'][i]==2.0 else 0.0 for i in range(df.shape[0])]
cp3list = [1.0 if df['cp'][i]==3.0 else 0.0 for i in range(df.shape[0])]
cp4list = [1.0 if df['cp'][i]==4.0 else 0.0 for i in range(df.shape[0])]

fbs0list = [1.0 if df['fbs'][i]==0.0 else 0.0 for i in range(df.shape[0])]
fbs1list = [1.0 if df['fbs'][i]==1.0 else 0.0 for i in range(df.shape[0])]

exang0list = [1.0 if df['exang'][i]==0.0 else 0.0 for i in range(df.shape[0])]
exang1list = [1.0 if df['exang'][i]==1.0 else 0.0 for i in range(df.shape[0])]

thal3list = [3.0 if df['thal'][i]==3.0 else 0.0 for i in range(df.shape[0])]
thal6list = [6.0 if df['thal'][i]==6.0 else 0.0 for i in range(df.shape[0])]
thal7list = [7.0 if df['thal'][i]==7.0 else 0.0 for i in range(df.shape[0])]

In [6]:
# 'graft' the new columns in.

df['sex0'] = sex0list
df['sex1'] = sex1list

df['cp1'] = cp1list
df['cp2'] = cp2list
df['cp3'] = cp3list
df['cp4'] = cp4list

df['fbs0'] = fbs0list
df['fbs1'] = fbs1list

df['exang0'] = exang0list
df['exang1'] = exang1list

df['thal3'] = thal3list
df['thal6'] = thal6list
df['thal7'] = thal7list

In [7]:
# Now drop the original columns. This concludes our manual one-hot encoding.
df = df.drop(columns=['sex', 'cp', 'fbs', 'exang', 'thal'])
Xdf = df.drop(columns=['num'])
Ydf = df['num']
X = Xdf.to_numpy()
y = Ydf.to_numpy()

In [8]:
# Prepare to standardize the data.
myscaler = pre.StandardScaler(copy=False)

In [9]:
# scale the data
myscaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
# What are the more important features?
pca1 = PCA(n_components=X_train.shape[1])

In [11]:
pca1.fit(X_train)

PCA(n_components=21)

In [12]:
pca1.singular_values_

array([3.19912917e+01, 2.29432064e+01, 2.23251783e+01, 1.88295744e+01,
       1.77111167e+01, 1.71706598e+01, 1.62934586e+01, 1.55605655e+01,
       1.45884470e+01, 1.42813839e+01, 1.38279545e+01, 1.30689360e+01,
       1.23990294e+01, 1.15752502e+01, 9.43681023e+00, 8.88149597e+00,
       1.87709399e+00, 5.58870594e-15, 4.18907460e-15, 3.29201989e-15,
       2.49163019e-15])

In [13]:
# Try running the analysis after projecting onto all but the last 5 components.
pca2 = PCA(n_components=16)

In [14]:
pca2.fit(X_train) # fit according to the training data
pca2.transform(X_train) # transform the training data
pca2.transform(X_test) # transform the test data for later use.

print("done transforming.")

done transforming.


In [15]:
# We can't use the KNeighborsClassifier from sklearn in the final analysis, but let's use it for now
# Just so we can see if we can get good results.

from sklearn.neighbors import KNeighborsClassifier

In [16]:
knn = KNeighborsClassifier(n_neighbors=5)

In [17]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [18]:
preds = knn.predict(X_test)

In [19]:
PrecisionScore = precision_score(preds,y_test)
RecallScore = recall_score(preds,y_test)
F1Score = f1_score(preds,y_test)
AccuracyScore = accuracy_score(preds,y_test)

print("F1 score: " + str(F1Score))
print("Recall score: " + str(RecallScore))
print("Precision score: " + str(PrecisionScore))
print("Accuracy: " + str(AccuracyScore))

F1 score: 0.7945205479452055
Recall score: 0.8055555555555556
Precision score: 0.7837837837837838
Accuracy: 0.8026315789473685
