# UCI Heart Disease K-Means Project

## Introduction - Dori

### Imports and Reading CSV

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


## Data Cleaning - Shania/Tiffany
Removing categorical variables and id.

Removing entries with missing data.

Splitting data into 80-20 Train and Test

In [3]:
df = df.drop(columns='id', axis=2)

In [9]:
print(df.isna().sum())

# dropping rows with null values
df = df.dropna()
df.info()

age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 299 entries, 0 to 748
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       299 non-null    int64  
 1   sex       299 non-null    object 
 2   dataset   299 non-null    object 
 3   cp        299 non-null    object 
 4   trestbps  299 non-null    float64
 5   chol      299 non-null    float64
 6   fbs       299 non-null    object 
 7   restecg   299 non-null    object 
 8   thalch    299 non-null    float64
 9   exang     299 non-null    object 
 10  oldpeak   299 non-null    float64
 11  slope     299 non-null    object 
 12  ca        299 non-null    float64
 13  thal      299 non-null    object 
 14  num       299 non-null    int64  
dtypes: f

In [28]:
# looking at number of unique values for categorical data
print(df.select_dtypes('object').nunique())

X = df.drop(['num'], axis=1)
y = df['num']

# Get categorical columns
categorical = list(X.select_dtypes(['object']).columns)
# Get numerical colummns
numerical = list(X.select_dtypes(['int64', 'float64']).columns)

# Scale numerical values
num_transform = Pipeline(steps=[('scaler', StandardScaler())])
# Encode categorical values
cat_transform = Pipeline(steps=[('categories', OneHotEncoder(sparse=False))])
# Transform the dataset into the scaled and encoded version
preprocessor = ColumnTransformer(transformers=[('cont', num_transform, numerical),
                                               ('cat', cat_transform, categorical)])
X = pd.DataFrame(preprocessor.fit_transform(X))
X

sex        2
dataset    3
cp         4
fbs        2
restecg    3
exang      2
slope      3
thal       3
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.940446,0.749760,-0.262867,0.029124,1.069475,-0.718306,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.384143,1.596354,0.747722,-1.790447,0.380309,2.487269,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.384143,-0.661231,-0.339138,-0.880662,1.327912,1.418744,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.943588,-0.096835,0.061285,1.632079,2.103224,-0.718306,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.499891,-0.096835,-0.815830,0.982232,0.294163,-0.718306,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1.495068,0.693320,-1.025575,-0.360784,2.017078,1.418744,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
295,0.274900,-0.096835,-2.207774,-1.487186,0.121872,0.350219,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
296,0.274900,-0.096835,-0.205664,1.068878,-0.911877,0.350219,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
297,-0.834344,1.031958,-0.396341,-2.223678,0.380309,-0.718306,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [30]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

## K - Means - Cory
Running K-means with K=5 (num column [0=no heart disease; 1,2,3,4 = stages of heart disease ])

## PCA - Shahab

## K-Means after PCA - Cory

## Comparative Data Analysis - Zarif

## Conclusion - Dori