# Import Packages

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# Import Data

In [2]:
# Location of the preprocessed CHD dataset. Set the CHD_DATA_PATH environment
# variable to point at a local copy of the CSV; when it is unset, the path
# originally hard-coded in this notebook is used.
CHD_DATA_PATH = os.environ.get(
    'CHD_DATA_PATH',
    '/Users/silviacatalina/Google Drive/BethelTech/GitHub/FinalProject/Data/CHD_preprocessed.csv',
)
kgheart = pd.read_csv(CHD_DATA_PATH)
# Print the column names, non-null counts and dtypes of the loaded frame.
kgheart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4133 entries, 0 to 4132
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4133 non-null   int64  
 1   age              4133 non-null   int64  
 2   education        4133 non-null   int64  
 3   currentSmoker    4133 non-null   int64  
 4   cigsPerDay       4133 non-null   float64
 5   BPMeds           4133 non-null   float64
 6   prevalentStroke  4133 non-null   int64  
 7   prevalentHyp     4133 non-null   int64  
 8   diabetes         4133 non-null   int64  
 9   totChol          4133 non-null   float64
 10  sysBP            4133 non-null   float64
 11  diaBP            4133 non-null   float64
 12  BMI              4133 non-null   float64
 13  heartRate        4133 non-null   float64
 14  glucose          4133 non-null   float64
 15  TenYearCHD       4133 non-null   int64  
dtypes: float64(8), int64(8)
memory usage: 516.8 KB


# Data Exploration

In [None]:
# Scatter-matrix overview: every column of the frame plotted against every other.
sns.pairplot(data=kgheart)

In [None]:
# Annotated heatmap of the pairwise column correlations (pandas' default method).
correlations = kgheart.corr()
plt.figure(figsize=(16, 14))
sns.heatmap(correlations, annot=True, linewidths=.1, cmap="YlGnBu")

# Bar Charts (Categorical Variables)

In [None]:
# Value-count bar charts for the six categorical columns on a 2x3 grid
# that shares its y-axis across panels.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10), sharey=True)
# (column, panel title) pairs, listed in row-major panel order.
barPanels = [
    ('male', 'Gender'),
    ('education', 'Education'),
    ('currentSmoker', 'Smoker'),
    ('prevalentStroke', 'Prevalent Stroke'),
    ('prevalentHyp', 'Prevalent Hypertension'),
    ('diabetes', 'Prevalent Diabetes'),
]
for (column, title), ax in zip(barPanels, axes.ravel()):
    kgheart[column].value_counts().plot.bar(title=title, ax=ax)
    # Keep the tick labels horizontal.
    ax.tick_params(labelrotation=0)

# Histograms (Continuous Variables)

## Systolic and Diastolic Blood Pressure (BP)
* Red vertical lines represent the thresholds for a diagnosis of __stage 1 hypertension__

In [None]:
# Overlay the systolic and diastolic blood-pressure distributions on one figure.
plt.figure(figsize=(15, 7))
# Reference lines at 90 and 130.
# NOTE(review): the accompanying markdown describes these as stage 1 hypertension
# thresholds - confirm the values against the guideline being used.
for bpThreshold in (90, 130):
    plt.axvline(bpThreshold, color='red')
# Label each series so the two overlaid histograms can be told apart in the legend.
sns.histplot(kgheart.sysBP, kde=True, label='sysBP')
sns.histplot(kgheart.diaBP, kde=True, color='orange', label='diaBP')
# Both series share the x-axis, so replace the axis name inherited from a single column.
plt.xlabel('Blood pressure')
plt.legend()

## Risk of Heart Disease across systolic BP

In [None]:
plt.figure(figsize = (13, 5))
sns.histplot(data = kgheart, x = 'sysBP', hue='TenYearCHD', multiple= 'stack')

__*People with risk for heart disease are evenly spread across the systolic BP readings*__

## Systolic BP separated by gender and contrast with smoking status

In [None]:
# One histogram panel per value of `male`, coloured by current smoking status.
sns.displot(data=kgheart, x='sysBP', col='male', hue='currentSmoker', kind='hist')

__*More males are current smokers than females*__

In [None]:
# One histogram (with KDE overlay) per continuous variable on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(15, 15), sharey=False)
# Column -> extra histplot keyword arguments. Only age and heartRate pin the
# number of bins; the other columns use seaborn's default binning.
histPanels = {
    'age': {'bins': 20},
    'cigsPerDay': {},
    'glucose': {},
    'totChol': {},
    'sysBP': {},
    'diaBP': {},
    'BMI': {},
    'heartRate': {'bins': 20},
}
panelAxes = axes.ravel()
for ax, (column, extra) in zip(panelAxes, histPanels.items()):
    sns.histplot(kgheart[column], kde=True, ax=ax, **extra)
# Eight variables on nine axes: hide the spare panel instead of drawing the
# glucose histogram a second time.
for ax in panelAxes[len(histPanels):]:
    ax.set_visible(False)

# K-means Clustering

## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.cluster import KMeans

## Data Wrangling
* KMeans does not accept string data, but this dataset is fully numeric
* Subsetting to continuous variables and the predicted variable

In [None]:
# Independent copy of the frame restricted to the columns used for clustering.
clusteringColumns = [
    'age',
    'cigsPerDay',
    'totChol',
    'sysBP',
    'diaBP',
    'BMI',
    'heartRate',
    'glucose',
    'TenYearCHD',
]
kgheartTrimmed = kgheart.loc[:, clusteringColumns].copy()

In [None]:
# Preview the first five rows of the trimmed frame.
kgheartTrimmed.head(n=5)

## Perform k-Means Clustering

### Testing 2 clusters

In [None]:
# Fit k-means with 2 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# resulting cluster labels, repeatable from run to run.
kmeans = KMeans(n_clusters=2, random_state=42)
# Leave out any 'Group' column written by a labelling cell so that earlier
# cluster assignments are never used as a feature; errors='ignore' keeps the
# call valid when the column does not exist yet.
kmeans.fit(kgheartTrimmed.drop(columns='Group', errors='ignore'))

In [None]:
# Store each row's cluster label from the most recent fit in the 'Group' column,
# then preview the result.
kgheartTrimmed = kgheartTrimmed.assign(Group=kmeans.labels_)
kgheartTrimmed.head(n=5)

### Investigate Means by Category

In [None]:
# Mean of every other column within each cluster.
kgheartTrimmed.groupby(by='Group').mean()

### Testing 3 Clusters

In [None]:
# Fit k-means with 3 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# resulting cluster labels, repeatable from run to run.
kmeans = KMeans(n_clusters=3, random_state=42)
# kgheartTrimmed may already carry a 'Group' column holding labels from an
# earlier clustering; drop it so those labels are not used as a feature.
# errors='ignore' keeps the call valid when the column is absent.
kmeans.fit(kgheartTrimmed.drop(columns='Group', errors='ignore'))

In [None]:
# Overwrite the 'Group' column with the labels from the most recent fit,
# then preview the result.
kgheartTrimmed = kgheartTrimmed.assign(Group=kmeans.labels_)
kgheartTrimmed.head(n=5)

### Investigate Means by Category

In [None]:
# Mean of every other column within each cluster.
kgheartTrimmed.groupby(by='Group').mean()

__*The 3 clusters do not seem to differ much from each other*__

### Testing 4 Clusters

In [None]:
# Fit k-means with 4 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# resulting cluster labels, repeatable from run to run.
kmeans = KMeans(n_clusters=4, random_state=42)
# kgheartTrimmed may already carry a 'Group' column holding labels from an
# earlier clustering; drop it so those labels are not used as a feature.
# errors='ignore' keeps the call valid when the column is absent.
kmeans.fit(kgheartTrimmed.drop(columns='Group', errors='ignore'))

In [None]:
# Overwrite the 'Group' column with the labels from the most recent fit,
# then preview the result.
kgheartTrimmed = kgheartTrimmed.assign(Group=kmeans.labels_)
kgheartTrimmed.head(n=5)

### Investigate Means by Category

In [None]:
# Mean of every other column within each cluster.
kgheartTrimmed.groupby(by='Group').mean()

* __*The group with the highest percentage of TenYearCHD risk (54%) has the highest average blood sugar, indicative of unmanaged diabetes, the highest average BMI, and the highest average systolic blood pressure.*__
* __*The group with 2nd highest percentage of TenYearCHD risk (19%) has a high systolic blood pressure and the highest average total cholesterol.*__
* __*The group with the 3rd highest percentage (16%) of TenYearCHD risk has an average total cholesterol similar to the group with the highest risk, but a lower average systolic blood pressure*__

### Testing 5 Clusters

In [None]:
# Fit k-means with 5 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# resulting cluster labels, repeatable from run to run.
kmeans = KMeans(n_clusters=5, random_state=42)
# kgheartTrimmed may already carry a 'Group' column holding labels from an
# earlier clustering; drop it so those labels are not used as a feature.
# errors='ignore' keeps the call valid when the column is absent.
kmeans.fit(kgheartTrimmed.drop(columns='Group', errors='ignore'))

In [None]:
# Overwrite the 'Group' column with the labels from the most recent fit,
# then preview the result.
kgheartTrimmed = kgheartTrimmed.assign(Group=kmeans.labels_)
kgheartTrimmed.head(n=5)

### Investigate Means by Category

In [None]:
# Mean of every other column within each cluster.
kgheartTrimmed.groupby(by='Group').mean()

__*Adding a 5th cluster did not create a new group with a significant difference*__