In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression




In [3]:
cn = ['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heartrate','exang','oldpeak','slope','num_mjr_vess','thal','dx']


df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', names=cn)

In [4]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [17]:
df.shape

(303, 15)

In [7]:
df.dtypes

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess        object
thal                object
dx                   int64
dtype: object

In [12]:
df['num_mjr_vess'].value_counts()

0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: num_mjr_vess, dtype: int64

In [13]:
df['thal'].value_counts()

3.0    166
7.0    117
6.0     18
?        2
Name: thal, dtype: int64

In [25]:
df['hrt_dx'] = df['dx'].apply(lambda x: 1 if x >= 1 else 0)

df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,hrt_dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0,0


In [26]:
df['sex'] = df['sex'].astype(object)
df['chest_pain_type'] = df['chest_pain_type'].astype(object)
df['restecg'] = df['restecg'].astype(object)
df['exang'] = df['exang'].astype(object)
df['slope'] = df['slope'].astype(object)

In [27]:
df['sex'].value_counts()
# 1 = men, 0 = women

1.0    206
0.0     97
Name: sex, dtype: int64

In [28]:
df.dtypes

age                float64
sex                 object
chest_pain_type     object
restbps            float64
chol               float64
blood_sugar        float64
restecg             object
max_heartrate      float64
exang               object
oldpeak            float64
slope               object
num_mjr_vess        object
thal                object
dx                   int64
heart_dx             int64
hrt_dx               int64
dtype: object

In [84]:
# Should I reduce down the features based on the smf results...original results?

features = df[['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heartrate','exang','oldpeak','slope','num_mjr_vess','thal']]

X = features

y = df['hrt_dx']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.20,
                                                    random_state=88
                                                   )
X_test.shape

(61, 13)

In [32]:
X.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


### Preprocessing the Data

In [68]:
from sklearn.impute import SimpleImputer

cat_imputed = SimpleImputer(missing_values='?', strategy='most_frequent').fit_transform(df[['sex', 'chest_pain_type', 'restecg', 'exang', 'slope', 'num_mjr_vess','thal']])

enc = OneHotEncoder(sparse=False)

enc.fit_transform(cat_imputed)


enc.categories_

[array([0.0, 1.0], dtype=object),
 array([1.0, 2.0, 3.0, 4.0], dtype=object),
 array([0.0, 1.0, 2.0], dtype=object),
 array([0.0, 1.0], dtype=object),
 array([1.0, 2.0, 3.0], dtype=object),
 array(['0.0', '1.0', '2.0', '3.0'], dtype=object),
 array(['3.0', '6.0', '7.0'], dtype=object)]

In [69]:
SimpleImputer(strategy='mean').fit_transform(df[['age','restbps','chol','blood_sugar','max_heartrate','oldpeak','slope']]

)

array([[ 63. , 145. , 233. , ..., 150. ,   2.3,   3. ],
       [ 67. , 160. , 286. , ..., 108. ,   1.5,   2. ],
       [ 67. , 120. , 229. , ..., 129. ,   2.6,   2. ],
       ...,
       [ 57. , 130. , 131. , ..., 115. ,   1.2,   2. ],
       [ 57. , 130. , 236. , ..., 174. ,   0. ,   2. ],
       [ 38. , 138. , 175. , ..., 173. ,   0. ,   1. ]])

In [70]:
StandardScaler().fit_transform(df[['age','restbps','chol','blood_sugar','max_heartrate','oldpeak','slope']]

)



array([[ 0.94872647,  0.75752504, -0.2649003 , ...,  0.01719733,
         1.08733806,  2.27457861],
       [ 1.39200191,  1.61121989,  0.76041519, ..., -1.82190531,
         0.39718162,  0.64911323],
       [ 1.39200191, -0.6652997 , -0.34228261, ..., -0.90235399,
         1.34614673,  0.64911323],
       ...,
       [ 0.28381332, -0.0961698 , -2.23814899, ..., -1.51538821,
         0.13837295,  0.64911323],
       [ 0.28381332, -0.0961698 , -0.20686358, ...,  1.06811312,
        -0.89686172,  0.64911323],
       [-1.82174501,  0.35913411, -1.38694368, ...,  1.02432497,
        -0.89686172, -0.97635214]])

In [71]:
# Begin making the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

imp_ohe = make_pipeline(SimpleImputer(strategy='most_frequent'),
             OneHotEncoder(sparse=False))

imp_mean = make_pipeline(SimpleImputer(strategy='mean'),
                       StandardScaler())

preprocessor = make_column_transformer((imp_ohe, ['sex', 'chest_pain_type', 'restecg', 'exang', 'slope', 'num_mjr_vess','thal']),
                       (imp_mean, ['age','restbps','chol','blood_sugar','max_heartrate','oldpeak','slope'])
                       )

In [72]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression()  

In [73]:
pipe = make_pipeline(preprocessor, clf)
pipe

In [74]:
pipe.fit(X_train, y_train)

In [75]:
y_pred = pipe.predict(X_test)
# y_pred

In [76]:
from sklearn import set_config
set_config(display='diagram')

pipe

In [77]:
from sklearn import metrics

print('R2 Metric', metrics.r2_score(y_test, y_pred))

# what number should I be calculating to show accuracy? Is RSME the best?

R2 Metric 0.4455085480544302


In [78]:
from sklearn.model_selection import cross_val_score

- (cross_val_score(pipe, X, y, cv=10, scoring='neg_root_mean_squared_error'))

array([0.3108655 , 0.39038551, 0.3446702 , 0.25022923, 0.34474565,
       0.37081541, 0.39467046, 0.31289502, 0.40866498, 0.37044591])

In [79]:
from sklearn.model_selection import GridSearchCV

In [80]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(sparse=False))]),
                                    ['sex', 'chest_pain_type', 'restecg', 'exang',
                                     'slope', 'num_mjr_vess', 'thal']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'restbps', 'chol', 'blood_sugar',
           

In [88]:
param_grid = {'columntransformer__pipeline-2__standardscaler':
              [StandardScaler(), MinMaxScaler(), 'passthrough'],
              'columntransformer__pipeline-2__simpleimputer__strategy':
              ['mean', 'median'],
              'linearregression__fit_intercept': [True, False]
             }

In [82]:
gs = GridSearchCV(pipe, param_grid=param_grid, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)

gs.fit(X_train, y_train)

gs.best_params_

Traceback (most recent call last):
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/metaestimators.py", line 113, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)  # noqa
  File "/Users/ted/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 469, in predict
    Xt = transform.transform(Xt)
  

{'columntransformer__pipeline-2__simpleimputer__strategy': 'mean',
 'columntransformer__pipeline-2__standardscaler': StandardScaler(),
 'linearregression__fit_intercept': True}

## Conclusion of Overall Analysis

Findings:
- The way we had the pipeline originally established is the best way
- The RSME is below 50%, not sure if that's good or bad

---

## Gender Pipeline

In [85]:
#How do i get only men? How do I create age groups this way?

Xm = df[['chest_pain_type','slope','num_mjr_vess','thal', 'restbps', 'oldpeak']]
ym = df['heart_dx']

# Xaa.head()


X_train, X_test, y_train, y_test = train_test_split(Xm, ym, 
                                                    test_size=.2,
                                                    stratify=ym,
                                                    random_state=9
                                                    )
X_test.head()

#used stratify to equate those who have been diagnosed and those who haven't.

Unnamed: 0,chest_pain_type,slope,num_mjr_vess,thal,restbps,oldpeak
52,4.0,1.0,1.0,3.0,112.0,0.0
189,3.0,2.0,3.0,7.0,140.0,2.0
22,2.0,2.0,0.0,3.0,120.0,1.8
110,4.0,2.0,0.0,7.0,145.0,1.0
8,4.0,2.0,1.0,7.0,130.0,1.4
