In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
columnNames = ['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heart_rate','exang','oldpeak','slope','num_mjr_vess','thal','dx_num']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',names=columnNames)

In [3]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heart_rate,exang,oldpeak,slope,num_mjr_vess,thal,dx_num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
df.shape

(303, 14)

In [5]:
df.isna().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heart_rate     0
exang              0
oldpeak            0
slope              0
num_mjr_vess       0
thal               0
dx_num             0
dtype: int64

In [6]:
df["dx_num"].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [7]:
df

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heart_rate,exang,oldpeak,slope,num_mjr_vess,thal,dx_num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [8]:
df = df.replace('?', np.nan)

In [9]:
df.isna().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heart_rate     0
exang              0
oldpeak            0
slope              0
num_mjr_vess       4
thal               2
dx_num             0
dtype: int64

In [10]:
df[['thal', 'sex', 'chest_pain_type', 'restecg', 'exang', 'slope','dx_num', 'blood_sugar']] = df[['thal','sex', 'chest_pain_type', 'restecg', 'exang', 'slope','dx_num', 'blood_sugar']].astype(str)
df['num_mjr_vess'] = df['num_mjr_vess'].astype(float)

In [11]:
df

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heart_rate,exang,oldpeak,slope,num_mjr_vess,thal,dx_num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [12]:
df.dtypes

age                float64
sex                 object
chest_pain_type     object
restbps            float64
chol               float64
blood_sugar         object
restecg             object
max_heart_rate     float64
exang               object
oldpeak            float64
slope               object
num_mjr_vess       float64
thal                object
dx_num              object
dtype: object

In [13]:
y = df['dx_num']
X = df.iloc[:, :-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=85)

In [14]:
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
catergorical_cols = X_train.select_dtypes(include=object).columns.tolist()

In [15]:
X_train

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heart_rate,exang,oldpeak,slope,num_mjr_vess,thal
256,67.0,0.0,4.0,106.0,223.0,0.0,0.0,142.0,0.0,0.3,1.0,2.0,3.0
102,57.0,0.0,4.0,128.0,303.0,0.0,2.0,159.0,0.0,0.0,1.0,1.0,3.0
48,65.0,0.0,3.0,140.0,417.0,1.0,2.0,157.0,0.0,0.8,1.0,1.0,3.0
215,56.0,1.0,1.0,120.0,193.0,0.0,2.0,162.0,0.0,1.9,2.0,0.0,7.0
46,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0
52,44.0,1.0,4.0,112.0,290.0,0.0,2.0,153.0,0.0,0.0,1.0,1.0,3.0
51,65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0
196,69.0,1.0,1.0,160.0,234.0,1.0,2.0,131.0,0.0,0.1,2.0,1.0,3.0


In [16]:
preprocess_cat_1 = make_pipeline(
                  SimpleImputer(),
                  OneHotEncoder(sparse=False)
                  )

preprocess_cat_2= make_pipeline(
                  SimpleImputer(strategy='most_frequent')
                  )

preprocess_num_1= make_pipeline(
                  SimpleImputer(),
                  StandardScaler()
                  )

preprocess_num_2= make_pipeline(
                  SimpleImputer(strategy='mean'),
                  StandardScaler()
                  )

In [17]:
preprocess_data = make_column_transformer(
    (preprocess_cat_1, catergorical_cols),
    (preprocess_num_1, numeric_cols),
    remainder='passthrough'
    )

In [18]:
knn = KNeighborsClassifier()
pipe_model = make_pipeline(preprocess_data, knn)

In [19]:
pipe_model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['sex', 'chest_pain_type',
                                                   'blood_sugar', 'restecg',
                                                   'exang', 'slope', 'thal']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                    

In [20]:
pipe_model.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('onehotencoder',
                                                     OneHotEncoder(sparse=False))]),
                                    ['sex', 'chest_pain_type', 'blood_sugar',
                                     'restecg', 'exang', 'slope', 'thal']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'restbps', 'chol', 'max_

In [21]:
preprocess_cat_1.get_params()

{'memory': None,
 'steps': [('simpleimputer', SimpleImputer()),
  ('onehotencoder', OneHotEncoder(sparse=False))],
 'verbose': False,
 'simpleimputer': SimpleImputer(),
 'onehotencoder': OneHotEncoder(sparse=False),
 'simpleimputer__add_indicator': False,
 'simpleimputer__copy': True,
 'simpleimputer__fill_value': None,
 'simpleimputer__missing_values': nan,
 'simpleimputer__strategy': 'mean',
 'simpleimputer__verbose': 0,
 'onehotencoder__categories': 'auto',
 'onehotencoder__drop': None,
 'onehotencoder__dtype': numpy.float64,
 'onehotencoder__handle_unknown': 'error',
 'onehotencoder__sparse': False}

In [22]:
preprocess_num_1

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler())])

In [23]:
knn_parameters = {
    "kneighborsclassifier__n_neighbors" : [1,3,5,7,9],
    "kneighborsclassifier__weights" : ["uniform", "distance"],
    "kneighborsclassifier__metric": ["manhattan", "euclidean", "minkowski"],
    "kneighborsclassifier__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "kneighborsclassifier__leaf_size": range(1,50,5)
}

In [24]:
gs = GridSearchCV(
                  estimator  =  pipe_model, 
                  param_grid =  knn_parameters, 
                  cv         =  5, 
                  scoring    =  'accuracy',
                  n_jobs     =  -1)

In [25]:
grid_res = gs.fit(X_train, y_train)



In [26]:
grid_res.best_params_

{'kneighborsclassifier__algorithm': 'auto',
 'kneighborsclassifier__leaf_size': 1,
 'kneighborsclassifier__metric': 'manhattan',
 'kneighborsclassifier__n_neighbors': 1,
 'kneighborsclassifier__weights': 'uniform'}

In [27]:
final_model = KNeighborsClassifier(
    n_neighbors = 1,
    algorithm = 'auto',
    leaf_size = 1,
    metric = 'manhattan',
    weights = 'uniform'
)

In [28]:
pipe_model = make_pipeline(preprocess_data, final_model)

In [29]:
pipe_model.fit(X_train, y_train)
y_pred = pipe_model.predict(X_test)

In [30]:
print(classification_report(y_test, y_pred, zero_division= 1))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79        44
           1       0.22      0.33      0.26        15
           2       0.20      0.14      0.17        14
           3       0.38      0.25      0.30        12
           4       0.33      0.17      0.22         6

    accuracy                           0.52        91
   macro avg       0.38      0.34      0.35        91
weighted avg       0.51      0.52      0.51        91



In [31]:
logit=LogisticRegression()
pipe_model = make_pipeline(preprocess_data, logit)
pipe_model.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('onehotencoder',
                                                     OneHotEncoder(sparse=False))]),
                                    ['sex', 'chest_pain_type', 'blood_sugar',
                                     'restecg', 'exang', 'slope', 'thal']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'restbps', 'chol', 'max_

In [32]:
logit_parameters = {
                    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                    'logisticregression__penalty':['none', 'elasticnet', 'l1', 'l2'],
                    'logisticregression__C':[0.001, 0.01, 0.1, 1, 10, 100]
                   }

gs = GridSearchCV(
                  estimator  =  pipe_model, 
                  param_grid =  logit_parameters, 
                  cv         =  5, 
                  scoring    =  'accuracy',
                  n_jobs     =  -1)

In [33]:
grid_res = gs.fit(X_train, y_train)

270 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ali\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ali\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ali\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\ali\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 464, in _

In [34]:
grid_res.best_params_

{'logisticregression__C': 0.001,
 'logisticregression__penalty': 'none',
 'logisticregression__solver': 'newton-cg'}

In [35]:
final_model = LogisticRegression(
    penalty='none',
    C=0.001,
    solver='newton-cg'
)

In [36]:
pipe_model = make_pipeline(preprocess_data, final_model)
pipe_model.fit(X_train, y_train)
y_pred = pipe_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division= 1))

  "Setting penalty='none' will ignore the C and l1_ratio parameters"


              precision    recall  f1-score   support

           0       0.84      0.86      0.85        44
           1       0.15      0.20      0.17        15
           2       0.20      0.07      0.11        14
           3       0.26      0.42      0.32        12
           4       0.00      0.00      0.00         6

    accuracy                           0.52        91
   macro avg       0.29      0.31      0.29        91
weighted avg       0.50      0.52      0.50        91



In [37]:
from sklearn.ensemble import RandomForestClassifier
feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)