In [60]:
import sklearn
sklearn.__version__

'1.5.1'

In [61]:
# EDA and Plotting Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Helps to view images in the notebook
%matplotlib inline 

# Models From SciKit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluation Metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay 


In [62]:
df = pd.read_csv("./heart-disease.csv")

## DATA EXPLORATION
1. What question(s) are you trying to solve ?
2. What kind of data do we have and how do we treat different types?
3. What's missing from the data and how do you deal with it?
4. What are the outliers and why should you care about them?
5. How can you add, change or remove features to get more out of your data?

In [63]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [64]:
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [65]:
#Check Imabalance or Balance Data. Let's find out how many of each class there...
df['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

In [66]:
df['target'].value_counts().plot(kind="bar", color=["salmon","lightblue"]);

In [67]:
# Check Missing Values
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [68]:
# check info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [69]:
# Check Maths 
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [75]:
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(5,10))
ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu")


In [76]:
# 5.Modelling
X = df.drop("target", axis=1)
Y = df["target"]

In [78]:
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((212, 13), (91, 13), (212,), (91,))

In [79]:
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier()
    }


def  fit_and_score(models, x_train, x_test, y_train, y_test):
  
  np.random.seed(42)

  model_scores = {}

  for name, model in models.items():
    
    model.fit(x_train, y_train)
    
    model_scores[name] = model.score(x_test, y_test)

  return model_scores

In [80]:
model_scores = fit_and_score(models=models, x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)
model_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': 0.8131868131868132,
 'KNN': 0.6593406593406593,
 'Random Forest': 0.8241758241758241}

In [84]:
# Model Comparision

model_compare = pd.DataFrame(model_scores, index=["accuracy"])

# 
# model_compare,
model_compare.T.plot.bar()

<Axes: >

In [None]:
# Now we have base models, these are not final ones... we need to improve
# Lets look at the following:

# * Hyperparmaeter tuning
# * Feature importance
# * Confusion matrix 
# * Cross-validation
# * Precision 
# * Recall 
# * F1 score 
# * Classification report 
# * ROC curve 
# * Area under the curve (AUC)

# HYPERPARAMETER TUNING

# knn

train_scores = []
test_scores = []

# Create a list of different values for n_neighbours
neighbours = range(1,21)

# Setup KNN instance 
knn = KNeighborsClassifier()

# Loop through different n_neighbors
for i in neighbours:
    knn.set_params(n_neighbors=i)

    knn.fit(x_train, y_train)

    # Train Scores 
    train_scores.append(knn.score(x_train, y_train))

    # Test Scores 
    test_scores.append(knn.score(x_test, y_test))


train_scores,

test_scores



[0.5934065934065934,
 0.6153846153846154,
 0.6263736263736264,
 0.6373626373626373,
 0.6593406593406593,
 0.6483516483516484,
 0.6703296703296703,
 0.6593406593406593,
 0.6593406593406593,
 0.6923076923076923,
 0.6703296703296703,
 0.6593406593406593,
 0.6703296703296703,
 0.6153846153846154,
 0.6483516483516484,
 0.6593406593406593,
 0.6813186813186813,
 0.6703296703296703,
 0.6703296703296703,
 0.6813186813186813]

In [93]:
plt.plot(neighbours, train_scores, label="Train score")
plt.plot(neighbours, test_scores, label="Test score")
plt.xticks(np.arange(1,21,1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Max KNN score on the test data: {max(test_scores) * 100:.2f}%")

Max KNN score on the test data: 69.23%


In [100]:
# HYPERTUINING WITH RANDOMIZEDSEARCHCV 

# LogisticRegression
LOG_REGI_GRID = {
    "C": np.logspace(-4, 4, 20),
    "solver":["liblinear"]
    }
# RandomForestClassifier
rf_grid = {
    "n_estimators": np.arange(10,1000,50),
    "max_depth":[None, 3,5,10],
    "min_samples_split": np.arange(2,20,2),
    "min_samples_leaf": np.arange(1,20,2)
    }

In [97]:
# RandomizedSearchCV
np.random.seed(43)

rs_log_reg = RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=LOG_REGI_GRID,
    cv=5,
    n_iter=20,
    verbose=True
)

rs_log_reg.fit(x_train, y_train)

rs_log_reg.best_params_


Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'solver': 'liblinear', 'C': 0.08858667904100823}

In [98]:
rs_log_reg.score(x_test, y_test)

0.8461538461538461

In [102]:
np.random.seed(42)

rs_rf = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=rf_grid,
    cv = 5,
    n_iter=2, # No of combination of different parameters
    verbose= True
)

rs_rf.fit(x_train, y_train)

rs_rf.best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


{'n_estimators': 510,
 'min_samples_split': 18,
 'min_samples_leaf': 19,
 'max_depth': 5}

In [103]:
rs_rf.score(x_test, y_test)

0.8351648351648352

In [104]:
model_scores

{'Logistic Regression': 0.8131868131868132,
 'KNN': 0.6593406593406593,
 'Random Forest': 0.8241758241758241}

In [110]:
#  By hand KNn no 
# after random,
# now log is king, we can check by the GridSearchCV

log_reg_grid = {
    "C": np.logspace(-4,4,20),
    "solver":["liblinear"]
    }

gs_log_reg = GridSearchCV(  # no iteratinos s ince GridSearch will check all combinations
    LogisticRegression(),
    param_grid= log_reg_grid,
    cv=5,
    verbose=True                      
                          )

gs_log_reg.fit(x_train, y_train)
gs_log_reg.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'C': 0.08858667904100823, 'solver': 'liblinear'}

In [111]:
gs_log_reg.score(x_test, y_test)

0.8461538461538461

In [113]:
# EVALUATE MODEL 
# ROC curve and AUC score
# Confusin matrix 
# Classification report 
# Precision 
# Recall 
# F1-score 

y_predcs = gs_log_reg.predict(x_test)

y_predcs

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1], dtype=int64)

In [114]:
y_test

179    0
228    0
111    1
246    0
60     1
      ..
250    0
19     1
143    1
79     1
144    1
Name: target, Length: 91, dtype: int64

In [115]:
RocCurveDisplay(gs_log_reg, x_test, y_test)

TypeError: RocCurveDisplay.__init__() takes 1 positional argument but 4 were given

In [118]:
print(confusion_matrix(y_test, y_predcs))

[[34  7]
 [ 7 43]]


In [121]:
# sns.set(s=1.)

def plot_conf_mat(y_test, y_preds):
    fig, ax = plt.subplots(figsize=(3,3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds), annot=True)

plot_conf_mat(y_test, y_predcs)

In [122]:
print(classification_report(y_test, y_predcs))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        41
           1       0.86      0.86      0.86        50

    accuracy                           0.85        91
   macro avg       0.84      0.84      0.84        91
weighted avg       0.85      0.85      0.85        91



In [123]:
# cross val score 

gs_log_reg.best_params_

{'C': 0.08858667904100823, 'solver': 'liblinear'}

In [124]:
clf = LogisticRegression(C=0.088, solver="liblinear")


In [126]:
# Cross-validated accuracy
cv_acc = cross_val_score(clf, X, Y, cv=5, scoring="accuracy" )
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.86666667, 0.75      ])

In [128]:
cv_acc = np.mean(cv_acc)

cv_acc

0.834808743169399

In [136]:
# PREICSION AUC
cv_precision = cross_val_score(clf, X, Y, cv=5, scoring="precision")
cv_precision = np.mean(cv_precision)
cv_precision

0.8182683982683983

In [137]:
# PREICSION Recall
cv_recall = cross_val_score(clf, X, Y, cv=5, scoring="recall")
cv_recall = np.mean(cv_recall)
cv_recall

0.9030303030303031

In [134]:
# PREICSION Recall
cv_f1 = cross_val_score(clf, X, Y, cv=5, scoring="f1")
cv_f1 = np.mean(cv_f1)
cv_f1

0.8572876223964055

In [139]:
cv_metrics = pd.DataFrame({
    "Accuracy": cv_acc,
    "Preicision":cv_precision,
    "Recall": cv_recall,
    "F1": cv_f1
},index=[0])

cv_metrics.T.plot.bar(title="Cross-validated classification metrics")

<Axes: title={'center': 'Cross-validated classification metrics'}>

In [142]:
# FEATURE IMPORTANCE 
gs_log_reg.best_params_
clf= LogisticRegression(C=0.08, solver="liblinear")
clf.fit(x_train, y_train);

In [143]:
clf.coef_

array([[-0.00224045, -0.45731978,  0.53586369, -0.00662923, -0.00130361,
         0.07083533,  0.17606734,  0.01994758, -0.42030206, -0.42986912,
         0.32533544, -0.61330349, -0.58817811]])

In [None]:
feature_dict = dict(zip(df.columns, list(clf.coef_[0])))
feature_dict

{'age': -0.0022404491567117823,
 'sex': -0.4573197814633943,
 'cp': 0.5358636886270007,
 'trestbps': -0.006629226318991506,
 'chol': -0.0013036131922819906,
 'fbs': 0.0708353265915484,
 'restecg': 0.17606734301000335,
 'thalach': 0.019947576397343333,
 'exang': -0.4203020572955751,
 'oldpeak': -0.42986911818786655,
 'slope': 0.3253354416121168,
 'ca': -0.6133034896844182,
 'thal': -0.5881781053317633}

In [146]:
# Visualize feature importance  => Here we are doing Feature importance after building model
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.T.plot.bar(title="Feature Importance")

<Axes: title={'center': 'Feature Importance'}>