In [56]:
import numpy as np
import pandas as pd

In [57]:
heart = pd.read_csv('/content/heart.csv')

In [58]:
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [59]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


We can see that there are numerous independent variables and one dependent variable i.e target.

In [60]:
# Exploring the target variable

# We can note here that the classes are balanced and the target variable is ready for modelling

heart['target'].value_counts()

1    526
0    499
Name: target, dtype: int64

In [61]:
# We have no missing values to worry about
heart.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [62]:
# Splitting into X and y

X=heart.iloc[:,:-1]
y=heart['target']

## Splitting into train and test

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
X_train,X_test, y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state =23)

In [65]:
# Let's go ahead and fit an SVC Classifier

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC


# We use a baseline model of LinearSVC and observe a score of 85.3%( may vary based on random seed)

In [66]:
svm_clf= Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge')),
])

In [67]:
svm_clf.fit(X_train,y_train)



Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [68]:
svm_clf.score(X_test,y_test)

0.8536585365853658

In [69]:
# We shall try the same with a higher hyperparameter

# c=10

svm_clf= Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=10, loss='hinge')),
])

In [70]:
svm_clf.fit(X_train,y_train)



Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=10, loss='hinge'))])

In [71]:
# We see a relative improvement in the score
svm_clf.score(X_test,y_test)

0.8536585365853658

In [72]:
y_train.value_counts()

1    421
0    399
Name: target, dtype: int64

# We shall implement the Random Forest Algorithm

In [73]:
from sklearn.ensemble import RandomForestClassifier


In [74]:
RF_clf = RandomForestClassifier(n_estimators=2)

In [75]:
RF_clf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=2)

In [76]:
RF_clf.score(X_test,y_test)

0.9707317073170731

In [77]:
RF_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 2,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [78]:
y_preds = RF_clf.predict(X_test)
y_preds

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0])

In [79]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [80]:
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       100
           1       1.00      0.94      0.97       105

    accuracy                           0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205



In [81]:
confusion_matrix(y_test,y_preds)

array([[100,   0],
       [  6,  99]])

In [82]:
# This value is the same as the default value that is output by the model.score()
accuracy_score(y_test,y_preds)

0.9707317073170731

# Trying to improve the model

In [83]:
np.random.seed(42)

for i in range(1,5,1):
  print(f'Trying model with{i} estimators..')
  clf=RandomForestClassifier(n_estimators=i).fit(X_train,y_train)
  print(f'Model accuracy on the test set would be: ', clf.score(X_train,y_train))

Trying model with1 estimators..
Model accuracy on the test set would be:  0.9926829268292683
Trying model with2 estimators..
Model accuracy on the test set would be:  0.9963414634146341
Trying model with3 estimators..
Model accuracy on the test set would be:  0.9878048780487805
Trying model with4 estimators..
Model accuracy on the test set would be:  0.9975609756097561


# We achieve the best result with 4 estimators and achieve an accuracy of 99.75% on the Test data