Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

Data Collection and Analysis

PIMA Diabetes Dataset

In [None]:
# loading the diabetes dataset to a pandas DataFrame
heart_dataset = pd.read_csv('/content/heart.csv')

In [None]:
# printing the first 5 rows of the dataset
heart_dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# number of rows and Columns in this dataset
heart_dataset.shape

(303, 14)

In [None]:
# getting the statistical measures of the data
heart_dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:
heart_dataset['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

0 --> No Heart Disease

1 --> Heart Disease

In [None]:
heart_dataset.groupby('target').mean()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,56.601449,0.826087,0.478261,134.398551,251.086957,0.15942,0.449275,139.101449,0.550725,1.585507,1.166667,1.166667,2.543478
1,52.49697,0.563636,1.375758,129.30303,242.230303,0.139394,0.593939,158.466667,0.139394,0.58303,1.593939,0.363636,2.121212


In [None]:
# separating the data and labels
X = heart_dataset.drop(columns = 'target', axis=1)
Y = heart_dataset['target']

In [None]:
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  
0        0   0     1  
1        0   0     2  
2        2   0    

In [None]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64


Train Test SplitS



In [None]:
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Splitting into 75% train and 25% test
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=2)

# Splitting into 70% train and 30% test
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=2)

# Splitting into 60% train and 40% test
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X, Y, test_size=0.4, stratify=Y, random_state=2)

# Splitting into 90% train and 10% test
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=2)

# Splitting into 50% train and 50% test
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X, Y, test_size=0.5, stratify=Y, random_state=2)


In [None]:
print(X.shape, X_train1.shape, X_test1.shape)
print(X.shape, X_train2.shape, X_test2.shape)
print(X.shape, X_train3.shape, X_test3.shape)

(303, 13) (242, 13) (61, 13)
(303, 13) (227, 13) (76, 13)
(303, 13) (212, 13) (91, 13)


Training the Model

In [None]:
#training the all four classifiers
def result(X_train, X_test, Y_train, Y_test):
  svmmodel = svm.SVC(kernel='linear')
  svmmodel.fit(X_train, Y_train)
  logmodel = LogisticRegression(random_state=1)
  logmodel.fit(X_train, Y_train)
  dctmodel = DecisionTreeClassifier(criterion = 'entropy',random_state = 0)
  dctmodel.fit(X_train, Y_train)
  rfcmodel = RandomForestClassifier(n_estimators = 100,criterion = 'entropy',random_state = 0)
  rfcmodel.fit(X_train, Y_train)
  y_pred_svm = svmmodel.predict(X_test)
  y_pred_log=logmodel.predict(X_test)
  y_pred_dct=dctmodel.predict(X_test)
  y_pred_rfc=rfcmodel.predict(X_test)
  metrics_svm = {
      'Model': 'SVM',
      'Accuracy': accuracy_score(Y_test, y_pred_svm),
      'Precision': precision_score(Y_test, y_pred_svm),
      'Recall': recall_score(Y_test, y_pred_svm),
      'F1 Score': f1_score(Y_test, y_pred_svm),
      'ROC': roc_auc_score(Y_test, y_pred_svm)
  }

  metrics_log = {
      'Model': 'Logistic Regression',
      'Accuracy': accuracy_score(Y_test, y_pred_log),
      'Precision': precision_score(Y_test, y_pred_log),
      'Recall': recall_score(Y_test, y_pred_log),
      'F1 Score': f1_score(Y_test, y_pred_log),
      'ROC': roc_auc_score(Y_test, y_pred_log)
  }

  metrics_dct = {
      'Model': 'Decision Tree',
      'Accuracy': accuracy_score(Y_test, y_pred_dct),
      'Precision': precision_score(Y_test, y_pred_dct),
      'Recall': recall_score(Y_test, y_pred_dct),
      'F1 Score': f1_score(Y_test, y_pred_dct),
      'ROC': roc_auc_score(Y_test, y_pred_dct)
  }

  metrics_rfc = {
      'Model': 'Random Forest',
      'Accuracy': accuracy_score(Y_test, y_pred_rfc),
      'Precision': precision_score(Y_test, y_pred_rfc),
      'Recall': recall_score(Y_test, y_pred_rfc),
      'F1 Score': f1_score(Y_test, y_pred_rfc),
      'ROC': roc_auc_score(Y_test, y_pred_rfc)
  }

  model_metrics = pd.DataFrame([metrics_svm, metrics_log, metrics_dct, metrics_rfc])

  print(model_metrics)



Model Evaluation

Accuracy Score

In [None]:
result(X_train1, X_test1, Y_train1, Y_test1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score       ROC
0                  SVM  0.819672   0.805556  0.878788  0.840580  0.814394
1  Logistic Regression  0.819672   0.843750  0.818182  0.830769  0.819805
2        Decision Tree  0.655738   0.714286  0.606061  0.655738  0.660173
3        Random Forest  0.803279   0.818182  0.818182  0.818182  0.801948


In [None]:
result(X_train2, X_test2, Y_train2, Y_test2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score       ROC
0                  SVM  0.842105   0.837209  0.878049  0.857143  0.839024
1  Logistic Regression  0.842105   0.871795  0.829268  0.850000  0.843206
2        Decision Tree  0.736842   0.818182  0.658537  0.729730  0.743554
3        Random Forest  0.802632   0.825000  0.804878  0.814815  0.802439


In [None]:
result(X_train3, X_test3, Y_train3, Y_test3)

                 Model  Accuracy  Precision  Recall  F1 Score       ROC
0                  SVM  0.824176   0.840000    0.84  0.840000  0.822439
1  Logistic Regression  0.846154   0.891304    0.82  0.854167  0.849024
2        Decision Tree  0.703297   0.767442    0.66  0.709677  0.708049
3        Random Forest  0.846154   0.875000    0.84  0.857143  0.846829


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
result(X_train4, X_test4, Y_train4, Y_test4)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score       ROC
0                  SVM  0.827869   0.826087  0.863636  0.844444  0.824675
1  Logistic Regression  0.836066   0.838235  0.863636  0.850746  0.833604
2        Decision Tree  0.688525   0.741379  0.651515  0.693548  0.691829
3        Random Forest  0.836066   0.828571  0.878788  0.852941  0.832251


In [None]:
result(X_train5, X_test5, Y_train5, Y_test5)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score       ROC
0                  SVM  0.774194   0.777778  0.823529  0.800000  0.768908
1  Logistic Regression  0.774194   0.812500  0.764706  0.787879  0.775210
2        Decision Tree  0.645161   0.714286  0.588235  0.645161  0.651261
3        Random Forest  0.741935   0.800000  0.705882  0.750000  0.745798


In [None]:
result(X_train6, X_test6, Y_train6, Y_test6)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score       ROC
0                  SVM  0.848684   0.857143  0.867470  0.862275  0.846778
1  Logistic Regression  0.835526   0.845238  0.855422  0.850299  0.833508
2        Decision Tree  0.750000   0.835821  0.674699  0.746667  0.757639
3        Random Forest  0.796053   0.833333  0.783133  0.807453  0.797363
