In [32]:
# Cross Validation in Machine learning
# Here we will see how to build and test all the models at one time

# First we will do that by train_test_split and then by cross_val_score
# Train test split calculates the accuracy score on only one specific test split.
# Cross val score (Cross Validation) takes into consideration all the data points and hence it is more reliable

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# pd.read_csv already returns a DataFrame, so it is assigned to `heart` directly
# instead of being wrapped in a second pd.DataFrame(...) call.
HEART_CSV_PATH = '/content/drive/MyDrive/ML Datasets/heart_disease_data.csv'
heart = pd.read_csv(HEART_CSV_PATH)

In [4]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
# Count the missing values in each column.
heart.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
# Count the rows in each target class to check how balanced the labels are.
heart['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

In [7]:
# Feature matrix: every column except the label column.
X = heart.drop('target', axis=1)

In [9]:
# Labels: the 'target' column on its own.
Y = heart['target']

In [11]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [12]:
# Shapes of the full, training and test feature matrices, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

(303, 13) (242, 13) (61, 13)


In [16]:
# Candidate classifiers to compare. The random forest is given a fixed
# random_state so that repeated runs of the notebook produce the same scores.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=2),
]

In [19]:
def compare_models_train_test_split():
  """Fit each model in ``models`` on the training split and print its test accuracy.

  Reads the module-level ``models``, ``X_train``, ``Y_train``, ``X_test`` and
  ``Y_test``. Each estimator is fitted in place, so the objects in ``models``
  are left trained after the call. Returns nothing; results are printed.
  """
  for model in models:
    model.fit(X_train, Y_train)
    test_predictions = model.predict(X_test)
    # accuracy_score expects (y_true, y_pred); pass the true labels first.
    accuracyscore = accuracy_score(Y_test, test_predictions)
    print("The accuracy score of ", model, "= ", accuracyscore)

In [20]:
# Run the hold-out (train/test split) comparison for every model in the list.
compare_models_train_test_split()

The accuracy score of  LogisticRegression(max_iter=1000) =  0.8032786885245902
The accuracy score of  SVC(kernel='linear') =  0.819672131147541
The accuracy score of  KNeighborsClassifier() =  0.6229508196721312
The accuracy score of  RandomForestClassifier() =  0.8032786885245902


# **Cross Validation**

In [24]:
# 5-fold cross-validation for Logistic Regression
cv_logistic = cross_val_score(LogisticRegression(max_iter=1000), X, Y, cv=5)
print(cv_logistic)

# Average the fold accuracies and report the result as a percentage (2 decimals).
mean_number = round(sum(cv_logistic) / len(cv_logistic) * 100, 2)
print(mean_number)

[0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
82.83


In [25]:
# This is for Support Vector Machine
# The scores get their own name (cv_svm) so they do not overwrite the
# Logistic Regression results stored in cv_logistic.
cv_svm = cross_val_score(SVC(kernel='linear'), X,Y, cv=5)
print(cv_svm)

# Mean of the fold accuracies, as a percentage rounded to two decimals.
mean_number = sum(cv_svm)/len(cv_svm)
mean_number = mean_number*100
mean_number = round(mean_number,2)
print(mean_number)

[0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
82.83


# **Creating a function so that we can evaluate all the models at one time**

In [30]:
# Candidate classifiers for the cross-validation comparison. The random forest
# is given a fixed random_state so that repeated runs produce the same scores.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=2),
]

def compare_models():
  """Print the 5-fold cross-validation accuracies of every model in ``models``.

  For each estimator in the module-level ``models`` list, runs
  ``cross_val_score`` on the full ``X``/``Y`` data with ``cv=5``, then prints
  the per-fold scores followed by their mean as a percentage rounded to two
  decimals. Returns nothing.
  """
  for estimator in models:
    fold_scores = cross_val_score(estimator, X, Y, cv=5)
    # Mean of the fold accuracies, expressed as a percentage.
    mean_pct = round(sum(fold_scores) / len(fold_scores) * 100, 2)
    print("The 5 accuracy score of the ", estimator, "=", fold_scores)
    print("The total accuracy for the ", estimator, "=", mean_pct)
    print("---------------------------------------------------------------")

In [31]:
# Run the 5-fold cross-validation comparison for every model in the list.
compare_models()

The 5 accuracy score of the  LogisticRegression(max_iter=1000) = [0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
The total accuracy for the  LogisticRegression(max_iter=1000) = 82.83
---------------------------------------------------------------
The 5 accuracy score of the  SVC(kernel='linear') = [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
The total accuracy for the  SVC(kernel='linear') = 82.83
---------------------------------------------------------------
The 5 accuracy score of the  KNeighborsClassifier() = [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
The total accuracy for the  KNeighborsClassifier() = 64.39
---------------------------------------------------------------
The 5 accuracy score of the  RandomForestClassifier() = [0.83606557 0.86885246 0.80327869 0.83333333 0.78333333]
The total accuracy for the  RandomForestClassifier() = 82.5
---------------------------------------------------------------
