## Cross Validation Techniques


### Dataset used in Heart Attack Analysis & Prediction Dataset (A dataset for heart attack classification) can be downloaded from

https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset
    

In [38]:
# Cross validation is a method of estimating expected prediction error.

# “Cross-Validation in machine learning is a technique that is used to train and evaluate our model on a portion of our database, 
# before re-portioning our dataset and evaluating it on the new portions.”
# In simpler words instead of splitting our dataset into 2 different parts (one for training and other for testing),
# Split your dataset into multiple portions, use some of them for training and rest for testing

In [3]:
import pandas as pd

### Load the data

In [8]:
df=pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
### Independent and dependent features

X = df.iloc[:, : -1]
y = df.iloc[:, -1]

In [10]:
X.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [11]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: output, Length: 303, dtype: int64

In [12]:
y.value_counts()

# 1 represents heart attack.
# 0 represents no heart attack.

1    165
0    138
Name: output, dtype: int64

### Hold Out Validation Approach- Train And Test Split

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 4)

model = DecisionTreeClassifier()

model.fit(X_train,  y_train)
result = model.score(X_test, y_test)

print(result)

0.8032786885245902


### K-Fold Cross Validation


In [25]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import cross_val_score


In [26]:
model = DecisionTreeClassifier()
kfold_validation = KFold(10)

results = cross_val_score(model, X, y, cv = kfold_validation)
print(results)
print(np.mean(results)*100)
print(np.std(results)*100)

[0.74193548 0.93548387 0.77419355 0.63333333 0.66666667 0.86666667
 0.83333333 0.83333333 0.56666667 0.53333333]
73.8494623655914
12.741350197528234


### Stratified K-fold Cross Validation

In [27]:
from sklearn.model_selection import StratifiedKFold

In [28]:
skfold = StratifiedKFold(n_splits=5)
model = DecisionTreeClassifier()
scores = cross_val_score(model, X, y, cv = skfold)
print(np.mean(results)*100)
print(np.std(results)*100)

73.8494623655914
12.741350197528234


In [29]:
scores

array([0.75409836, 0.80327869, 0.73770492, 0.75      , 0.73333333])

### Leave One Out Cross Validation

In [30]:
from sklearn.model_selection import LeaveOneOut

In [31]:
model = DecisionTreeClassifier()
leave_validation = LeaveOneOut()
results = cross_val_score(model, X, y, cv = leave_validation)
results

array([1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1.,
       1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [32]:
print(np.mean(results) * 100)
print(np.std(results) * 100)

79.20792079207921
40.58198323152316


### Repeated Random Train_test_set Splits

In [33]:
from sklearn.model_selection import ShuffleSplit

In [34]:
model = DecisionTreeClassifier()
ssplit = ShuffleSplit(n_splits = 10, test_size = 0.30)
results = cross_val_score(model, X, y, cv = ssplit)

In [35]:
results

array([0.76923077, 0.71428571, 0.71428571, 0.73626374, 0.75824176,
       0.73626374, 0.76923077, 0.68131868, 0.75824176, 0.76923077])

In [36]:
print(np.mean(results) * 100)
print(np.std(results) * 100)

74.06593406593407
2.8316700498296976
