# Model Validation Methods

#### 1. Evaluate using a train and a test set

In [None]:
# Evaluate using a train and a test set
from pandas import read_csv
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 

# Location of the Pima Indians diabetes CSV and the labels assigned to its
# nine columns (passed to read_csv explicitly via names=).
filename = '/content/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)

# Hold-out configuration: 33% of the rows are reserved for testing, and the
# fixed seed makes the random split reproducible between runs.
test_size = 0.33
seed = 7

# The first eight columns are the input features; the ninth is the label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Fit on the training portion only, then score on the held-out test portion.
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)


In [None]:
result  # accuracy on the held-out test set, as a fraction

0.7874015748031497

In [None]:
result*100.0  # the same test-set accuracy expressed as a percentage

78.74015748031496

In [None]:
dataframe  # display the loaded dataset for inspection


#### 2. Evaluate using K-Fold Cross Validation

In [None]:
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Path to the dataset and the labels to use for its nine columns.
filename = '/content/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
# The column labels are supplied explicitly through names=.
dataframe = read_csv(filename, names=names)

In [None]:
dataframe  # display the loaded dataset for inspection

In [None]:
# Cross-validation settings.
num_folds = 10
seed = 7  # NOTE: never passed to KFold below, so it has no effect on the split

# Separate the feature columns (first eight) from the class label (ninth).
# This is a features/target separation, not a train/test split; the folds
# themselves are produced by KFold.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Score a logistic regression once per fold; `results` holds one score per fold.
model = LogisticRegression(max_iter=200)
kfold = KFold(n_splits=num_folds)
results = cross_val_score(model, X, Y, cv=kfold)

In [None]:
results # per-fold accuracy: one score for each of the 10 folds

In [None]:
results.mean()*100.0 # overall accuracy (%): the mean of the per-fold accuracies

77.60423786739577

In [None]:
results.std()*100.0 
# Standard deviation of the per-fold accuracies, in percentage points (about +/- 5 here).
# A very high value would mean the model performs inconsistently across folds of this dataset.

5.157545262086822

#### 3. Evaluate using Leave One Out Cross Validation

In [None]:
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the dataset, labelling its nine columns explicitly.
filename = '/content/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)

# The first eight columns are the input features; the ninth is the label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Leave-one-out: each row is held out once while the model is fitted on all
# the others, so `results` ends up with one score per row.
model = LogisticRegression(max_iter=300)
loocv = LeaveOneOut()
results = cross_val_score(model, X, Y, cv=loocv)

In [None]:
results  # one score per held-out row: 1.0 if it was predicted correctly, 0.0 otherwise

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [None]:
results.mean()  # fraction of held-out rows that were predicted correctly

0.7760416666666666

In [None]:
X.shape  # (rows, features); leave-one-out produces one score per row

(768, 8)

In [None]:
results.mean()*100.0  # leave-one-out accuracy expressed as a percentage

77.60416666666666

In [None]:
results.std()*100.0 
# Each leave-one-out score comes from a single test row, so it is either 0% or 100%;
# a spread of such values is inevitably large.
# The standard deviation is therefore not a useful consistency measure here and should be ignored.

41.68944689773287

In [None]:
results # here 1 means the held-out row was predicted correctly (100% accuracy) and 0 means it was not (0%)

In [None]:
import numpy as np
np.array([100,100,100,0,0]).std() # sanity check: values that are all either 100 or 0 give a similarly large std

48.98979485566356