#### Evaluating ML algorithms

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit, cross_val_score

In [3]:
# Evaluate the algorithm on a single train/test split first.

In [4]:
# Read the Pima Indians diabetes CSV into a DataFrame.
# Column labels are passed explicitly through `names` rather than read from the file.
file = "../datasets/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
data = pd.read_csv(filepath_or_buffer=file, names=names)

In [5]:
# Pull the raw values out of the DataFrame, then separate them:
# the first eight columns are the inputs (X), the ninth column is the target (Y).
array = data.values
X, Y = array[:, :8], array[:, 8]

In [6]:
# Single hold-out split: `test_size` is the fraction of rows reserved for testing,
# and `seed` fixes the shuffle so the split is reproducible.
seed = 7
test_size = 0.3
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)


In [7]:
# Fit logistic regression on the training split, then score it on the held-out split.
# `score` returns the accuracy as a fraction, e.g. 0.76 means 76%.
model = LogisticRegression(solver="liblinear").fit(X_train, Y_train)
result = model.score(X_test, Y_test)
result

0.7619047619047619

In [9]:
# k-fold cross-validation (k = 10): the model is scored once per fold.
# The mean of the per-fold accuracies is the overall estimate; the standard
# deviation shows how much the score varies from fold to fold.
# `seed` (defined with the train/test split) replaces the hard-coded 7 so every
# resampling scheme in the notebook shares one random state.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 77.086% (5.091%)


In [10]:
# Leave-one-out cross-validation: every test fold holds exactly one sample,
# so there are as many folds as rows. The mean is the overall accuracy, as above.
# NOTE: each per-fold score is computed on a single sample, so it is either 0 or 1;
# the printed standard deviation is the spread of those 0/1 outcomes, which is why
# it is far larger than for the other schemes.
loocv = LeaveOneOut()
results = cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.823% (42.196%)


In [11]:
# Repeated random train/test splits: draw 10 independent shuffled splits, each
# holding out `test_size` of the rows, and score the model on every one.
# The splitter gets its own name so it does not overwrite the KFold object bound
# to `kfold` earlier, and it reuses the `test_size` / `seed` constants instead of
# repeating their literal values.
shuffle_split = ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
results = cross_val_score(model, X, Y, cv=shuffle_split)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.926% (1.383%)
