# Different Sampling Methods 
This notebook has two analyses:
1. In the first analysis the objective is to illustrate the benefits of Cross Validation over Hold Out. 
We show how X Val estimates of accuracy have lower variance than Hold Out. This is done by repeating each analysis 100 times and plotting the results. 
2. In the second analysis, using repeated random train/test splits, we show how the accuracy estimate varies with training set size. 

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
# Decision tree using the entropy split criterion; this one estimator object is
# re-fitted by each of the experiments in this notebook.
dtree = DecisionTreeClassifier(criterion='entropy')

In [None]:
# Load scikit-learn's bundled breast-cancer dataset and unpack the feature
# matrix (X) and class labels (y) used by the first analysis.
bcDB = datasets.load_breast_cancer()
X, y = bcDB.data, bcDB.target

## Hold-Out Testing

In [None]:
# Repeated hold-out: `reps` random train/test splits, each seeded by its
# repetition index, yielding one test-set accuracy per split.
reps = 100
ho = []
for seed in range(reps):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    dtree.fit(X_train, y_train)
    ho.append(accuracy_score(y_test, dtree.predict(X_test)))

## X Val

In [None]:
# Repeated 10-fold cross validation: one mean accuracy per repetition.
# The shuffle is seeded with the repetition index so that re-running the
# notebook reproduces the same curve (matching the seeded hold-out loop),
# while each repetition still gets its own fold assignment.
xv = []
for i in range(reps):
    # shuffle=True so that the fold assignment differs between repetitions
    kf = KFold(n_splits=10, shuffle=True, random_state=i)
    scores = cross_val_score(dtree, X, y, cv=kf)
    xv.append(scores.mean())

In [None]:
res = pd.DataFrame(ho, columns = ['Hold Out'])
res['X Val']=xv
%matplotlib inline
ax = res.plot()
ax.set_xlabel("Iteration")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy: Hold Out and X Val")

## Learning Curve
In the second analysis we show how accuracy estimation varies with training set size.   
We switch to the wine dataset because it is smaller, which makes the effect more evident. 

In [None]:
# Load the wine dataset; note that this rebinds X and y, replacing the
# breast-cancer data used by the earlier cells.
wineDB = datasets.load_wine()
X, y = wineDB.data, wineDB.target
X.shape

In [None]:
from statistics import mean

# Learning curve: for each training-set proportion s = i/num, average the
# hold-out accuracy over `reps` random train/test splits.
reps = 300
num = 30
ho_s = []  # mean accuracy for each proportion
s_s = []   # the proportions themselves (x-axis of the learning curve)
for i in range(1, num):
    s = i / num
    # Reset once per proportion (not once per repetition) so that the mean
    # taken below covers all `reps` splits rather than only the last one.
    ss = []
    for j in range(reps):
        # Seeded with the repetition index so the curve is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=s, random_state=j
        )
        y_pred = dtree.fit(X_train, y_train).predict(X_test)
        ss.append(accuracy_score(y_test, y_pred))
    ho_s.append(mean(ss))
    s_s.append(s)

In [None]:
# Wrap the per-proportion accuracies in a DataFrame indexed by training-set
# proportion, then plot accuracy against that proportion.
ho_s = pd.DataFrame(ho_s, columns=['Hold Out'], index=s_s)
ax = ho_s.plot()
ax.set(xlabel="Train Set Proportion", ylabel="Accuracy")
ax.set_title("Accuracy: Train Set Size")