# Resampling ([A Gentle Introduction to the Bootstrap Method](https://machinelearningmastery.com/a-gentle-introduction-to-the-bootstrap-method/))

In [17]:
# create conda env
# Drop any existing "resampling" env first so the create below starts clean.
# When the env does not exist yet, conda prints EnvironmentLocationNotFound;
# the notebook simply carries on to the create command.
!conda-env remove -n resampling -y
# Build the env with Python 3.6 plus the libraries the example cells import.
# NOTE(review): creating an env from inside the notebook does not switch the
# running kernel to it -- confirm which environment the kernel actually uses.
!conda create --name resampling --yes --force python=3.6 scikit-learn pandas matplotlib numpy


EnvironmentLocationNotFound: Not a conda environment: /opt/Anaconda3-5.3.0-Linux-x86_64/envs/resampling

Solving environment: done

## Package Plan ##

  environment location: /opt/Anaconda3-5.3.0-Linux-x86_64/envs/resampling

  added / updated specs: 
    - matplotlib
    - numpy
    - pandas
    - python=3.6
    - scikit-learn


The following NEW packages will be INSTALLED:

    blas:             1.0-mkl           
    certifi:          2016.2.28-py36_0  
    cycler:           0.10.0-py36_0     
    dbus:             1.10.20-0         
    expat:            2.1.0-0           
    fontconfig:       2.12.1-3          
    freetype:         2.5.5-2           
    glib:             2.50.2-1          
    gst-plugins-base: 1.8.0-0           
    gstreamer:        1.8.0-0           
    icu:              54.1-0            
    jpeg:             9b-0              
    libffi:           3.2.1-1           
    libgcc:           5.2.0-0           
    libgfortran:      3.0.0-1           
    

In [12]:
# Simple example: draw a single bootstrap sample with scikit-learn and list
# the observations it left out.
from sklearn.utils import resample

# the original observations
data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

# four draws with replacement; the fixed random_state makes the draw repeatable
boot = resample(data, replace=True, n_samples=4, random_state=1)
print('Bootstrap Sample: %s' % boot)

# out-of-bag observations: walk the data in order and keep whatever the
# bootstrap sample never picked
oob = []
for value in data:
    if value in boot:
        continue
    oob.append(value)
print('OOB Sample: %s' % oob)

Bootstrap Sample: [0.6, 0.4, 0.5, 0.1]
OOB Sample: [0.2, 0.3]


In [None]:
#Real Example
# Bootstrap estimate of a decision tree's accuracy: repeatedly fit on a
# resampled half of the data, score on the rows that were not drawn, then
# summarise the scores with a histogram and a percentile confidence interval.

import numpy
from pandas import read_csv
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# load dataset (the file is read without a header row)
data = read_csv('pima-indians-diabetes.data.csv', header=None)
values = data.values

# configure bootstrap
n_iterations = 100
n_size = int(len(data) * 0.50)
# A single seeded generator drives both the resampling and the tree fitting,
# so a fresh Restart & Run All reproduces the same scores and interval.
rng = numpy.random.RandomState(1)

# run bootstrap
stats = list()
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size, random_state=rng)
    # Build the lookup of drawn rows once per iteration; converting `train`
    # inside the comprehension would redo the conversion for every row.
    train_rows = set(map(tuple, train.tolist()))
    # out-of-bag rows: every row whose values do not appear in the train sample
    test = numpy.array([x for x in values if tuple(x.tolist()) not in train_rows])
    # fit model: all columns but the last are inputs, the last is the target
    model = DecisionTreeClassifier(random_state=rng)
    model.fit(train[:,:-1], train[:,-1])
    # evaluate model on the out-of-bag rows
    predictions = model.predict(test[:,:-1])
    score = accuracy_score(test[:,-1], predictions)
    print(score)
    stats.append(score)

# plot scores
pyplot.hist(stats)
pyplot.show()

# confidence intervals
# alpha is the coverage of the interval; the bounds are the matching lower and
# upper percentiles of the bootstrap scores, clipped to the valid range [0, 1].
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, numpy.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, numpy.percentile(stats, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

0.7025316455696202
0.7167381974248928
0.6716417910447762
0.6866952789699571
0.678646934460888
0.7029914529914529
0.691304347826087
0.721868365180467
0.693304535637149
0.683083511777302
0.6832971800433839
0.6791208791208792
0.665948275862069
0.7025862068965517
0.6980728051391863
0.6457883369330454
0.7100840336134454
0.6701680672268907
0.7219827586206896
0.6815286624203821
0.6923076923076923
0.7118279569892473
0.7230769230769231
0.6857749469214437
0.6784968684759917
0.6752688172043011
0.6962025316455697
0.7136659436008677
0.7105831533477321
0.7004310344827587
0.6884531590413944
0.6296296296296297
0.6907894736842105
0.6681127982646421
0.6934782608695652
0.6557734204793029
0.6828752642706131
0.7446351931330472
0.6602564102564102
0.7084233261339092
0.7044967880085653
0.6885593220338984
0.652267818574514
0.676923076923077
0.6452991452991453
0.7058823529411765
0.6594360086767896
0.6717391304347826
0.6645021645021645
0.7084233261339092
0.7046413502109705
0.6702586206896551
0.6768558951965066
0

In [16]:
# Clean up: remove the "resampling" conda env again now that the examples have run.
!conda-env remove -n resampling -y


Remove all packages in environment /opt/Anaconda3-5.3.0-Linux-x86_64/envs/resampling:


## Package Plan ##

  environment location: /opt/Anaconda3-5.3.0-Linux-x86_64/envs/resampling


The following packages will be REMOVED:

    blas:             1.0-mkl           
    certifi:          2016.2.28-py36_0  
    cycler:           0.10.0-py36_0     
    dbus:             1.10.20-0         
    expat:            2.1.0-0           
    fontconfig:       2.12.1-3          
    freetype:         2.5.5-2           
    glib:             2.50.2-1          
    gst-plugins-base: 1.8.0-0           
    gstreamer:        1.8.0-0           
    icu:              54.1-0            
    jpeg:             9b-0              
    libffi:           3.2.1-1           
    libgcc:           5.2.0-0           
    libgfortran:      3.0.0-1           
    libiconv:         1.14-0            
    libpng:           1.6.30-1          
    libxcb:           1.12-1            
    libxml2:          2.9.4-0       