# Spliting and Sampling Strategies


In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import random
from sklearn import linear_model, datasets, utils

In [4]:
data = datasets.load_iris()
df = pd.DataFrame(data= np.c_[data['data'], data['target']], columns= data['feature_names'] + ['target'])

In [5]:
def kFoldCrossValidation(data, k = 10, seed = 50):
    fold_size = round(len(data) / k)
    kfolds = []
    data = utils.shuffle(data, random_state=seed)
    data.reset_index(inplace=True, drop=True)

    subsets = [data.iloc[x:x+fold_size,] for x in range(0, len(data), fold_size)]

    if len(data) > k*fold_size:
        subsets[k-1].append(data.iloc[k*fold_size-1:,], ignore_index=True)
    
    for index in range(k):
        test = subsets[index]
        train = pd.DataFrame()
        for idx in range(k):
            if idx != index:
                train = pd.concat([train, subsets[idx]])
        # yield train, test        
        kfolds.append((train, test))
    return kfolds    

In [16]:
split = kFoldCrossValidation(df, k=10)
k_fold_train, k_fold_test = split[0][0], split[0][1]
k_fold_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
15,5.0,3.0,1.6,0.2,0.0
16,5.8,2.7,3.9,1.2,1.0
17,5.3,3.7,1.5,0.2,0.0
18,5.5,2.4,3.7,1.0,1.0
19,6.7,3.0,5.0,1.7,1.0
...,...,...,...,...,...
145,5.9,3.2,4.8,1.8,1.0
146,6.4,2.8,5.6,2.2,2.0
147,5.5,4.2,1.4,0.2,0.0
148,7.2,3.6,6.1,2.5,2.0


In [17]:
k_fold_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.6,3.0,4.1,1.3,1.0
1,6.3,2.5,4.9,1.5,1.0
2,5.4,3.4,1.7,0.2,0.0
3,5.4,3.9,1.3,0.4,0.0
4,6.5,3.0,5.2,2.0,2.0
5,6.7,3.1,5.6,2.4,2.0
6,5.7,2.5,5.0,2.0,2.0
7,5.1,3.3,1.7,0.5,0.0
8,4.8,3.0,1.4,0.1,0.0
9,6.2,2.2,4.5,1.5,1.0


In [7]:
def holdOutCrossValidation(data, train=70, seed=50):
    data = utils.shuffle(data, random_state=seed)
    data.reset_index(inplace=True, drop=True)
    train_index = round((len(data)*train)/100)
    train = data.iloc[0:train_index-1,]
    test = data.iloc[train_index:, ]
    return train, test

In [11]:
train, test = holdOutCrossValidation(df)
train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.6,3.0,4.1,1.3,1.0
1,6.3,2.5,4.9,1.5,1.0
2,5.4,3.4,1.7,0.2,0.0
3,5.4,3.9,1.3,0.4,0.0
4,6.5,3.0,5.2,2.0,2.0
...,...,...,...,...,...
99,5.2,3.4,1.4,0.2,0.0
100,6.0,3.0,4.8,1.8,2.0
101,5.9,3.0,5.1,1.8,2.0
102,6.9,3.2,5.7,2.3,2.0


In [13]:
test.head(20)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
105,6.2,3.4,5.4,2.3,2.0
106,5.6,2.7,4.2,1.3,1.0
107,6.7,2.5,5.8,1.8,2.0
108,5.0,2.3,3.3,1.0,1.0
109,5.1,3.5,1.4,0.2,0.0
110,6.4,3.2,4.5,1.5,1.0
111,6.5,3.2,5.1,2.0,2.0
112,5.4,3.7,1.5,0.2,0.0
113,6.2,2.8,4.8,1.8,2.0
114,5.8,2.7,4.1,1.0,1.0


In [54]:
def stratifiedSampling(data, separator, repo=True, sample_size=70, seed=50, random=True):
    stratified_data = []
    np.random.seed(seed)
    if random == True:
        data = utils.shuffle(data, random_state=seed)
        data.reset_index(inplace=True, drop=True)
    sampling_size = round((len(data)*70)/100)
    sampling = pd.DataFrame()

    for value in data[separator].unique():
        stratified_data.append(data[data[separator] == value])
    
    for sample in stratified_data:
        sample_percent = (len(sample)*100)/len(data)
        sample_size = round((sampling_size*sample_percent)/100)
        for index in range(sample_size):
            idx = np.random.randint(low=0, high=len(sample)-1) if random == True else index
            sampling = sampling.append(sample.iloc[idx,])#  = pd.concat([sampling, sample.iloc[idx,]])
            if repo == False:
                sample.drop([sample.index[idx]])
    sampling.reset_index(inplace=True, drop=True)
    return sampling

In [55]:
split = stratifiedSampling(df, 'target', repo=False)
split

Unnamed: 0,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm),target
0,4.2,1.2,5.7,3.0,1.0
1,4.6,1.3,6.6,2.9,1.0
2,4.3,1.3,6.2,2.9,1.0
3,4.6,1.4,6.1,3.0,1.0
4,4.0,1.0,6.0,2.2,1.0
...,...,...,...,...,...
100,5.8,1.8,6.7,2.5,2.0
101,4.9,2.0,5.6,2.8,2.0
102,4.9,1.8,6.3,2.7,2.0
103,5.1,1.9,5.8,2.7,2.0


In [47]:
def simpleRandomSampling(data, repo=True, seed=50, sample_size=70):
    np.random.seed(seed)
    data = utils.shuffle(data, random_state=seed)
    data.reset_index(inplace=True, drop=True)
    df = data
    sample_size = round((len(data)*70)/100)
    sample = pd.DataFrame()
    
    for index in range(sample_size):
        idx = np.random.randint(low=0, high=len(df)-1)
        sample = sample.append(df.iloc[idx,])#  = pd.concat([sampling, sample.iloc[idx,]])
        if repo == False:
            df.drop([idx])
    
    sample.reset_index(inplace=True, drop=True)
    return sample

In [50]:
sample = randomSimpleStratifiedSampling(df, repo=False)
sample

Unnamed: 0,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm),target
0,4.0,1.3,6.1,2.8,1.0
1,1.4,0.2,5.1,3.5,0.0
2,5.7,2.5,6.7,3.3,2.0
3,1.4,0.2,5.0,3.3,0.0
4,6.3,1.8,7.3,2.9,2.0
...,...,...,...,...,...
100,1.2,0.2,5.8,4.0,0.0
101,4.1,1.3,5.6,3.0,1.0
102,4.7,1.4,7.0,3.2,1.0
103,1.4,0.2,4.6,3.2,0.0


In [55]:
def systematicSampling(data, size, seed=50):
    N = len(data)
    n = size
    k = round(N/n)
    r = np.random.randint(low=0, high=k-1)
    data = utils.shuffle(data, random_state=seed)
    data.reset_index(inplace=True, drop=True)
    np.random.seed(seed)
    sample = pd.DataFrame()

    for ind in range(0, N, k):
        sample = sample.append(data.iloc[ind,])
    
    sample.reset_index(inplace=True, drop=True)
    return sample 

In [56]:
sample = systematicSampling(df, 30)
sample

Unnamed: 0,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm),target
0,4.1,1.3,5.6,3.0,1.0
1,5.6,2.4,6.7,3.1,2.0
2,1.5,0.2,5.1,3.4,0.0
3,1.6,0.2,5.0,3.0,0.0
4,6.0,2.5,6.3,3.3,2.0
5,6.9,2.3,7.7,2.6,2.0
6,6.0,1.8,7.2,3.2,2.0
7,1.5,0.2,4.6,3.1,0.0
8,1.3,0.3,4.5,2.3,0.0
9,1.4,0.3,5.1,3.5,0.0


## Convenience Sampling

* Convenince Sampling, also know avalilability sampling, is a specific type of non-probability sampling method that relies on data collection from propulation members who are conveniently available to participate in study or experiment.

* Convenience Sampling is a type of sampling where the first available primary data source will be esed for the research without any additional requirements;

* The best way to reduce the bias of convenience sampling is to use that along other statistical sampling method;