# Probability Sampling 

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

In [8]:
# Define total number of products
NUMBER_OF_PRODUCTS = 20

In [16]:
data = {"product_id": np.arange(1,NUMBER_OF_PRODUCTS+1).tolist(),
        "measure": np.round(np.random.normal(loc=10, scale=0.5, size=NUMBER_OF_PRODUCTS),3)}

In [17]:
df = pd.DataFrame(data)
real_mean = round(df["measure"].mean(),3)
df

Unnamed: 0,product_id,measure
0,1,10.415
1,2,10.831
2,3,9.823
3,4,9.763
4,5,9.76
5,6,10.808
6,7,9.875
7,8,9.867
8,9,10.362
9,10,10.568


## Simple Random Sampling

In [18]:
simple_random_sample = df.sample(n=4).sort_values(by="product_id")

simple_random_mean = round(simple_random_sample["measure"].mean(),3)

simple_random_sample

Unnamed: 0,product_id,measure
0,1,10.415
2,3,9.823
4,5,9.76
12,13,10.423


## Systematic Sampling

In [22]:
def systematic_sampling(df, step):
    
    indexes = np.arange(0,len(df),step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

systematic_sample = systematic_sampling(df, 4)

systematic_mean = round(systematic_sample["measure"].mean(), 3)

systematic_sample

Unnamed: 0,product_id,measure
0,1,10.415
4,5,9.76
8,9,10.362
12,13,10.423
16,17,9.353


## Cluster Sampling

In [26]:
def cluster_sampling(df, number_of_clusters):
    
    try:
        # Divide the units into cluster of equal size
        df['cluster_id'] = np.repeat([range(1,number_of_clusters+1)],len(df)/number_of_clusters)

        # Create an empty list
        indexes = []

        # Append the indexes from the clusters that meet the criteria
        # For this formula, clusters id must be an even number
        for i in range(0,len(df)):
            if df['cluster_id'].iloc[i]%2 == 0:
                indexes.append(i)
        cluster_sample = df.iloc[indexes]
        return(cluster_sample)
    
    except:
        print("The population cannot be divided into clusters of equal size!")
        
cluster_sample = cluster_sampling(df, 2)

cluster_mean = round(cluster_sample["measure"].mean(),3)

cluster_sample

Unnamed: 0,product_id,measure,cluster_id
10,11,9.42,2
11,12,9.682,2
12,13,10.423,2
13,14,9.34,2
14,15,10.019,2
15,16,10.647,2
16,17,9.353,2
17,18,9.545,2
18,19,9.633,2
19,20,10.637,2


## Stratified Random Sampling

In [29]:
# Create data dictionary
data = {'product_id':np.arange(1, NUMBER_OF_PRODUCTS+1).tolist(),
       'product_strata':np.repeat([1,2], NUMBER_OF_PRODUCTS/2).tolist(),
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=NUMBER_OF_PRODUCTS),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# View data frame
df

Unnamed: 0,product_id,product_strata,measure
0,1,1,9.815
1,2,1,9.777
2,3,1,9.632
3,4,1,9.878
4,5,1,10.404
5,6,1,10.028
6,7,1,10.2
7,8,1,10.662
8,9,1,10.691
9,10,1,10.235


In [30]:
# Import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

# Set the split criteria
split = StratifiedShuffleSplit(n_splits=1, test_size=4)

# Perform data frame split
for x, y in split.split(df, df['product_strata']):
    stratified_random_sample = df.iloc[y].sort_values(by='product_id')

# View sampled data frame
stratified_random_sample

# Obtain the sample mean for each group
stratified_random_sample.groupby('product_strata').mean().drop(['product_id'],axis=1)

Unnamed: 0_level_0,measure
product_strata,Unnamed: 1_level_1
1,10.234
2,10.205


## Measure Mean Comparison per Sampling Method

In [31]:
# Create a dictionary with the mean outcomes for each sampling method and the real mean
outcomes = {'sample_mean':[simple_random_mean,systematic_mean,cluster_mean],
           'real_mean':real_mean}

# Transform dictionary into a data frame
outcomes = pd.DataFrame(outcomes, index=['Simple Random Sampling','Systematic Sampling','Cluster Sampling'])

# Add a value corresponding to the absolute error
outcomes['abs_error'] = abs(outcomes['real_mean'] - outcomes['sample_mean'])

# Sort data frame by absolute error
outcomes.sort_values(by='abs_error')

Unnamed: 0,sample_mean,real_mean,abs_error
Systematic Sampling,10.063,10.039,0.024
Simple Random Sampling,10.105,10.039,0.066
Cluster Sampling,9.87,10.039,0.169
