## sampling


In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
sns.set()

In [13]:
# https://github.com/allisonhorst/palmerpenguins
df = sns.load_dataset("penguins")
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### population


In [14]:
population = df.dropna()

### simple random sampling


In [15]:
n = 10

population.sample(n)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
210,Chinstrap,Dream,50.2,18.8,202.0,3800.0,Male
136,Adelie,Dream,35.6,17.5,191.0,3175.0,Female
314,Gentoo,Biscoe,44.5,14.7,214.0,4850.0,Female
142,Adelie,Dream,32.1,15.5,188.0,3050.0,Female
81,Adelie,Torgersen,42.9,17.6,196.0,4700.0,Male
203,Chinstrap,Dream,51.4,19.0,201.0,3950.0,Male
71,Adelie,Torgersen,39.7,18.4,190.0,3900.0,Male
130,Adelie,Torgersen,38.5,17.9,190.0,3325.0,Female
57,Adelie,Biscoe,40.6,18.8,193.0,3800.0,Male
156,Chinstrap,Dream,52.7,19.8,197.0,3725.0,Male


### stratified sampling


In [16]:
column_stratified = "species"

population.groupby(column_stratified, group_keys=False).apply(
    lambda x: x.sample(frac=0.04)
)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
143,Adelie,Dream,40.7,17.0,190.0,3725.0,Male
132,Adelie,Dream,36.8,18.5,193.0,3500.0,Female
16,Adelie,Torgersen,38.7,19.0,195.0,3450.0,Female
111,Adelie,Biscoe,45.6,20.3,191.0,4600.0,Male
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,Male
24,Adelie,Biscoe,38.8,17.2,180.0,3800.0,Male
193,Chinstrap,Dream,46.2,17.5,187.0,3650.0,Female
154,Chinstrap,Dream,51.3,19.2,193.0,3650.0,Male
208,Chinstrap,Dream,45.2,16.6,191.0,3250.0,Female
248,Gentoo,Biscoe,48.2,14.3,210.0,4600.0,Female


### cluster sampling


In [17]:
column_clustered = "species"
choice_size = 1

choiced_column_values = np.random.choice(
    population[column_clustered].unique(), size=choice_size, replace=False
)
population[population[column_clustered].isin(choiced_column_values)]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
147,Adelie,Dream,36.6,18.4,184.0,3475.0,Female
148,Adelie,Dream,36.0,17.8,195.0,3450.0,Female
149,Adelie,Dream,37.8,18.1,193.0,3750.0,Male
150,Adelie,Dream,36.0,17.1,187.0,3700.0,Female


### multi-stage sampling


In [18]:
# First stage
column_first_stage_sampled = "species"
first_stage_choice_size = 2

choiced_column_values = np.random.choice(
    population[column_first_stage_sampled].unique(),
    size=first_stage_choice_size,
    replace=False,
)
first_stage_sample = population[
    population[column_first_stage_sampled].isin(choiced_column_values)
]

# Second stage
column_second_stage_sampled = "island"
second_stage_choice_size = 2
sample_size = 5

choiced_column_values = np.random.choice(
    first_stage_sample[column_second_stage_sampled].unique(),
    size=second_stage_choice_size,
    replace=False,
)
for v in choiced_column_values:
    print(f"{column_second_stage_sampled}: {v}")
    print(
        first_stage_sample[first_stage_sample[column_second_stage_sampled] == v].sample(
            sample_size
        )
    )

island: Biscoe
    species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
252  Gentoo  Biscoe            45.1           14.5              207.0   
275  Gentoo  Biscoe            45.0           15.4              220.0   
100  Adelie  Biscoe            35.0           17.9              192.0   
25   Adelie  Biscoe            35.3           18.9              187.0   
325  Gentoo  Biscoe            46.8           16.1              215.0   

     body_mass_g     sex  
252       5050.0  Female  
275       5050.0    Male  
100       3725.0  Female  
25        3800.0  Female  
325       5500.0    Male  
island: Torgersen
   species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
17  Adelie  Torgersen            42.5           20.7              197.0   
0   Adelie  Torgersen            39.1           18.7              181.0   
71  Adelie  Torgersen            39.7           18.4              190.0   
82  Adelie  Torgersen            36.7           18.8              

### systematic sampling


In [19]:
systematic_population = population.copy()
systematic_population = systematic_population.reset_index()
systematic_population = systematic_population.drop("index", axis=1)

In [20]:
selected_index = np.arange(1, len(systematic_population), 20)
systematic_population.iloc[selected_index]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
21,Adelie,Biscoe,40.6,18.6,183.0,3550.0,Male
41,Adelie,Dream,41.1,19.0,182.0,3425.0,Male
61,Adelie,Biscoe,41.1,19.1,188.0,4100.0,Male
81,Adelie,Dream,36.9,18.6,189.0,3500.0,Female
101,Adelie,Biscoe,38.2,20.0,190.0,3900.0,Male
121,Adelie,Torgersen,41.5,18.3,195.0,4300.0,Male
141,Adelie,Dream,36.6,18.4,184.0,3475.0,Female
161,Chinstrap,Dream,50.5,19.6,201.0,4050.0,Male
181,Chinstrap,Dream,47.5,16.8,199.0,3900.0,Female
