# Sampling

In [5]:
import random
import numpy as np
from sklearn.model_selection import train_test_split

In [7]:
#Set a random seed for each of the modules doing random selection
np.random.seed(1663)
random.seed(1663)

In [11]:
#Simple random sampling - Select items from a list at random
population = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

#Randomly select five items from the population
sample = random.sample(population, 5) 

print("Population:", population)
print("Simple Random Sample:", sample)

Population: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Simple Random Sample: [3, 6, 4, 2, 1]


In [13]:
#Systematic Sampling - Every n'th item after a random starting point
population = np.arange(1, 101)

#Systematic sampling with a sampling interval of 10
start = np.random.randint(1, 11)  #Random starting point
interval = 10
sample = population[start-1::interval] #Stop omitted, so this runs through the end of the list

print("Population:", population)
print("Systematic Sample:", sample)

Population: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100]
Systematic Sample: [ 7 17 27 37 47 57 67 77 87 97]


In [15]:
#Stratified sampling - Group population based on attributes and sample equally from each group

data = [(1, 'A'), (2, 'A'), (3, 'A'), (4, 'B'), (5, 'B'), (6, 'B'), (7, 'C'), (8, 'C'), (9, 'C')]

#Divide data into strata based on the second element of each pair
strata = {}
for i in data:
    if i[1] not in strata:
        strata[i[1]] = []
    strata[i[1]].append(i[0])
print(strata)

#Sample the strata
sample = []
for key in strata.keys():
    stratum_sample = random.sample(strata[key], 2)  #Sample 2 elements from each stratum
    sample.extend([(x, key) for x in stratum_sample])

print("Population:", data)
print("Stratified Sample:", sample)

{'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
Population: [(1, 'A'), (2, 'A'), (3, 'A'), (4, 'B'), (5, 'B'), (6, 'B'), (7, 'C'), (8, 'C'), (9, 'C')]
Stratified Sample: [(2, 'A'), (1, 'A'), (5, 'B'), (6, 'B'), (9, 'C'), (8, 'C')]


In [17]:
#Population divided into clusters
clusters = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]

#Randomly select clusters
selected_clusters = random.sample(clusters, 2)

#Sample all elements from selected clusters
sample = [element for cluster in selected_clusters for element in cluster]

print("Population Clusters:", clusters)
print("Selected Clusters:", selected_clusters)
print("Cluster Sample:", sample)

Population Clusters: [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
Selected Clusters: [[1, 2, 3], [7, 8, 9]]
Cluster Sample: [1, 2, 3, 7, 8, 9]


In [50]:
#Probability based sampling - Sample from groups proportionate to their contribution to the total

students = {
    "School A": [("Amanda", 85), ("Beth", 75), ("Chelsie", 90), ("Will", 86), ("Nick", 79), ("Cole", 80), ("Melissa", 99), ("Jessica", 68)],
    "School B": [("Jen", 95), ("Jillian", 65), ("Sam", 92)],
    "School C": [("Laura", 82), ("Megan", 91), ("Nicki", 87), ("Noah", 84), ("Tiffany", 93)],
    "School D": [("Marissa", 77), ("Toni", 89), ("Katie", 83), ("Brittany", 94)]
}

#Calculate the total number of students
total_students = sum(len(students[school]) for school in students)

#Calculate counts and percentages for each school
school_counts = {school: len(students[school]) for school in students}
school_percentages = {school: (count / total_students) * 100 for school, count in school_counts.items()}

#Print counts and percentages for each school
print("School Counts:")
for school, count in school_counts.items():
    print(f"{school}: {count} students")

print("\nSchool Percentages:")
for school, percentage in school_percentages.items():
    print(f"{school}: {percentage:.2f}%")

#Calculate probabilities proportional to school sizes
probabilities = {school: len(students[school]) / total_students for school in students}

#Sample
sample_size = 5
sample = random.choices([(student, school) for school in students for student, _ in students[school]],
                         weights=[probabilities[school] for school in students for _ in students[school]],
                         k=sample_size)

#Calculate counts and percentages of students from each school in the sample
sample_counts = {school: sum(1 for _, s in sample if s == school) for school in students}
sample_percentages = {school: (count / sample_size) * 100 for school, count in sample_counts.items()}

print("\nSample School Percentages:")
for school, percentage in sample_percentages.items():
    print(f"{school}: {percentage:.2f}%")
    
print("\nSample:")
for student, school in sample:
    print(f"{student} - {school}")

School Counts:
School A: 8 students
School B: 3 students
School C: 5 students
School D: 4 students

School Percentages:
School A: 40.00%
School B: 15.00%
School C: 25.00%
School D: 20.00%

Sample School Percentages:
School A: 60.00%
School B: 20.00%
School C: 0.00%
School D: 20.00%

Sample:
Chelsie - School A
Sam - School B
Will - School A
Will - School A
Katie - School D
