```python
import pandas as pd
import numpy as np
import random # Needed for setting seed and random start for systematic/cluster

# Seed NumPy's global random generator so draws made through np.random are repeatable
np.random.seed(42)

# Read the survey data; the CSV must be available under exactly this file name
csv_path = 'academic Stress level - maintainance 1.csv'
df = pd.read_csv(csv_path)
df.head()
```

## Part A — Setup
- Report dataset size (rows, columns)
```python
# Report the (rows, columns) shape of the loaded dataset
dataset_shape = df.shape
print("Dataset size:", dataset_shape)
```

## Part B — Simple Random Sampling

```python
# Settings for the simple random sample: the column to summarise and the number of rows to draw
numeric_col = 'Rate your academic stress index'
sample_size = 50

# Draw `sample_size` rows at random; the fixed random_state makes the draw repeatable
srs = df.sample(n=sample_size, random_state=42)

print("--- Simple Random Sample ---")
print(srs.head())

# Compare the sample estimate of the mean with the full-dataset value
print("\n--- Mean Comparison ---")
population_avg = df[numeric_col].mean()
print(f"Population mean ({numeric_col}): {population_avg:.3f}")
sample_avg = srs[numeric_col].mean()
print(f"Sample mean ({numeric_col}): {sample_avg:.3f}")
```

## Part C — Systematic Sampling

```python
# Systematic sampling: take rows at a fixed spacing across the WHOLE dataset,
# beginning at a random offset.
n = 50
N = len(df)
if N == 0:
    raise ValueError("Cannot draw a systematic sample from an empty dataset")
n = min(n, N)  # cannot select more rows than exist

# Fractional sampling interval. An integer interval (N // n) followed by
# truncation to n rows leaves the tail of the dataset unreachable whenever N
# is not a multiple of n, and the interval becomes 0 (an error) when N < n.
k = N / n
start = np.random.uniform(0, k)  # random start in [0, k) from NumPy's global RNG

# Row positions floor(start + i*k) for i = 0..n-1. Because k >= 1 the positions
# are distinct; the clip guards against float rounding landing exactly on N.
positions = np.floor(start + k * np.arange(n)).astype(int)
positions = np.clip(positions, 0, N - 1)
sys_sample = df.iloc[positions]

print(f"Systematic Sample Size: {len(sys_sample)}")
sys_sample.head()
```

## Part D — Stratified Sampling

```python
# Stratify on the respondent's academic stage, targeting 50 rows overall
sample_size = 50
strata_col = "Your Academic Stage"

# Using the same sampling fraction in every stratum keeps the group
# proportions of the sample close to those of the full dataset
frac = sample_size / len(df)

# Split by academic stage, then draw the common fraction from each group
by_stage = df.groupby(strata_col, group_keys=False)
stratified_sample = by_stage.sample(frac=frac, random_state=42)

print(f"Stratified Sample Size: {len(stratified_sample)} (expected close to {sample_size})")
print("\nSample distribution by Academic Stage (Should mirror population distribution):")
stage_share = stratified_sample[strata_col].value_counts(normalize=True)
print((stage_share * 100).round(1))

stratified_sample.head()
```

## Part E — Cluster Sampling

```python
# Cluster sampling: split the rows into sequential clusters, pick a few clusters
# at random, and keep every row of the chosen clusters.
num_clusters = 10

# Assign cluster ids by row POSITION so that exactly `num_clusters` near-equal
# sequential clusters are produced (when there are at least that many rows),
# even if the row count is not a multiple of `num_clusters` or the index is
# not the default 0..N-1 range.
df['cluster_id'] = np.arange(len(df)) * num_clusters // len(df)

# Randomly select 2 of the clusters (without replacement)
num_selected_clusters = 2
selected_clusters = np.random.choice(df['cluster_id'].unique(), size=num_selected_clusters, replace=False)

# Select all rows belonging to the chosen clusters
cluster_sample = df[df['cluster_id'].isin(selected_clusters)]

# Nominal size: the selected share of the dataset; the actual size can differ
# slightly when the clusters are not all the same size.
expected_size = num_selected_clusters * len(df) / num_clusters

print("Selected clusters:", selected_clusters)
print(f"Cluster Sample Size: {len(cluster_sample)} (Expected: about {expected_size:.0f})")
cluster_sample.head()
```

## Part F — Comparison & Reflection

```python
numeric_col = 'Rate your academic stress index'
mean_label = f'Mean ({numeric_col})'

# Mean of the numeric column in the full dataset and in each sample
population_mean = df[numeric_col].mean()
srs_mean = srs[numeric_col].mean()
sys_mean = sys_sample[numeric_col].mean()
strat_mean = stratified_sample[numeric_col].mean()
cluster_mean = cluster_sample[numeric_col].mean()

# Comparison table: one row per method, plus the population as the reference row
comparison = pd.DataFrame({
    'Sampling Method': ['Population', 'Simple Random', 'Systematic', 'Stratified', 'Cluster'],
    mean_label: [population_mean, srs_mean, sys_mean, strat_mean, cluster_mean],
})
comparison['Absolute Error'] = (comparison[mean_label] - population_mean).abs()

print("--- Comparison of Sample Means vs Population Mean ---")
print(comparison.round(3).sort_values(by='Absolute Error'))

# Identify the best-performing method from the table itself (excluding the
# population reference row, whose error is 0 by construction) so the
# reflection reports what actually happened in this run.
method_rows = comparison[comparison['Sampling Method'] != 'Population']
best_row = method_rows.loc[method_rows['Absolute Error'].idxmin()]
best_method = best_row['Sampling Method']
best_error = best_row['Absolute Error']

# Sizes used in the reflection are read from the samples rather than hard-coded
clusters_used = len(selected_clusters)
clusters_total = df['cluster_id'].nunique()

# Reflection
print("\n--- Reflection ---")
print(
    f"The population mean for the Stress Index is {population_mean:.3f}. "
    f"For this specific run, the method that produced the closest estimate (lowest absolute error) "
    f"was {best_method} Sampling (absolute error {best_error:.3f})."
)
print(
    "Stratified Sampling is generally expected to perform well here, as it explicitly accounts for "
    "the variance introduced by the 'Academic Stage' (High School, Undergraduate, Postgraduate), "
    "which is expected to influence stress levels."
)
print("Simple Random Sampling is the easiest to implement, while Systematic Sampling is efficient for ordered data.")
print(
    f"Cluster Sampling (sample size {len(cluster_sample)} vs {len(srs)} for Simple Random Sampling) "
    f"carries the highest risk of error because it only samples from {clusters_used} out of the "
    f"{clusters_total} sequential clusters, potentially missing certain trends in the dataset ordering."
)
