In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/SampleSuperstore.csv')


df.shape
df.head(500)


Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.9600,2,0.00,41.9136
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.9400,3,0.00,219.5820
2,Second Class,Corporate,United States,Los Angeles,California,90036,West,Office Supplies,Labels,14.6200,2,0.00,6.8714
3,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Furniture,Tables,957.5775,5,0.45,-383.0310
4,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Office Supplies,Storage,22.3680,2,0.20,2.5164
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Standard Class,Consumer,United States,Fayetteville,Arkansas,72701,South,Office Supplies,Envelopes,105.4200,2,0.00,51.6558
496,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Office Supplies,Binders,119.6160,8,0.20,40.3704
497,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Furniture,Furnishings,255.7600,4,0.00,81.8432
498,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Furniture,Chairs,241.5680,2,0.20,18.1176


## 1. Sumarize function

In [40]:


def summarize(df, name):
    summary = {}


    summary["dataset_name"] = name
    summary["total_rows"] = len(df)
    summary["total_columns"] = df.shape[1]


    summary["total_sales"] = df["Sales"].sum()
    summary["average_sales"] = df["Sales"].mean()
    summary["median_sales"] = df["Sales"].median()


    summary["total_profit"] = df["Profit"].sum()
    summary["average_profit"] = df["Profit"].mean()


    summary["total_quantity"] = df["Quantity"].sum()
    summary["average_discount"] = df["Discount"].mean()


    summary["segment_distribution"] = df["Segment"].value_counts(normalize=True).round(2).to_dict()
    summary["region_distribution"] = df["Region"].value_counts(normalize=True).round(2).to_dict()
    summary["category_distribution"] = df["Category"].value_counts(normalize=True).round(2).to_dict()


    summary["top_5_states"] = df["State"].value_counts().head(5).to_dict()
    summary["top_5_cities"] = df["City"].value_counts().head(5).to_dict()

    return summary

s = summarize(df,"test")
s





{'dataset_name': 'test',
 'total_rows': 9994,
 'total_columns': 13,
 'total_sales': np.float64(2297200.8603000003),
 'average_sales': np.float64(229.85800083049833),
 'median_sales': 54.489999999999995,
 'total_profit': np.float64(286397.0217),
 'average_profit': np.float64(28.65689630778467),
 'total_quantity': np.int64(37873),
 'average_discount': np.float64(0.15620272163297977),
 'segment_distribution': {'Consumer': 0.52,
  'Corporate': 0.3,
  'Home Office': 0.18},
 'region_distribution': {'West': 0.32,
  'East': 0.28,
  'Central': 0.23,
  'South': 0.16},
 'category_distribution': {'Office Supplies': 0.6,
  'Furniture': 0.21,
  'Technology': 0.18},
 'top_5_states': {'California': 2001,
  'New York': 1128,
  'Texas': 985,
  'Pennsylvania': 587,
  'Washington': 506},
 'top_5_cities': {'New York City': 915,
  'Los Angeles': 747,
  'Philadelphia': 537,
  'San Francisco': 510,
  'Seattle': 428}}

## 2. Probability Sample

In [None]:
def simple_random_sampling(df, n, seed=42):
    return df.sample(n=n, random_state=seed).reset_index(drop=True)



def systematic_sampling(df, n, seed=42):

    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    step = len(df) // n
    if step == 0:
        step = 1

    sample = df.iloc[::step].head(n)

    return sample.reset_index(drop=True)


def stratified_sampling(df, column, n, seed=42):
    parts = []
    total_rows = len(df)

    for value in df[column].unique():
        print(value)
        group = df[df[column] == value]
        print(group)
        size = int((len(group) / total_rows) * n)

        if size > 0:
            part = group.sample(
                n=min(size, len(group)),
                random_state=seed
            )
            parts.append(part)

    return pd.concat(parts).sample(frac=1, random_state=seed).reset_index(drop=True)

def cluster_sampling(df, column, n_clusters, seed=42):
    clusters = df[column].unique()
    chosen = np.random.choice(
        clusters,
        size=min(n_clusters, len(clusters)),
        replace=False
    )
    return df[df[column].isin(chosen)].reset_index(drop=True)



## 3.Non Population Sample

In [None]:
from typing import Dict, List

def convenience_sampling(df: pd.DataFrame, n: int, sort_by: str | None = None) -> pd.DataFrame:
    d = df.copy()
    if sort_by is not None and sort_by in d.columns:
        d = d.sort_values(sort_by)
    return d.head(n).reset_index(drop=True)



def quota_sampling(
    df: pd.DataFrame,
    quotas: Dict[str, Dict[str, int]],
    seed: int = 42
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    parts = []

    for col, qmap in quotas.items():
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in df.")

        for value, k in qmap.items():
            group = df[df[col] == value]
            k = min(int(k), len(group))
            if k <= 0:
                continue

            idx = rng.choice(group.index.to_numpy(), size=k, replace=False)
            parts.append(df.loc[idx])

    if not parts:
        return df.head(0).copy()

    sample = (
        pd.concat(parts, ignore_index=False)
          .drop_duplicates()  # safe if quotas overlap multiple columns
          .sample(frac=1, random_state=seed)
          .reset_index(drop=True)
    )
    return sample



def snowball_sampling(
    df: pd.DataFrame,
    seed_ids: List[int],
    n: int,
    seed: int = 42,
    id_col: str = "__row_id__"
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)

    # create stable IDs for rows
    d = df.copy().reset_index(drop=True)
    d[id_col] = np.arange(len(d))

    ids = d[id_col].to_numpy()

    adjacency = {}
    for pid in ids:
        deg = int(rng.integers(3, 9))
        neighbors = rng.choice(ids, size=min(deg, len(ids)), replace=False)
        adjacency[int(pid)] = neighbors.astype(int).tolist()

    selected = []
    queue = [int(x) for x in seed_ids if int(x) in adjacency]
    seen = set()

    while queue and len(selected) < n:
        pid = queue.pop(0)
        if pid in seen:
            continue
        seen.add(pid)
        selected.append(pid)

        for nb in adjacency.get(pid, []):
            if nb not in seen:
                queue.append(nb)

    sample = d[d[id_col].isin(selected)].drop(columns=[id_col]).reset_index(drop=True)
    return sample




## 4. Sample Summerization

In [None]:
SAMPLE_N = 500
SEED = 42

# Probability sampling
srs = simple_random_sampling(df, SAMPLE_N, seed=SEED)
sys = systematic_sampling(df, SAMPLE_N, seed=SEED)
strat = stratified_sampling(df, "Region", SAMPLE_N, seed=SEED)

# Cluster sampling (by City)
cluster = cluster_sampling(df, "City", n_clusters=2, seed=SEED)

# Non-probability sampling
conv = convenience_sampling(df, SAMPLE_N)

quota = quota_sampling(
    df,
    quotas={
        "Segment": {
            "Consumer": 250,
            "Corporate": 150,
            "Home Office": 100
        }
    },
    seed=SEED
)

snow = snowball_sampling(
    df,
    seed_ids=[10, 200, 999],  # row IDs, NOT person IDs
    n=SAMPLE_N,
    seed=SEED
)


samples = {
    "Population": df,
    "Simple Random": srs,
    "Systematic": sys,
    "Stratified (Region)": strat,
    "Cluster (2 Cities)": cluster,
    "Convenience": conv,
    "Quota (Segment)": quota,
    "Snowball (Simulated)": snow,
}


summaries = pd.DataFrame(
    [summarize(sample_df, name) for name, sample_df in samples.items()]
)

summaries


Unnamed: 0,dataset_name,total_rows,total_columns,total_sales,average_sales,median_sales,total_profit,average_profit,total_quantity,average_discount,segment_distribution,region_distribution,category_distribution,top_5_states,top_5_cities
0,Population,9994,13,2297201.0,229.858001,54.49,286397.0217,28.656896,37873,0.156203,"{'Consumer': 0.52, 'Corporate': 0.3, 'Home Off...","{'West': 0.32, 'East': 0.28, 'Central': 0.23, ...","{'Office Supplies': 0.6, 'Furniture': 0.21, 'T...","{'California': 2001, 'New York': 1128, 'Texas'...","{'New York City': 915, 'Los Angeles': 747, 'Ph..."
1,Simple Random,500,13,112070.1,224.140239,47.984,11020.5997,22.041199,1909,0.15448,"{'Consumer': 0.47, 'Corporate': 0.34, 'Home Of...","{'West': 0.31, 'East': 0.25, 'Central': 0.23, ...","{'Office Supplies': 0.62, 'Furniture': 0.2, 'T...","{'California': 97, 'New York': 51, 'Texas': 47...","{'New York City': 42, 'Los Angeles': 38, 'Phil..."
2,Systematic,500,13,113538.2,227.076408,54.406,15036.9284,30.073857,1898,0.1558,"{'Consumer': 0.52, 'Corporate': 0.29, 'Home Of...","{'West': 0.33, 'East': 0.27, 'Central': 0.22, ...","{'Office Supplies': 0.61, 'Furniture': 0.21, '...","{'California': 95, 'New York': 56, 'Texas': 52...","{'New York City': 45, 'Los Angeles': 36, 'Phil..."
3,Stratified (Region),499,13,104471.3,209.361364,60.6,15965.2434,31.994476,1872,0.1299,"{'Consumer': 0.53, 'Corporate': 0.28, 'Home Of...","{'West': 0.32, 'East': 0.28, 'Central': 0.23, ...","{'Office Supplies': 0.57, 'Technology': 0.24, ...","{'California': 100, 'New York': 58, 'Texas': 4...","{'New York City': 47, 'Los Angeles': 35, 'Phil..."
4,Cluster (2 Cities),11,13,2822.7,256.609091,83.92,1140.9757,103.725064,38,0.0,"{'Consumer': 0.45, 'Corporate': 0.45, 'Home Of...","{'Central': 0.55, 'East': 0.45}","{'Office Supplies': 0.64, 'Technology': 0.27, ...","{'Michigan': 6, 'Rhode Island': 5}","{'Saginaw': 6, 'Warwick': 5}"
5,Convenience,500,13,129426.3,258.852632,67.162,5363.511,10.727022,1973,0.17062,"{'Consumer': 0.54, 'Corporate': 0.3, 'Home Off...","{'East': 0.31, 'West': 0.31, 'Central': 0.27, ...","{'Office Supplies': 0.59, 'Furniture': 0.22, '...","{'California': 102, 'New York': 61, 'Texas': 4...","{'New York City': 43, 'Los Angeles': 38, 'San ..."
6,Quota (Segment),500,13,134311.1,268.622166,57.016,22982.9659,45.965932,1822,0.15278,"{'Consumer': 0.5, 'Corporate': 0.3, 'Home Offi...","{'West': 0.31, 'East': 0.28, 'Central': 0.24, ...","{'Office Supplies': 0.63, 'Furniture': 0.21, '...","{'California': 87, 'New York': 54, 'Texas': 48...","{'New York City': 40, 'Los Angeles': 35, 'San ..."
7,Snowball (Simulated),500,13,112622.4,225.244857,53.24,8937.7271,17.875454,1795,0.1489,"{'Consumer': 0.52, 'Corporate': 0.29, 'Home Of...","{'West': 0.34, 'East': 0.28, 'Central': 0.24, ...","{'Office Supplies': 0.61, 'Furniture': 0.23, '...","{'California': 106, 'Texas': 51, 'New York': 4...","{'New York City': 40, 'Philadelphia': 32, 'Los..."


## 5. Dristribution and Normalization

In [None]:
def dist_table(df: pd.DataFrame, col: str) -> pd.Series:

    return df[col].value_counts(normalize=True).rename("prop")



def compare_dist(df: pd.DataFrame, samples: dict, col: str) -> pd.DataFrame:

    base = dist_table(df, col).rename("Population")
    out = base.to_frame()

    for name, sample_df in samples.items():
        out[name] = dist_table(sample_df, col)

    return out.fillna(0).round(3)



segment_cmp = compare_dist(df, samples, "Segment")
segment_cmp

region_cmp = compare_dist(df, samples, "Region")
region_cmp


category_cmp = compare_dist(df, samples, "Category")
category_cmp


city_cmp = compare_dist(df, samples, "City").head(10)
city_cmp




Unnamed: 0_level_0,Population,Simple Random,Systematic,Stratified (Region),Cluster (2 Cities),Convenience,Quota (Segment),Snowball (Simulated)
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Consumer,0.519,0.468,0.524,0.531,0.651,0.54,0.5,0.518
Corporate,0.302,0.34,0.286,0.285,0.279,0.298,0.3,0.294
Home Office,0.178,0.192,0.19,0.184,0.07,0.162,0.2,0.188


## 6 Mean Error

In [None]:
def mean_error_table(df: pd.DataFrame, samples: dict, col: str) -> pd.DataFrame:
    pop_mean = df[col].mean()

    errors = []
    for name, sample_df in samples.items():
        err = sample_df[col].mean() - pop_mean
        errors.append({
            "sample": name,
            "n": len(sample_df),
            f"mean_{col}_error": round(float(err), 2)
        })

    return pd.DataFrame(errors).sort_values(f"mean_{col}_error")


In [None]:
sales_error_df = mean_error_table(df, samples, "Sales")
sales_error_df


Unnamed: 0,sample,n,mean_Sales_error
4,Cluster (2 Cities),43,-85.98
3,Stratified (Region),499,-20.5
1,Simple Random,500,-5.72
7,Snowball (Simulated),500,-4.61
2,Systematic,500,-2.78
0,Population,9994,0.0
5,Convenience,500,28.99
6,Quota (Segment),500,38.76


## Systematic sampling showed the lowest bias in mean sales estimation