In [2]:
import pandas as pd
from pathlib import Path
import numpy as np

In [3]:
RAW = Path("data/raw")
OUT = Path("data/interim")
OUT.mkdir(parents=True, exist_ok=True)

In [4]:
data = pd.read_csv('/home/rzby/ffmc_dc/data/raw/export-ME48-2015-01-01-2024-01-01.csv', low_memory=False)
data.head()

Unnamed: 0,WMO ID,DATA TIMESTAMP,EVAPORATION 24HOURS MM EEE,RAINFALL 24H RRRR,TEMP DRYBULB C TTTTTT,WIND SPEED FF,RELATIVE HUMIDITY PC,PRESSURE QFF MB DERIVED
0,96595,2015-01-01 00:00:00.0 +0:00,7.0,7.6,23.4,0.0,98.3,1012.0
1,96595,2015-01-01 01:00:00.0 +0:00,,,24.6,0.0,92.8,1012.2
2,96595,2015-01-01 02:00:00.0 +0:00,,,27.0,0.0,80.8,1012.0
3,96595,2015-01-01 03:00:00.0 +0:00,,,29.8,0.0,71.2,1011.0
4,96595,2015-01-01 04:00:00.0 +0:00,,,30.7,0.0,65.4,1011.0


In [5]:
data['WMO ID'].unique()

array([96595, 96645, 96651, 96653, 96655])

In [6]:
data['WMO ID'] = data['WMO ID'].astype(str)

In [7]:
def stratified_sample(df, stratify_col, n, random_state=None):
    """
    Take a stratified sample of n rows from a pandas DataFrame, 
    maintaining proportions of categories in the specified column.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        stratify_col (str): Column name to stratify by.
        n (int): Desired sample size (will be capped at len(df)).
        random_state (int, optional): Seed for reproducibility.

    Returns:
        pd.DataFrame: Sampled DataFrame with n rows.
    """
    n = min(n, len(df))
    if n == 0:
        return df.iloc[0:0].copy()
    
    groups = df.groupby(stratify_col, sort=False, group_keys=False)
    group_counts = groups.size()
    proportions = group_counts / group_counts.sum()
    target_float = n * proportions
    
    # Initial allocation (floor)
    target_per_group = np.floor(target_float).astype(int)
    remainder = n - target_per_group.sum()
    
    # Distribute remainder to groups with largest fractional parts
    fractional = target_float - target_per_group
    eligible_mask = target_per_group < group_counts
    eligible_groups = fractional[eligible_mask].sort_values(ascending=False)
    
    if remainder > 0 and not eligible_groups.empty:
        num_to_add = min(remainder, len(eligible_groups))
        groups_to_add = eligible_groups.head(num_to_add).index
        target_per_group[groups_to_add] += 1
    
    # Sample from each group
    samples = []
    for name, size in target_per_group.items():
        if size == 0:
            continue
        group_df = groups.get_group(name)
        samples.append(group_df.sample(n=min(size, len(group_df)), 
                          random_state=random_state))
    
    return pd.concat(samples, ignore_index=False)


In [9]:
sampled_df = stratified_sample(data, 'WMO ID', n=100, random_state=21)
sampled_df

Unnamed: 0,WMO ID,DATA TIMESTAMP,EVAPORATION 24HOURS MM EEE,RAINFALL 24H RRRR,TEMP DRYBULB C TTTTTT,WIND SPEED FF,RELATIVE HUMIDITY PC,PRESSURE QFF MB DERIVED
2728,96595,2015-06-20 11:00:00.0 +0:00,,,28.8,0.0,75.5,1009.5
39499,96595,2020-11-02 10:00:00.0 +0:00,,,29.4,0.0,84.0,1009.3
2059,96595,2015-05-09 14:00:00.0 +0:00,,,25.0,0.0,95.2,1011.4
43536,96595,2021-04-19 14:00:00.0 +0:00,,,25.0,0.0,94.0,1010.4
49040,96595,2021-12-04 21:00:00.0 +0:00,,,24.6,0.0,93.0,1010.3
...,...,...,...,...,...,...,...,...
334028,96655,2021-12-24 10:00:00.0 +0:00,,,29.3,2.0,77.0,1006.7
295256,96655,2017-07-23 20:00:00.0 +0:00,,,23.6,3.0,91.8,1012.2
336190,96655,2022-03-24 12:00:00.0 +0:00,,,29.3,3.0,84.0,1007.6
313831,96655,2019-09-05 02:00:00.0 +0:00,,,27.9,4.0,72.0,1014.7


In [10]:
sampled_df.to_csv('/home/rzby/ffmc_dc/src/sample_df.csv', index=False)