In [1]:
import pandas as pd
import numpy as np
from samplics import SelectMethod
from samplics.sampling import SampleSelection

import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

%matplotlib inline
# Seed NumPy's global random number generator for reproducibility.
np.random.seed(42)

ModuleNotFoundError: No module named 'samplics'

In [2]:
# Directory layout, resolved relative to the parent of the current working
# directory (the notebook is presumably started from a sub-folder of the
# project root -- confirm if it is run from elsewhere).
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')
RAW_DATA_DIR, PROCESSED_DATA_DIR, TEMP_DATA_DIR = (
    DATA_DIR.joinpath(subdir) for subdir in ('raw', 'processed', 'temp')
)

# Load the population grid sampling frame (25 km buffer) and display it.
sampling_frame_25km = pd.read_csv(
    PROCESSED_DATA_DIR.joinpath('sampling_frame_25km.csv')
)
sampling_frame_25km

Unnamed: 0.1,Unnamed: 0,count,sum,nodata,grid_id,station_name,buffer_km,valid_pixels,nodata_pixels,total_pixels,prop_nodata,population_count,household_count
0,1263,24,224.967194,97.0,22475,Aisa FM,25.0,24,97.0,121.0,0.801653,224.967194,45.911672
1,1264,59,626.496155,62.0,22305,Aisa FM,25.0,59,62.0,121.0,0.512397,626.496155,127.856358
2,1265,78,853.466797,43.0,22135,Aisa FM,25.0,78,43.0,121.0,0.355372,853.466797,174.176897
3,1266,70,734.965271,51.0,22476,Aisa FM,25.0,70,51.0,121.0,0.421488,734.965271,149.992912
4,1267,50,559.832336,71.0,22306,Aisa FM,25.0,50,71.0,121.0,0.586777,559.832336,114.251497
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4780,9646,66,340.077698,55.0,12370,Dokolo FM,25.0,66,55.0,121.0,0.454545,340.077698,69.403612
4781,9647,51,233.072800,70.0,12200,Dokolo FM,25.0,51,70.0,121.0,0.578512,233.072800,47.565877
4782,9648,82,412.326538,39.0,12030,Dokolo FM,25.0,82,39.0,121.0,0.322314,412.326538,84.148273
4783,9649,56,270.777649,65.0,12201,Dokolo FM,25.0,56,65.0,121.0,0.537190,270.777649,55.260745


In [3]:
# Clusters to draw per radio station: 35 main + 35 replacement.
no_of_clusters = 70

# First-stage (PSU) frame: keep only the columns used further down.
psu_frame = sampling_frame_25km[
    ['grid_id', 'station_name', 'population_count', 'household_count', 'buffer_km']
]

# Every stratum (radio station) is given the same number of clusters.
psu_sample_size = dict.fromkeys(psu_frame.station_name.unique(), no_of_clusters)
print(f"\nThe number of clusters per stratum (radio staton) is:\n {psu_sample_size}")
psu_frame



The number of clusters per stratum (radio staton) is:
 {'Aisa FM': 70, 'Dwanwana FM': 70, 'Dokolo FM': 70}


Unnamed: 0,grid_id,station_name,population_count,household_count,buffer_km
0,22475,Aisa FM,224.967194,45.911672,25.0
1,22305,Aisa FM,626.496155,127.856358,25.0
2,22135,Aisa FM,853.466797,174.176897,25.0
3,22476,Aisa FM,734.965271,149.992912,25.0
4,22306,Aisa FM,559.832336,114.251497,25.0
...,...,...,...,...,...
4780,12370,Dokolo FM,340.077698,69.403612,25.0
4781,12200,Dokolo FM,233.072800,47.565877,25.0
4782,12030,Dokolo FM,412.326538,84.148273,25.0
4783,12201,Dokolo FM,270.777649,55.260745,25.0


In [4]:
# First-stage selection: draw the PSUs (grid cells) separately for every radio
# station with systematic PPS, using the household count as measure of size.
_full_frame = []  # one selection result per station (whole station frame + flags)

for station in psu_frame.station_name.unique():
    print(station)
    station_frame = psu_frame[psu_frame.station_name == station].copy()

    # Take the sample size from the per-station configuration rather than a
    # hard-coded literal, so the draw cannot drift away from `psu_sample_size`.
    station_samp_size = psu_sample_size[station]

    # The design object itself is unstratified: stratification by radio station
    # is done by this loop, because grid_ids overlap between stations.
    stage1_design = SampleSelection(
        method=SelectMethod.pps_sys,  # systematic, proportional to size
        strat=False,
        wr=False,  # sampling without replacement
    )

    # Re-seed right before the draw so each station's selection is reproducible
    # on its own (a single call is enough; nothing else runs in between).
    # NOTE(review): every station is drawn from the same seed -- confirm that
    # sharing the random start across stations is intended.
    np.random.seed(42)
    sample = stage1_design.select(
        samp_unit=station_frame["grid_id"],
        samp_size=station_samp_size,
        mos=station_frame["household_count"],
        to_dataframe=True,
        sample_only=False,
    )

    # The filter further down keeps rows with `_hits == 1`; fail loudly if that
    # would not return exactly the requested number of clusters.
    n_selected = int((sample['_hits'] == 1).sum())
    if n_selected != station_samp_size:
        raise ValueError(
            f"{station}: expected {station_samp_size} clusters with _hits == 1, "
            f"got {n_selected}"
        )

    # Attach the frame columns. Merging per station keeps grid cells that are
    # shared between stations as separate rows (one per station); within a
    # station every grid_id must match exactly one frame row.
    sample = sample.merge(
        station_frame,
        left_on='_samp_unit',
        right_on='grid_id',
        validate='one_to_one',
    )
    _full_frame.append(sample)

# Stack the per-station results into one frame with a fresh 0..n-1 index.
final_all = pd.concat(_full_frame, ignore_index=True)

print("number of sampled clusters by radio station", final_all.groupby('station_name')._hits.value_counts(dropna=False))

# Keep only the selected clusters; copy so later column assignments do not
# touch `final_all`.
all_sampled_clusters = final_all.loc[final_all['_hits'] == 1].copy()

all_sampled_clusters

Aisa FM
Dwanwana FM
Dokolo FM
number of sampled clusters by radio station station_name  _hits
Aisa FM       0        1390
              1          70
Dokolo FM     0        1585
              1          70
Dwanwana FM   0        1600
              1          70
Name: count, dtype: int64


Unnamed: 0,_samp_unit,_mos,_sample,_hits,_probs,grid_id,station_name,population_count,household_count,buffer_km
5,21965,84.087425,True,1,0.045243,21965,Aisa FM,412.028381,84.087425,25.0
17,22136,176.962442,True,1,0.095213,22136,Aisa FM,867.115967,176.962442,25.0
30,21627,274.034050,True,1,0.147442,21627,Aisa FM,1342.766846,274.034050,25.0
41,20438,226.454729,True,1,0.121842,20438,Aisa FM,1109.628174,226.454729,25.0
52,21119,163.725536,True,1,0.088091,21119,Aisa FM,802.255127,163.725536,25.0
...,...,...,...,...,...,...,...,...,...,...
4677,9481,99.663379,True,1,0.074282,9481,Dokolo FM,488.350555,99.663379,25.0
4699,9312,62.132182,True,1,0.046309,9312,Dokolo FM,304.447693,62.132182,25.0
4722,11348,86.700838,True,1,0.064621,11348,Dokolo FM,424.834106,86.700838,25.0
4746,11690,44.783437,True,1,0.033378,11690,Dokolo FM,219.438843,44.783437,25.0


In [5]:
# Assign main and replacement status (sampled clusters only, 70 per station).
# Re-seed so the random main/replacement shuffle below is reproducible.
np.random.seed(42)


# create main/replacement assignment within each station
def assign_status(station_group, n_main=35, n_replacement=35):
    """Randomly label the rows of one station as 'main' or 'replacement'.

    Parameters
    ----------
    station_group : pandas.DataFrame or pandas.Series
        The sampled clusters of a single station. Only its length and its
        index are used.
    n_main : int, default 35
        Number of rows labelled 'main'.
    n_replacement : int, default 35
        Number of rows labelled 'replacement'.

    Returns
    -------
    pandas.Series
        Labels indexed like ``station_group``, containing exactly ``n_main``
        'main' and ``n_replacement`` 'replacement' entries in random order.
        The order is drawn from NumPy's global RNG (``np.random.shuffle``).

    Raises
    ------
    ValueError
        If ``station_group`` does not have ``n_main + n_replacement`` rows.
    """
    n = len(station_group)
    expected = n_main + n_replacement
    if n != expected:
        # Fail with a message that names the problem instead of the generic
        # pandas length-mismatch error the Series constructor would raise.
        raise ValueError(
            f"expected {expected} sampled clusters "
            f"({n_main} main + {n_replacement} replacement) but got {n}"
        )

    status = np.array(['main'] * n_main + ['replacement'] * n_replacement)
    np.random.shuffle(status)  # in-place shuffle using the global RNG
    return pd.Series(status, index=station_group.index)


# Assign the status station by station. A single column is selected after the
# groupby so the grouping column is not handed to `assign_status`: applying a
# function to the whole grouped frame (grouping column included) is deprecated
# in recent pandas. `assign_status` only needs each group's length and index,
# so any column works here.
all_sampled_clusters['cluster_type'] = (
    all_sampled_clusters
    .groupby('station_name', group_keys=False)['grid_id']
    .apply(assign_status)
)

# Cross-tabulate station against status to check the split per station.
print(pd.crosstab(all_sampled_clusters.station_name, all_sampled_clusters.cluster_type))



cluster_type  main  replacement
station_name                   
Aisa FM         35           35
Dokolo FM       35           35
Dwanwana FM     35           35


  all_sampled_clusters['cluster_type'] = all_sampled_clusters.groupby('station_name', group_keys=False).apply(assign_status)


In [6]:


# Bit of cleanup: expose the selection probabilities under descriptive names.

# Number of households taken per sampled grid cell (second stage). Kept in one
# place because it drives both the probability and its explanation text.
HOUSEHOLDS_PER_CLUSTER = 12

# First stage: probability of the grid cell being selected within its station.
all_sampled_clusters['psu_prob'] = all_sampled_clusters['_probs']
all_sampled_clusters['psu_explanation'] = '1km by 1km grid cells in 25 km radio station range populated >18 households'

# Second stage: inclusion probability of a household inside a sampled cell,
# based on the modelled household count. Needs to be recalculated from the
# actual household listing. Clipped at 1 because a cell with fewer households
# than the take cannot have an inclusion probability above 1.
all_sampled_clusters['ssu_prob'] = (
    HOUSEHOLDS_PER_CLUSTER / all_sampled_clusters['household_count']
).clip(upper=1.0)
all_sampled_clusters['ssu_explanation'] = (
    f'second stage unit=households, inclusion probability {HOUSEHOLDS_PER_CLUSTER} households per 1km by 1km grid cell'
)






In [7]:
# Columns written to the sampled-clusters file; the helper columns produced by
# the selection step (`_samp_unit`, `_mos`, `_sample`, `_hits`, `_probs`) are
# left out.
export_columns = [
    'grid_id', 'station_name', 'population_count', 'household_count', 'buffer_km',
    'cluster_type', 'psu_prob', 'psu_explanation', 'ssu_prob',
    'ssu_explanation',
]

# Make sure the target folder exists; to_csv does not create missing directories.
TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)
all_sampled_clusters[export_columns].to_csv(TEMP_DATA_DIR/'sampled_clusters.csv')
all_sampled_clusters

Unnamed: 0,_samp_unit,_mos,_sample,_hits,_probs,grid_id,station_name,population_count,household_count,buffer_km,cluster_type,psu_prob,psu_explanation,ssu_prob,ssu_explanation
5,21965,84.087425,True,1,0.045243,21965,Aisa FM,412.028381,84.087425,25.0,main,0.045243,1km by 1km grid cells in 25 km radio station r...,0.142709,"second stage unit=households, inclusion probab..."
17,22136,176.962442,True,1,0.095213,22136,Aisa FM,867.115967,176.962442,25.0,main,0.095213,1km by 1km grid cells in 25 km radio station r...,0.067811,"second stage unit=households, inclusion probab..."
30,21627,274.034050,True,1,0.147442,21627,Aisa FM,1342.766846,274.034050,25.0,replacement,0.147442,1km by 1km grid cells in 25 km radio station r...,0.043790,"second stage unit=households, inclusion probab..."
41,20438,226.454729,True,1,0.121842,20438,Aisa FM,1109.628174,226.454729,25.0,main,0.121842,1km by 1km grid cells in 25 km radio station r...,0.052991,"second stage unit=households, inclusion probab..."
52,21119,163.725536,True,1,0.088091,21119,Aisa FM,802.255127,163.725536,25.0,replacement,0.088091,1km by 1km grid cells in 25 km radio station r...,0.073293,"second stage unit=households, inclusion probab..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4677,9481,99.663379,True,1,0.074282,9481,Dokolo FM,488.350555,99.663379,25.0,replacement,0.074282,1km by 1km grid cells in 25 km radio station r...,0.120405,"second stage unit=households, inclusion probab..."
4699,9312,62.132182,True,1,0.046309,9312,Dokolo FM,304.447693,62.132182,25.0,replacement,0.046309,1km by 1km grid cells in 25 km radio station r...,0.193137,"second stage unit=households, inclusion probab..."
4722,11348,86.700838,True,1,0.064621,11348,Dokolo FM,424.834106,86.700838,25.0,replacement,0.064621,1km by 1km grid cells in 25 km radio station r...,0.138407,"second stage unit=households, inclusion probab..."
4746,11690,44.783437,True,1,0.033378,11690,Dokolo FM,219.438843,44.783437,25.0,main,0.033378,1km by 1km grid cells in 25 km radio station r...,0.267956,"second stage unit=households, inclusion probab..."


In [8]:
# Show the column labels of the sampled-cluster frame.
all_sampled_clusters.columns

Index(['_samp_unit', '_mos', '_sample', '_hits', '_probs', 'grid_id',
       'station_name', 'population_count', 'household_count', 'buffer_km',
       'cluster_type', 'psu_prob', 'psu_explanation', 'ssu_prob',
       'ssu_explanation'],
      dtype='object')