Steps:
1. Make blocks from GFF
2. Assign each SNP to a block
3. Use indices of GT to make a DataFrame per block, and remove missing values
4. Use sampling state to sample two haplotypes from each block DF
5. Count number of segregating sites between blocks

In [77]:
import pandas as pd
import numpy as np
import random
import allel
import zarr
import itertools
import requests
import os

## (1) Make blocks from GFF

In [24]:
gff3 = allel.gff3_to_dataframe("/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3")
gff3

Unnamed: 0,seqid,source,type,start,end,score,strand,phase
0,brenthis_ino.SP_BI_364.chromosome_1,AUGUSTUS,gene,18787,19695,0.60,-,-1
1,brenthis_ino.SP_BI_364.chromosome_1,AUGUSTUS,mRNA,18787,19695,0.60,-,-1
2,brenthis_ino.SP_BI_364.chromosome_1,AUGUSTUS,stop_codon,18787,18789,-1.00,-,0
3,brenthis_ino.SP_BI_364.chromosome_1,AUGUSTUS,CDS,18790,19695,0.60,-,0
4,brenthis_ino.SP_BI_364.chromosome_1,AUGUSTUS,exon,18790,19695,-1.00,-,-1
...,...,...,...,...,...,...,...,...
483674,brenthis_ino.SP_BI_364.scaffold_9,AUGUSTUS,exon,46784,47186,-1.00,-,-1
483675,brenthis_ino.SP_BI_364.scaffold_9,AUGUSTUS,intron,47187,53566,0.88,-,-1
483676,brenthis_ino.SP_BI_364.scaffold_9,AUGUSTUS,CDS,53567,54497,0.89,-,0
483677,brenthis_ino.SP_BI_364.scaffold_9,AUGUSTUS,exon,53567,54497,-1.00,-,-1


In [34]:
def filter_ld(df, ld_bp):
    filtered = np.where([df.iloc[row, 1] - df.iloc[row-1, 2] > ld_bp for row in range(1, len(df)-1)])
    return df.iloc[filtered[0], :]

In [33]:
gff1000 = allel.gff3_to_dataframe("../test_data/brenthis_head1000.gff3")
introns_df = gff1000[gff1000["type"] == "intron"]
blocklen=100
trim_blocks(introns_df[(introns_df["end"] - introns_df["start"]) > blocklen], blocklen)

Unnamed: 0,chr,start,end
17,brenthis_ino.SP_BI_364.chromosome_1,67375,67476
20,brenthis_ino.SP_BI_364.chromosome_1,68439,68540
23,brenthis_ino.SP_BI_364.chromosome_1,69411,69512
26,brenthis_ino.SP_BI_364.chromosome_1,70046,70147
29,brenthis_ino.SP_BI_364.chromosome_1,71017,71118
...,...,...,...
920,brenthis_ino.SP_BI_364.chromosome_1,599278,599379
929,brenthis_ino.SP_BI_364.chromosome_1,601570,601671
932,brenthis_ino.SP_BI_364.chromosome_1,602531,602632
935,brenthis_ino.SP_BI_364.chromosome_1,602921,603022


In [75]:
def _trim_blocks(df, blocklen):
    start_indices = [random.randint(df.iloc[row, 3], (df.iloc[row, 4]-blocklen)) for row in range(0, len(df))]
    end_indices = np.array(start_indices) + blocklen+1
    return pd.DataFrame({"chr":df.iloc[:,0], "start":start_indices, "end":end_indices})

def _filter_ld(df, ld_bp):
    sorted = df.sort_values(["chr", "start"])

    filtered_dfs = []
    for chromosome in df["chr"].unique():
        df_chr = sorted[sorted["chr"] == chromosome]
        filtered = np.where([sorted.iloc[row, 1] - df_chr.iloc[row-1, 2] > ld_bp for row in range(1, len(df_chr)-1)])
        filtered_dfs.append(df_chr.iloc[filtered[0], :])
    
    return pd.concat(filtered_dfs)


def blocks_from_gff3(gff3_path, blocklen, ld_dist_bp):
    """Subset GFF3 file for intronic blocks of length blocklen, at least ld_dist apart"""

    pd.set_option('chained_assignment',None)
    
    gff = allel.gff3_to_dataframe(gff3_path)
    
    # Subset intronic regions
    introns_df = gff[gff["type"] == "intron"]

    # Subset len > blocklen, and trim to blocklen
    blocks_df = _trim_blocks(introns_df[(introns_df["end"] - introns_df["start"]) > blocklen], blocklen)

    # Filter for LD
    blocks_ld_filtered = _filter_ld(blocks_df, ld_dist_bp)

    # Include block information in output df
    blocks_ld_filtered.loc[:, "block_id"] = blocks_ld_filtered.loc[:,"chr"] + ":" + blocks_ld_filtered.loc[:,"start"].astype(str) + "-" + blocks_ld_filtered.loc[:,"end"].astype(str)

    return blocks_ld_filtered


gff_df = blocks_from_gff3("../test_data/brenthis_head1000.gff3",
            blocklen=100, ld_dist_bp=10000)

In [76]:
gff_df

Unnamed: 0,chr,start,end,block_id
35,brenthis_ino.SP_BI_364.chromosome_1,73731,73832,brenthis_ino.SP_BI_364.chromosome_1:73731-73832
47,brenthis_ino.SP_BI_364.chromosome_1,126210,126311,brenthis_ino.SP_BI_364.chromosome_1:126210-126311
50,brenthis_ino.SP_BI_364.chromosome_1,141537,141638,brenthis_ino.SP_BI_364.chromosome_1:141537-141638
127,brenthis_ino.SP_BI_364.chromosome_1,237755,237856,brenthis_ino.SP_BI_364.chromosome_1:237755-237856
187,brenthis_ino.SP_BI_364.chromosome_1,280458,280559,brenthis_ino.SP_BI_364.chromosome_1:280458-280559
353,brenthis_ino.SP_BI_364.chromosome_1,337149,337250,brenthis_ino.SP_BI_364.chromosome_1:337149-337250
359,brenthis_ino.SP_BI_364.chromosome_1,351434,351535,brenthis_ino.SP_BI_364.chromosome_1:351434-351535
368,brenthis_ino.SP_BI_364.chromosome_1,365089,365190,brenthis_ino.SP_BI_364.chromosome_1:365089-365190
508,brenthis_ino.SP_BI_364.chromosome_1,419328,419429,brenthis_ino.SP_BI_364.chromosome_1:419328-419429
589,brenthis_ino.SP_BI_364.chromosome_1,469663,469764,brenthis_ino.SP_BI_364.chromosome_1:469663-469764


In [67]:
block_dist = [gff_df.iloc[row, 1]-gff_df.iloc[row-1, 2] for row in range(1, len(gff_df))]
block_dist

[58875, 105057, 42817, 56447, 15058, 12910, 58498, 45001, 12193, 47381]

In [73]:
distance_between_blocks = [gff_df.iloc[row, 1] -
                               gff_df.iloc[row-1, 2] for row in range(1, len(gff_df))]

assert (np.array(distance_between_blocks) > 10000).all()


[58875, 105057, 42817, 56447, 15058, 12910, 58498, 45001, 12193, 47381]


In [18]:
subset_gff3(requests.get("https://www.dropbox.com/scl/fi/wyew115q4j4xqh5kh973r/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3?rlkey=sq4tmjgmtuhx54v6iql9o8e8t&dl=0"),
            100, 1000)

AttributeError: 'Response' object has no attribute 'endswith'

In [7]:
gff = pd.read_table("/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3",
               nrows=1000,
               usecols=[0, 2, 3, 4],
               sep="\t",
               comment="#",
               header=None)

gff.columns = ["chr", "feature", "start", "end"]

def subset_introns(gff):
    return gff[gff.iloc[:,1] == "intron"]

def subset_length(gff, length):
    return gff[gff.iloc[:,3] - gff.iloc[:,2] > length]

def filter_ld(gff, ld_bp, sort=True):
    #if sort:
     #   gff = gff.sort_values(by=["start", "end"])
    
    filtered = np.where([gff.iloc[row, -2] - gff.iloc[row-1, -1] > ld_bp for row in range(1, len(gff)-1)])

    return gff.iloc[filtered[0], :]

def trim_blocks(gff, length):
    start_indices = [random.randint(gff.iloc[row, -2], (gff.iloc[row, -1]-length)) for row in range(0, len(gff))]
    end_indices = np.array(start_indices) + length+1
    return pd.DataFrame({"chr":gff.iloc[:,0], "start":start_indices, "end":end_indices})

def make_block_indices(gff, blocklength, ld_bp, nrows=None):

    gff = pd.read_table(gff,
               nrows=nrows,
               usecols=[0, 2, 3, 4],
               sep="\t",
               comment="#",
               header=None)

    gff.columns = ["chr", "feature", "start", "end"]

    introns = subset_introns(gff)
    introns_filtered_len = subset_length(introns, blocklength)
    introns_filtered_ld = filter_ld(introns_filtered_len, ld_bp)
    block_indices = trim_blocks(introns_filtered_ld, blocklength)
    block_indices["block_id"] = block_indices["chr"] + ":" + block_indices["start"].astype(str) + "-" + block_indices["end"].astype(str)

    return block_indices

blocks = make_block_indices("/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3", 100, 10000)
blocks

Unnamed: 0,chr,start,end,block_id
35,brenthis_ino.SP_BI_364.chromosome_1,73671,73772,brenthis_ino.SP_BI_364.chromosome_1:73671-73772
50,brenthis_ino.SP_BI_364.chromosome_1,143980,144081,brenthis_ino.SP_BI_364.chromosome_1:143980-144081
127,brenthis_ino.SP_BI_364.chromosome_1,237688,237789,brenthis_ino.SP_BI_364.chromosome_1:237688-237789
187,brenthis_ino.SP_BI_364.chromosome_1,280200,280301,brenthis_ino.SP_BI_364.chromosome_1:280200-280301
359,brenthis_ino.SP_BI_364.chromosome_1,352222,352323,brenthis_ino.SP_BI_364.chromosome_1:352222-352323
...,...,...,...,...
483342,brenthis_ino.SP_BI_364.scaffold_2,1972,2073,brenthis_ino.SP_BI_364.scaffold_2:1972-2073
483447,brenthis_ino.SP_BI_364.scaffold_5,12075,12176,brenthis_ino.SP_BI_364.scaffold_5:12075-12176
483477,brenthis_ino.SP_BI_364.scaffold_5,45474,45575,brenthis_ino.SP_BI_364.scaffold_5:45474-45575
483621,brenthis_ino.SP_BI_364.scaffold_8,40533,40634,brenthis_ino.SP_BI_364.scaffold_8:40533-40634


## (2) Assign each SNP to a block

In [3]:
vcf_path = "/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino_daphne.vcf.gz"
zarr_path = "/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/zarrstore"

try:
    allel.vcf_to_zarr(vcf_path, zarr_path,
                          fields=["samples", "calldata/GT", "variants/CHROM", "variants/POS"], overwrite=False)
except Exception: # if store already exists, proceed
    pass 

callset = zarr.open_group(zarr_path, mode='r')
chromosomes = callset["variants/CHROM"][:]
samples = callset["samples"][:]
gt = callset["calldata/GT"][:]
pos = callset["variants/POS"][:]

callset_positions = pd.DataFrame({"chr":chromosomes, "pos":pos})



In [89]:
class CallSet:
    """Class to represent a skallele callset (i.e. a VCF)"""

    def __init__(self, vcf_path=None, zarr_path=None):
        
        if zarr_path is None:
            self.zarr_path = "zarrstore"
        else:
            self.zarr_path = zarr_path

        
        if vcf_path is None:
            assert zarr_path is not None

        self.vcf_path = vcf_path
        if not os.path.exists(self.zarr_path):
            allel.vcf_to_zarr(self.vcf_path, self.zarr_path,
                          fields=["samples", "calldata/GT", "variants/CHROM", "variants/POS"], overwrite=False)
            
        
        self.callset = zarr.open_group(zarr_path, mode='r')
        self.chromosomes = self.callset["variants/CHROM"][:]
        self.samples = self.callset["samples"][:]
        self.gt = self.callset["calldata/GT"][:]
        self.pos = self.callset["variants/POS"][:]
        self.callset_positions_df = pd.DataFrame({"chr":self.chromosomes, "pos":self.pos})
       

In [127]:
callset = CallSet(vcf_path="../test_data/brenthis_500K.vcf.gz", 
        zarr_path="../test_data/zarrstore")

In [128]:
zarr_path = "/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/zarrstore"
os.path.exists(zarr_path)


True

In [129]:
def get_block_snps(callset, blocks):
    """Create DataFrame of SNPs and which block they belong to [chr, pos, block, gt_idx]"""
    positions = callset.callset_positions_df
    positions["gt_idx"] = positions.index

    blocks_snps_df = pd.concat([positions[positions["pos"].between(blocks.iloc[row, 1],
                                                                blocks.iloc[row, 2])]
                                                                  for row in range(len(blocks))])
  
    snp_pos_block = []

    blocks_snps_df_ = blocks_snps_df.reset_index()

    for row in range(len(blocks_snps_df_)):
      position = blocks_snps_df_.iloc[row, 2]
      snp_block = blocks[(blocks["start"].astype(int) <= position) 
                         & (blocks["end"].astype(int) >= position)][["chr", "block_id"]]
      snp_pos_block.append((snp_block.iloc[0][0], position, snp_block.iloc[0][1]))

    snp_blocks = pd.DataFrame(snp_pos_block)
    snp_blocks.columns = ["chr", "pos", "block_id"]

    snp_blocks = snp_blocks.merge(positions, on=["chr", "pos"]).drop_duplicates()
      
    return snp_blocks

get_block_snps(callset, blocks)

Unnamed: 0,chr,pos,block_id,gt_idx
0,brenthis_ino.SP_BI_364.chromosome_1,73671,brenthis_ino.SP_BI_364.chromosome_1:73671-73772,1594
1,brenthis_ino.SP_BI_364.chromosome_1,73696,brenthis_ino.SP_BI_364.chromosome_1:73671-73772,1595
2,brenthis_ino.SP_BI_364.chromosome_1,73762,brenthis_ino.SP_BI_364.chromosome_1:73671-73772,1596
3,brenthis_ino.SP_BI_364.chromosome_1,143980,brenthis_ino.SP_BI_364.chromosome_1:143980-144081,6736
4,brenthis_ino.SP_BI_364.chromosome_1,144001,brenthis_ino.SP_BI_364.chromosome_1:143980-144081,6737
...,...,...,...,...
57,brenthis_ino.SP_BI_364.chromosome_1,469439,brenthis_ino.SP_BI_364.chromosome_1:469429-469530,31056
58,brenthis_ino.SP_BI_364.chromosome_1,469461,brenthis_ino.SP_BI_364.chromosome_1:469429-469530,31057
59,brenthis_ino.SP_BI_364.chromosome_1,469481,brenthis_ino.SP_BI_364.chromosome_1:469429-469530,31058
60,brenthis_ino.SP_BI_364.chromosome_1,469487,brenthis_ino.SP_BI_364.chromosome_1:469429-469530,31059


In [130]:
positions = callset.callset_positions_df
positions["gt_idx"] = positions.index

for row in range(len(blocks)):
    

blocks_snps_df = pd.concat([positions[positions["pos"].between(blocks.iloc[row, 1],
                                                                blocks.iloc[row, 2])]
                                                                  for row in range(len(blocks))])

IndentationError: expected an indented block after 'for' statement on line 4 (2163215902.py, line 7)

In [None]:
blocks

Unnamed: 0,chr,start,end,block_id
35,brenthis_ino.SP_BI_364.chromosome_1,73671,73772,brenthis_ino.SP_BI_364.chromosome_1:73671-73772
50,brenthis_ino.SP_BI_364.chromosome_1,143980,144081,brenthis_ino.SP_BI_364.chromosome_1:143980-144081
127,brenthis_ino.SP_BI_364.chromosome_1,237688,237789,brenthis_ino.SP_BI_364.chromosome_1:237688-237789
187,brenthis_ino.SP_BI_364.chromosome_1,280200,280301,brenthis_ino.SP_BI_364.chromosome_1:280200-280301
359,brenthis_ino.SP_BI_364.chromosome_1,352222,352323,brenthis_ino.SP_BI_364.chromosome_1:352222-352323
...,...,...,...,...
483342,brenthis_ino.SP_BI_364.scaffold_2,1972,2073,brenthis_ino.SP_BI_364.scaffold_2:1972-2073
483447,brenthis_ino.SP_BI_364.scaffold_5,12075,12176,brenthis_ino.SP_BI_364.scaffold_5:12075-12176
483477,brenthis_ino.SP_BI_364.scaffold_5,45474,45575,brenthis_ino.SP_BI_364.scaffold_5:45474-45575
483621,brenthis_ino.SP_BI_364.scaffold_8,40533,40634,brenthis_ino.SP_BI_364.scaffold_8:40533-40634


In [None]:
callset.callset_positions_df

Unnamed: 0,chr,pos,gt_idx
0,brenthis_ino.SP_BI_364.chromosome_1,7731,0
1,brenthis_ino.SP_BI_364.chromosome_1,7817,1
2,brenthis_ino.SP_BI_364.chromosome_1,8099,2
3,brenthis_ino.SP_BI_364.chromosome_1,8157,3
4,brenthis_ino.SP_BI_364.chromosome_1,8223,4
...,...,...,...
33333,brenthis_ino.SP_BI_364.chromosome_1,499408,33333
33334,brenthis_ino.SP_BI_364.chromosome_1,499460,33334
33335,brenthis_ino.SP_BI_364.chromosome_1,499498,33335
33336,brenthis_ino.SP_BI_364.chromosome_1,499509,33336


In [None]:
df = callset.callset_positions_df

df.merge(blocks, how="left", on=["chr", ])

out = blocks.merge(df, how='left', on="chr") \
         .query('pos.between(`start`, `end`)')

In [131]:
def block_snps(callset, blocks):
    """Merge block and callset information to get dataframe of SNPs and which block they belong to"""
    return blocks.merge(callset.callset_positions_df, how='left', on="chr") \
         .query('pos.between(`start`, `end`)')

In [132]:
df = block_snps(callset, blocks)

In [133]:
assert (df.columns == [
            "chr", "start", "end", "block_id", "pos", "gt_idx"]).all()

In [134]:
df.columns

Index(['chr', 'start', 'end', 'block_id', 'pos', 'gt_idx'], dtype='object')

In [145]:
block_snps_df = get_block_snps(callset, blocks)

## (3) Use GT information to make a DF of calls per block

In [147]:
def block_callset(callset, gt_idxs, ploidy=2):
    """Get the callset for a block defined by GT indices, divided by haplotype"""
    block_calls = pd.DataFrame(list(itertools.product(callset.samples, list(range(ploidy)))),
                        columns=["sample", "hap"])
    
    for idx in gt_idxs:
        block_calls[idx] = callset.gt[idx].flatten()

    return block_calls.dropna()

block_callset_df(callset, list(range(1595, 1598)), 2)

Unnamed: 0,sample,hap,1595,1596,1597
0,SE_BI_1495,0,0,0,0
1,SE_BI_1495,1,0,0,0
2,FR_BI_1497,0,0,0,0
3,FR_BI_1497,1,0,0,1
4,FR_BD_1329,0,0,1,1
5,FR_BD_1329,1,0,1,1
6,RS_BI_1496,0,0,0,0
7,RS_BI_1496,1,1,0,1
8,RO_BD_956,0,0,1,1
9,RO_BD_956,1,0,1,1


In [148]:
def n_segr_sites(block_callset):
    """Count segregating sites between samples of length 2 from a block callset"""
    return sum(block_callset.iloc[0, 2:] != block_callset.iloc[1, 2:])

In [149]:
def segregating_sites_spectrum(callset, block_snps, samples_map_df, blocklen, ploidy=2, sampling_probabilities=[0.25, 0.25, 0.5]):

    rng = np.random.default_rng()

    populations = samples_map_df["population"].unique()
    pop1 = list(samples_map_df["sample"]
                [samples_map_df["population"] == populations[0]])
    pop2 = list(samples_map_df["sample"]
                [samples_map_df["population"] == populations[1]])

    sss = np.zeros(shape=(3, blocklen))

    for block in block_snps["block_id"].unique():
        block_gt_idxs = list(
            block_snps[block_snps["block_id"] == block]["gt_idx"])
        block_callset_df = block_callset(callset,
                                         block_gt_idxs,
                                         ploidy=ploidy)

        sampling_state = int(np.where(rng.multinomial(
            n=1, pvals=sampling_probabilities) == 1)[0]) + 1

        if sampling_state == 1:
            block_sss = n_segr_sites(
                block_callset_df[block_callset_df["sample"].isin(pop1)].sample(n=2))
        elif sampling_state == 2:
            block_sss = n_segr_sites(
                block_callset_df[block_callset_df["sample"].isin(pop2)].sample(n=2))
        else:
            assert sampling_state == 3
            block_sss = n_segr_sites(pd.concat([block_callset_df[block_callset_df["sample"].isin(pop1)].sample(n=1),
                                     block_callset_df[block_callset_df["sample"].isin(pop2)].sample(n=2)]))

        sss[sampling_state-1, int(block_sss)
            ] = sss[sampling_state-1, int(block_sss)] + 1

    return sss


In [150]:
samples_map_df = pd.DataFrame({"sample": callset.samples,
                               "population": ["BI", "BI", "BD", "BI", "BD", "BI", "BI", "BD", "BI", "BD", "BD", "BD", "BD"]})

In [152]:
s3 = segregating_sites_spectrum(callset, block_snps_df, samples_map_df=samples_map_df, blocklen=100)

In [154]:
np.sum(s3)

7.0

In [141]:
s3 = segregating_sites_spectrum(
    callset,
    snp_positions=get_snp_positions(callset["variants/CHROM"][:], pos=callset["variants/POS"][:], blocks=blocks),
    ploidy=2,
    sampling_probabilities=[0.25, 0.25, 0.5],
    samples_df=samples_df,
    blocklen=100
)

TypeError: 'CallSet' object is not subscriptable

In [13]:
s3

array([[444., 263., 130., 104.,  57.,  24.,  23.,  11.,  12.,   2.,   3.,
          1.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.],
       [429., 266., 138.,  77.,  46.,  20.,  19.,  15.,   9.,   1.,   2.,
          0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0

In [None]:
def from_gff3_vcf(gff3_path,
                  vcf_path,
                  zarr_path,
                  samples_map,
                  blocklen=100,
                    ld_dist_bp=1000,
                    ploidy=2,
                    sampling_probabilities=[0.25, 0.25, 0.5]):
    
    """Preprocess a GFF3 annotation and VCF file to segregating sites spectrum."""
    
    blocks = gff3_to_blocks(gff3_path, blocklen=blocklen, ld_dist_bp=ld_dist_bp)
    callset = CallSet(vcf_path, zarr_path)
    block_snps_df = block_snps(callset, blocks)

    samples_map_df = pd.read_csv(samples_map)

    s3 = segregating_sites_spectrum(callset,
                                    block_snps=block_snps_df,
                                    samples_map_df=samples_map_df,
                                    blocklen=blocklen,
                                    ploidy=ploidy,
                                    sampling_probabilities=sampling_probabilities)
    
    return s3