In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.notebook import tqdm

### Primer rebalancing

In this notebook, we will use the coverage calculations from a MiSeq run to 'rebalance' the primer pools, so that we have more even coverage. This is according to T112_LAB_SOP011_GBS_Testing_&_Rebalancing_V6.docx from GSU. 

In [2]:
# Read the tab-separated sample sheet; its `sampleID` column drives the
# per-sample coverage loading further down.
metadata = pd.read_table("../../config/metadata.tsv")
metadata.shape

(672, 9)

In [81]:
## Load the coverage data for each sample
# Where several target SNPs sit on the same amplicon, collapse them to the
# mean depth so there is exactly one row per amplicon per sample.

per_sample_depths = []
for sample in tqdm(metadata.sampleID):
    regions = pd.read_csv(
        f"../../results/coverage/{sample}.regions.bed.gz",
        sep="\t",
        header=None,
        names=['contig', 'start', 'end', 'amplicon', 'depth'],
    )
    mean_depth = (
        regions.groupby('amplicon')['depth']
        .mean()
        .reset_index()
        .sort_values('amplicon')
    )
    # Tag every row with the sample it came from (long format).
    per_sample_depths.append(mean_depth.assign(sample=sample))

dfs = pd.concat(per_sample_depths)

# Drop any sample whose ID contains 'negative', 'Negative' or 'random'
# (negative controls), then renumber the rows.
is_excluded = dfs['sample'].str.contains('negative|Negative|random')
dfs = dfs[~is_excluded].reset_index(drop=True)
dfs.shape

  0%|          | 0/672 [00:00<?, ?it/s]

(54202, 3)

Convert the dataframe to amplicons x samples depth table. 

In [5]:
# Reshape the long (amplicon, depth, sample) table to wide form:
# one row per amplicon, one column per sample, cells hold the depth.
depth_df = dfs.pivot(columns='sample', index='amplicon', values='depth')
depth_df.head(2)

sample,Calvin_01,Calvin_02,Calvin_03,Calvin_04,Calvin_05,Calvin_06,Calvin_07,Calvin_08,Calvin_09,Calvin_10,...,VK7_dead_34,VK7_dead_34_dil,VK7_dead_35,VK7_dead_36,VK7_dead_37,VK7_dead_38,VK7_dead_39,VK7_dead_40,VK7_dead_41,VK7_dead_42
amplicon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agam_1,68.0,19.0,50.0,70.0,25.0,32.0,8.0,74.0,28.0,36.0,...,61.0,12.0,157.0,78.0,101.0,124.0,158.0,126.0,79.0,89.0
Agam_10,222.0,40.0,174.0,198.0,109.0,75.0,10.0,226.0,49.0,130.0,...,86.0,11.0,236.0,114.0,171.0,155.0,234.0,204.0,150.0,202.0


Calculate the total reads per amplicon, and the total reads per sample.

In [84]:
# Export the amplicon x sample depth table to an Excel workbook; the path is
# relative, so the file is written to the current working directory.
depth_df.to_excel("sample_amplicon_depth.xlsx")

In [7]:
# Row sums: total depth of each amplicon summed over all samples.
tot_per_amplicon = depth_df.sum(axis="columns")
# Column sums: total depth of each sample summed over all amplicons.
tot_per_sample = depth_df.sum(axis="index")

In [12]:
# Optional, cosmetic step: reorder rows and columns by ascending total depth
# so the poorest amplicons/samples come first. The calculations that follow
# do not depend on this ordering.

sample_order = tot_per_sample.sort_values().index.to_list()
amplicon_order = tot_per_amplicon.sort_values().index.to_list()
depth_df = depth_df.loc[amplicon_order, sample_order]
depth_df

sample,Siaya_Delta_Dead_90,Siaya_Delta_Dead_72,Siaya_Delta_Dead_37,Siaya_Delta_Dead_49,Siaya_Delta_Dead_48,Siaya_Delta_Dead_74,Siaya_Delta_Alive_60,GM_96,Siaya_Delta_Alive_81,Siaya_Delta_Dead_83,...,GH_57,VK7_alive_30,Siaya_Delta_Alive_56,GH_58,VK7_dead_03,Siaya_Delta_Alive_53,GH_25,GH_67,GH_33,GH_79
amplicon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agam_39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
Agam_82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,2.0,6.0
Agam_81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,2.0,6.0
Agam_62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,18.0,0.0,2.0,22.0,2.0,6.0,10.0,2.0
Agam_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22.0,29.0,46.0,23.0,28.0,40.0,28.0,36.0,39.0,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Agam_19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,...,473.0,505.0,658.0,486.0,582.0,702.0,422.0,752.0,810.0,590.0
Agam_35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,490.0,517.0,578.0,550.0,488.0,577.0,738.0,681.0,736.0,574.0
Agam_75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,1.0,...,750.0,785.0,1330.0,993.0,792.0,1501.0,1013.0,1208.0,1506.0,1207.0
Agam_48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,...,1318.0,1096.0,1198.0,1207.0,1100.0,1223.0,1035.0,1540.0,1558.0,1213.0


Divide each value in the amplicon x sample table by the total reads for its sample to get the fraction of each sample's reads that fall on each amplicon/target. 

In [18]:
# Normalise every sample column by that sample's total depth, so each cell is
# the share of the sample's reads that landed on the amplicon.
fraction_df = depth_df.div(tot_per_sample, axis="columns")
fraction_df

sample,Calvin_01,Calvin_02,Calvin_03,Calvin_04,Calvin_05,Calvin_06,Calvin_07,Calvin_08,Calvin_09,Calvin_10,...,VK7_dead_34,VK7_dead_34_dil,VK7_dead_35,VK7_dead_36,VK7_dead_37,VK7_dead_38,VK7_dead_39,VK7_dead_40,VK7_dead_41,VK7_dead_42
amplicon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agam_39,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000056,0.000000,0.000090,0.000077,0.000000,0.000079,0.000098,0.000078
Agam_82,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000535,0.000000,0.000000,0.000501,0.000090,0.000308,0.000000,0.000315,0.000391,0.000621
Agam_81,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000535,0.000000,0.000000,0.000501,0.000090,0.000308,0.000000,0.000315,0.000391,0.000621
Agam_62,0.000954,0.002026,0.000656,0.001020,0.001268,0.001437,0.000377,0.000364,0.000000,0.000309,...,0.000000,0.000000,0.000222,0.000251,0.000179,0.000308,0.000130,0.000000,0.000391,0.000155
Agam_7,0.002291,0.001013,0.001094,0.001530,0.000846,0.001676,0.000000,0.001639,0.000549,0.002010,...,0.001872,0.000000,0.001613,0.002381,0.002242,0.005475,0.002727,0.001734,0.002055,0.002173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Agam_19,0.027678,0.036967,0.028770,0.030692,0.037203,0.019872,0.003391,0.028417,0.020037,0.029991,...,0.018457,0.012404,0.020241,0.022176,0.024576,0.020202,0.024608,0.022146,0.027693,0.024365
Agam_35,0.030351,0.040005,0.028879,0.027429,0.042277,0.027773,0.035980,0.033244,0.026625,0.032929,...,0.028221,0.095586,0.020797,0.022928,0.018298,0.017195,0.016751,0.016078,0.024464,0.031426
Agam_75,0.072059,0.071908,0.064103,0.055267,0.083708,0.047166,0.007912,0.069493,0.041721,0.058901,...,0.034641,0.030646,0.032808,0.041471,0.034981,0.035623,0.038177,0.037830,0.041099,0.039884
Agam_48,0.058602,0.062286,0.066729,0.054859,0.057919,0.045969,0.007158,0.054738,0.045564,0.051171,...,0.045474,0.043780,0.045765,0.046232,0.047628,0.040712,0.045125,0.050913,0.045796,0.045083


Find the median target fraction across all samples.

In [19]:
# Per-amplicon median of the read fractions, taken across the sample columns.
med_read_fractions = fraction_df.median(axis="columns")
med_read_fractions

amplicon
Agam_39    0.000000
Agam_82    0.000000
Agam_81    0.000000
Agam_62    0.000180
Agam_7     0.001211
             ...   
Agam_19    0.027410
Agam_35    0.033191
Agam_75    0.050939
Agam_48    0.053053
Agam_28    0.070450
Length: 82, dtype: float64

And the total sum of the median read fractions...

In [22]:
# Sum of the per-amplicon medians. Medians taken independently per amplicon
# need not add up to 1, which is why the next step rescales by this value.
med_read_fraction_sum = med_read_fractions.sum()
med_read_fraction_sum

0.9348875364563011

Scale it so they all add up to 1. 

In [24]:
# Rescale the median read fractions so that they sum to 1 across amplicons.
scaled_med_read_fractions = med_read_fractions.div(med_read_fraction_sum)
scaled_med_read_fractions

amplicon
Agam_39    0.000000
Agam_82    0.000000
Agam_81    0.000000
Agam_62    0.000193
Agam_7     0.001295
             ...   
Agam_19    0.029319
Agam_35    0.035502
Agam_75    0.054487
Agam_48    0.056748
Agam_28    0.075356
Length: 82, dtype: float64

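Take the scaled median read fraction for each amplicon and raise it to the power of -0.561 (the 'magic number' exponent). Amplicons whose median read fraction is 0 come out as `inf` at this step; they are capped by the clipping step further down. 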

In [25]:
# Raw pool weighting: scaled median read fraction raised to the power -0.561.
# A fraction of exactly 0 gives inf here, which the later clipping step caps.
primer_volumes = scaled_med_read_fractions.pow(-0.561)
primer_volumes

amplicon
Agam_39           inf
Agam_82           inf
Agam_81           inf
Agam_62    121.314801
Agam_7      41.684313
              ...    
Agam_19      7.243237
Agam_35      6.505886
Agam_75      5.116131
Agam_48      5.000755
Agam_28      4.265162
Length: 82, dtype: float64

The pool weightings can be interpreted directly as the volumes to add of each target’s primer pair in a pool. However, to reduce the inaccuracies associated with pipetting small volumes, it is prudent to scale all the weightings so that the minimum weight in the pool is 1 (i.e. the minimum volume pipetted is 1µl). 

In [28]:
# Express every weighting relative to the smallest one, so the minimum is 1.
primer_volumes = primer_volumes.div(primer_volumes.min())
primer_volumes

amplicon
Agam_39          inf
Agam_82          inf
Agam_81          inf
Agam_62    28.443186
Agam_7      9.773207
             ...    
Agam_19     1.698232
Agam_35     1.525355
Agam_75     1.199516
Agam_48     1.172465
Agam_28     1.000000
Length: 82, dtype: float64

Primer pairs which generate only very small read fractions can be overweighted by the primer rebalancing algorithm, which leads to them dominating the reads from the resultant rebalanced pool. Given that very poorly performing primers may hint at a design issue, it has been empirically determined to be prudent to ‘clip’ the maximum pool weighting to 10x the minimum. Given that the minimum pool weight has been scaled to 1, the maximum volume of primer pair that can be added is 10µl.

In [30]:
# Cap the weightings to the range [0, 10]; this also turns the inf values
# produced by zero read fractions into the maximum of 10.
primer_volumes = primer_volumes.clip(lower=0, upper=10)
primer_volumes

amplicon
Agam_39    10.000000
Agam_82    10.000000
Agam_81    10.000000
Agam_62    10.000000
Agam_7      9.773207
             ...    
Agam_19     1.698232
Agam_35     1.525355
Agam_75     1.199516
Agam_48     1.172465
Agam_28     1.000000
Length: 82, dtype: float64

Calculate the sum of the pool weightings and the interquartile mean (IQM) pool weighting. We use the interquartile mean rather than the arithmetic mean so that the result is not affected by clipping any overweighted targets to 10x the minimum.

In [31]:
# Sum of the clipped pool weightings; it is the denominator of the central
# primer concentration calculated further down.
primer_volumes.sum()

328.06275594544843

In [33]:
# 50th percentile (median) of the pool weightings.
# NOTE(review): this is the median, not the interquartile mean; the IQM that
# feeds the concentration calculation is computed with stats.trim_mean later.
np.percentile(primer_volumes, 50)

3.573552228161946

In [35]:
# Re-display the final (scaled and clipped) pool weightings.
# NOTE(review): redundant display cell, no new computation; consider removing.
primer_volumes

amplicon
Agam_39    10.000000
Agam_82    10.000000
Agam_81    10.000000
Agam_62    10.000000
Agam_7      9.773207
             ...    
Agam_19     1.698232
Agam_35     1.525355
Agam_75     1.199516
Agam_48     1.172465
Agam_28     1.000000
Length: 82, dtype: float64

In [42]:
# Re-display the final (scaled and clipped) pool weightings.
# NOTE(review): redundant display cell, no new computation; consider removing.
primer_volumes

amplicon
Agam_39    10.000000
Agam_82    10.000000
Agam_81    10.000000
Agam_62    10.000000
Agam_7      9.773207
             ...    
Agam_19     1.698232
Agam_35     1.525355
Agam_75     1.199516
Agam_48     1.172465
Agam_28     1.000000
Length: 82, dtype: float64

In [48]:
# Interquartile mean (IQM) of the pool weightings: trim_mean with a cut of
# 0.25 discards the lowest and highest quarter of the values and averages the
# middle 50%, so weightings clipped to the maximum do not skew the result.
# `stats` is scipy.stats, imported with the other dependencies in the first cell.
iqm = stats.trim_mean(primer_volumes, 0.25)
iqm

3.4309857561698878

Calculate (IQM pool weighting/sum of weightings) and multiply by 250000nM (the concentration of the primer pairs in the source plate) to obtain the ‘central’ primer concentration for the pool in nM.

In [49]:
# Share of the pooled volume taken by an IQM-weighted primer pair, multiplied
# by the 250,000 nM source-plate concentration -> 'central' concentration in nM.
total_pool_weight = primer_volumes.sum()
central_primer_conc = (iqm / total_pool_weight) * 250_000
central_primer_conc

2614.580361524188

Calculate the dilution factor required to dilute this pool to 40nM working concentration as (central pool concentration/40). 

In [51]:
# Fold-dilution that takes the central pool concentration (nM) down to the
# 40 nM working concentration.
working_conc_nm = 40
dilution_factor_40nm = central_primer_conc / working_conc_nm
dilution_factor_40nm

65.3645090381047

Calculate the volume of diluent to add to the primer pool to reach the 40 nM working concentration. 

In [52]:
# Diluent to add = final diluted volume minus the volume of the undiluted pool.
# Same units as primer_volumes (presumably µl, given the 1µl minimum; confirm).
pool_volume = primer_volumes.sum()
pool_volume * dilution_factor_40nm - pool_volume

21115.598220116353

### Sanity check that low depth = higher required primer volumes

In [68]:
# Put each amplicon's final primer volume next to its total depth, with both
# tables ordered by amplicon name so the rows line up.
a = primer_volumes.to_frame(name='vols').sort_index()
b = tot_per_amplicon.to_frame(name='depth').sort_index()

df = pd.concat([a, b], axis="columns")

In [69]:
# Show volumes alongside total depth: amplicons with low total depth are
# expected to have been assigned high primer volumes.
df

Unnamed: 0_level_0,vols,depth
amplicon,Unnamed: 1_level_1,Unnamed: 2_level_1
Agam_1,3.649756,26942.0
Agam_10,2.276589,65200.0
Agam_11,4.392923,22965.0
Agam_12,2.053565,74757.0
Agam_13,6.234016,10498.0
...,...,...
Agam_8,6.595224,10039.0
Agam_80,3.839870,27824.0
Agam_81,10.000000,533.0
Agam_82,10.000000,527.0
