# Organize variants selected from select_crispey3_library_variants_2020 notebook into pools
The goal is to assign each variant a pool number; that number will be used when assigning barcodes to guide-donors and when grouping the oligos into pools during the final CRISPEY3 library construction.

There are multiple sets going into the final library, and each set will have slightly different grouping and randomization requirements.

Broadly speaking, here are some general principles:
- Keep oligos targeting the same variant together. This way, we control the pool covariate and can better account for guide-specific differences per variant.
- Within each variant set, avoid clustering variants in the same genomic region together. This avoids cloning issues (PCR mispriming, template swaps in Gibson assembly, etc.), and also controls for pool-specific batch effects when comparing fitness effects across variants.
    - This rules out a gene-based grouping approach, which would otherwise make for convenient organization and would let pools be used efficiently for small, gene-specific growth competition experiments.
    - Sets such as the ergosterol pathway set and GxG sets must utilize whole-set variant randomization to avoid pool effects being associated with each gene.
- An alternative approach would be to pool variants by gene. The GxE set is already somewhat grouped by gene: although its variants are currently grouped by condition, genes tend to be tested in specific conditions, so variants in the same gene will inevitably end up in the same few pools. Some randomization might still be possible here, but not if too few pools are available, especially for oligos used in multiple conditions.
- When assigning variants to each pool, take into account the number of oligos generated for each variant so that each pool contains the appropriate number of oligos (118, plus 3 additional technical oligos)

In [1]:
import os, random, vcf
import numpy as np
import pandas as pd

print("Number of available cores:", len(os.sched_getaffinity(0)))

# set working directory
working_dir = "/home/users/rang/crispey3/library_design/Input/"
os.chdir(working_dir)

# setting the random seed
random.seed(1)

Number of available cores: 20


In [2]:
def sort_variants_to_pools(vars_df, assignment_order=None, grouping_col=None, num_of_pools_per_group=None,
                           spillover_assignment=None, max_pool_size=1, first_pool=1):
    """Assign every variant in ``vars_df`` a pool number, balancing oligo counts across pools.

    Groups are processed in ``assignment_order``. Each group owns a consecutive
    block of ``num_of_pools_per_group[group]`` pool numbers, starting at
    ``first_pool`` for the first group and continuing where the previous group
    stopped. Within a group, variants are ordered by descending
    ``num_of_oligos`` (shuffled with a fixed seed among variants of equal
    size) and each is placed into the currently smallest pool of the group.
    A variant that does not fit is handed to the group named in
    ``spillover_assignment``; spillover variants are packed into the first
    pool(s) of the receiving group before that group's own variants.

    Parameters
    ----------
    vars_df : pandas.DataFrame
        One row per variant, indexed by variant ID, with a ``num_of_oligos``
        column. Modified in place: a ``pool_num`` column is written.
    assignment_order : list of str, optional
        Group labels in processing order. A spillover target must appear
        later in this list than the groups that spill into it.
    grouping_col : str, optional
        Column whose value must equal the group label. If not given, a
        variant belongs to a group when its index label contains the group
        label as a substring.
    num_of_pools_per_group : dict, optional
        Number of pools per group. ``0`` sends the whole group to its
        spillover target.
    spillover_assignment : dict, optional
        Maps a group label to the group that receives its overflow.
    max_pool_size : int
        Maximum number of oligos per pool.
    first_pool : int
        Pool number given to the first pool of the first group.

    Returns
    -------
    pandas.DataFrame
        ``vars_df`` itself, with an integer ``pool_num`` column.

    Raises
    ------
    KeyError
        If a group overflows (or has 0 pools) but has no usable spillover
        target, or a group is missing from ``num_of_pools_per_group``.
    IndexError
        If the spillover variants received by a group need more pools than
        the group owns.
    ValueError
        If any variant is left without a pool number.
    """
    # None sentinels instead of mutable default arguments
    assignment_order = [] if assignment_order is None else assignment_order
    num_of_pools_per_group = {} if num_of_pools_per_group is None else num_of_pools_per_group
    spillover_assignment = {} if spillover_assignment is None else spillover_assignment

    # variant IDs handed over to each group by previously processed groups
    spillover_vars = {g: [] for g in assignment_order}

    def spillover_bucket(group):
        # look up the list collecting overflow from `group`, failing with a readable message
        if group not in spillover_assignment:
            raise KeyError("group {!r} has variants that do not fit in its pools, "
                           "but no spillover group is assigned to it".format(group))
        target = spillover_assignment[group]
        if target not in spillover_vars:
            raise KeyError("spillover group {!r} (assigned to {!r}) is not in "
                           "assignment_order".format(target, group))
        return spillover_vars[target]

    # iterate over each group and assign to pools
    for group in assignment_order:
        # if column to group by provided, select variant subset by column. Otherwise, filter by variant ID
        if grouping_col:
            subset = vars_df.loc[vars_df[grouping_col] == group]
        else:
            subset = vars_df.filter(like=group, axis=0)

        # sort variants in descending order of number of oligos, shuffling (fixed seed) among
        # variants with the same number of oligos; this enables even, efficient packing with
        # randomization across pools. Groups are iterated explicitly rather than through
        # groupby().apply() so the result does not depend on how a given pandas version
        # treats the grouping column or the group-key index level.
        shuffled = [same_size.sample(frac=1, random_state=1)
                    for _, same_size in subset.groupby('num_of_oligos')]
        subset = pd.concat(shuffled).iloc[::-1] if shuffled else subset.iloc[:0]

        # get number of pools to use for group
        num_of_pools = num_of_pools_per_group[group]
        # assign all oligos to spillover if no pools set for group
        if num_of_pools == 0:
            spillover_bucket(group).extend(subset.index.tolist())
            continue

        # current number of oligos in each pool of this group
        cur_pool_sizes = [0] * num_of_pools

        # add spillover variants to first pool(s) in group
        # note: packing is done by max-ing out first pool, then moving to next pool, and so on
        if spillover_vars[group]:
            slot = 0
            for v, row in vars_df.loc[spillover_vars[group]].iterrows():
                # move to next pool if current pool would exceed capacity
                if cur_pool_sizes[slot] + row['num_of_oligos'] > max_pool_size:
                    slot += 1
                    if slot >= num_of_pools:
                        raise IndexError("spillover variants assigned to group {!r} do not fit "
                                         "in its {} pool(s)".format(group, num_of_pools))
                vars_df.loc[v, 'pool_num'] = first_pool + slot
                cur_pool_sizes[slot] += row['num_of_oligos']

        # add the group's own variants, always to the currently smallest pool,
        # which keeps pool sizes within the group similar
        for v, row in subset.iterrows():
            i = int(np.argmin(cur_pool_sizes))
            if cur_pool_sizes[i] + row['num_of_oligos'] <= max_pool_size:
                vars_df.loc[v, 'pool_num'] = first_pool + i
                cur_pool_sizes[i] += row['num_of_oligos']
            else:
                # even the smallest pool is too full: hand variant to the spillover group
                spillover_bucket(group).append(v)

        # once assigned, advance first_pool for next group
        first_pool += num_of_pools

    # fail with a readable message instead of a NaN-to-int cast error
    if 'pool_num' not in vars_df.columns:
        vars_df['pool_num'] = np.nan
    unassigned = vars_df.index[vars_df['pool_num'].isna()]
    if len(unassigned) > 0:
        raise ValueError("{} variant(s) were not assigned to any pool, e.g. {}".format(
            len(unassigned), list(unassigned[:5])))

    vars_df['pool_num'] = vars_df['pool_num'].astype(int)

    return vars_df

## Ergosterol set

### We should be able to fit all ergosterol variants in 34 pools
- ERG: 21 pools, 117-118 oligos each
- EGE: 4 pools, 117-118 oligos each
- EGD: 1 pool, 118 oligos each
- EGC: 2 pools, 110-111 oligos each
- EGB: 1 pool, 117 oligos each
- EGA: 1 pool, 117 oligos each
- EG9: 1 pool, 118 oligos each
- EG8: 3 pools, 113-114 oligos each

See code for spillover assignment

### Variant ID prefix and which genomes they can be edited in:
- ERG: BY | RM | YJM | YPS
- EGE: BY | RM | YJM | ---
- EGD: BY | RM | --- | YPS
- EGC: BY | RM | --- | ---
- EGB: BY | --- | YJM | YPS
- EGA: BY | --- | YJM | ---
- EG9: BY | --- | --- | YPS
- EG8: BY | --- | --- | ---

In [3]:
# load the final ergosterol variant table (first column becomes the index) with integer oligo counts
ergosterol_vars_file = "/home/users/rang/scratch/yeast/ergosterol/ergosterol_variants_final.txt"
ergosterol_vars = pd.read_csv(ergosterol_vars_file, sep='\t', index_col=0).astype({'num_of_oligos': int})

# assign ergosterol variants to pools 1 onwards; groups are matched by variant ID prefix
ergosterol_vars = sort_variants_to_pools(
    vars_df=ergosterol_vars,
    assignment_order=['ERG', 'EGE', 'EGD', 'EGC', 'EGB', 'EGA', 'EG9', 'EG8'],
    num_of_pools_per_group={'ERG': 21, 'EGE': 4, 'EGD': 1, 'EGC': 2,
                            'EGB': 1, 'EGA': 1, 'EG9': 1, 'EG8': 3},
    spillover_assignment={'ERG': 'EGE', 'EGE': 'EGC', 'EGD': 'EGC', 'EGC': 'EG8',
                          'EGB': 'EGA', 'EGA': 'EG8', 'EG9': 'EG8'},
    max_pool_size=118,
    first_pool=1,
)

display(ergosterol_vars)
# sanity check: total oligos per pool, then overall totals
display(ergosterol_vars.groupby('pool_num')['num_of_oligos'].sum())
print("Total number of oligos:", ergosterol_vars['num_of_oligos'].sum())
print("Total number of variants:", ergosterol_vars.shape[0])


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERG_00003,V,237122,G,A,9,2022,0.004451,3,2382|1631|1338|448|2910,unidirectional_promoter,...,SAH1,YER043C,upstream_gene_variant,3.0,ERG28,YER044C,downstream_gene_variant,448.0,2,13
ERG_00004,V,237136,T,C,26,2022,0.013000,17,2396|1645|1324|434|2896,unidirectional_promoter,...,SAH1,YER043C,upstream_gene_variant,17.0,ERG28,YER044C,downstream_gene_variant,434.0,2,21
EGE_00008,V,237191,T,TG,661,2022,0.327000,72,2451|1700|1268|378|2840,unidirectional_promoter,...,SAH1,YER043C,upstream_gene_variant,72.0,ERG28,YER044C,downstream_gene_variant,378.0,2,25
EGE_00009,V,237191,T,TA,6,2022,0.002967,72,2451|1700|1268|378|2840,unidirectional_promoter,...,SAH1,YER043C,upstream_gene_variant,72.0,ERG28,YER044C,downstream_gene_variant,378.0,2,22
EGE_00010,V,237196,A,T,69,2022,0.034000,77,2456|1705|1264|374|2836,unidirectional_promoter,...,SAH1,YER043C,upstream_gene_variant,77.0,ERG28,YER044C,downstream_gene_variant,374.0,2,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EGA_00039,XVI,499649,G,A,4,2022,0.001978,3868|16,4690|1026|357|1930|4403,unidirectional_promoter,...,ERG10,YPL028W,downstream_gene_variant,357.0,SMA1,YPL027W,upstream_gene_variant,16.0,2,30
EGA_00040,XVI,499650,G,C,5,2022,0.002473,3867|15,4689|1025|358|1931|4404,unidirectional_promoter,...,ERG10,YPL028W,downstream_gene_variant,358.0,SMA1,YPL027W,upstream_gene_variant,15.0,2,30
EGA_00041,XVI,499655,G,A,4,2022,0.001978,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0,2,30
EGA_00042,XVI,499655,G,T,35,2022,0.017000,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0,2,32


pool_num
1     118
2     118
3     118
4     118
5     118
6     118
7     118
8     118
9     117
10    117
11    117
12    117
13    117
14    117
15    117
16    117
17    117
18    117
19    117
20    117
21    117
22    118
23    118
24    118
25    117
26    118
27    110
28    111
29    117
30    117
31    118
32    114
33    114
34    113
Name: num_of_oligos, dtype: int64

Total number of oligos: 3968
Total number of variants: 1580


## GXG set

### We should be able to fit all GxG variants in 43 pools
- GXG: 43 pools, 114-116 oligos each

### Variant ID prefix and which genomes they can be edited in:
- GXG: BY | RM | YJM | YPS


In [4]:
# load the final GxG variant table (first column becomes the index) with integer oligo counts
gxg_vars_file = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/gxg_variants_final.txt"
gxg_vars = pd.read_csv(gxg_vars_file, sep='\t', index_col=0).astype({'num_of_oligos': int})

# assign gxg variants to pools: a single group spread over 43 pools starting at pool 35
gxg_vars = sort_variants_to_pools(
    vars_df=gxg_vars,
    assignment_order=['GXG'],
    num_of_pools_per_group={'GXG': 43},
    spillover_assignment={},
    max_pool_size=118,
    first_pool=35,
)

display(gxg_vars)
# sanity check: total oligos per pool, then overall totals
display(gxg_vars.groupby('pool_num')['num_of_oligos'].sum())
print("Total number of oligos:", gxg_vars['num_of_oligos'].sum())
print("Total number of variants:", gxg_vars.shape[0])


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos,assoc_gene,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00321,I,101530,G,C,4,2022,0.001978,385|1833,1662|4742|35,unidirectional_promoter,...,YAL025C,upstream_gene_variant,385.0,LTE1,YAL024C,downstream_gene_variant,35.0,3,"['YAL024C', 'YAL025C']",68
GXG_00329,I,101816,C,T,4,2020,0.001980,671|2119,1948|4456,missense_variant,...,,,,,,,,4,['YAL024C'],37
GXG_00332,I,101901,C,A,90,2022,0.045000,756|2204,2033|4371,synonymous_variant,...,,,,,,,,2,['YAL024C'],50
GXG_00339,I,102261,C,T,78,2020,0.039000,1116|2564,2393|4011,synonymous_variant,...,,,,,,,,2,['YAL024C'],76
GXG_00346,I,102699,G,A,310,2016,0.154000,1554|3002,2831|3573,synonymous_variant,...,,,,,,,,3,['YAL024C'],41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50437,XVI,899389,G,A,6,2018,0.002973,3625|805|1366|2655,2385,synonymous_variant,...,,,,,,,,2,['YPR181C'],72
GXG_50440,XVI,899419,A,G,196,2014,0.097000,3655|775|1336|2625,2415,synonymous_variant,...,,,,,,,,3,['YPR181C'],66
GXG_50446,XVI,899521,G,C,3,2018,0.001487,3757|673|1234|2523,2517,synonymous_variant,...,,,,,,,,2,['YPR181C'],43
GXG_50447,XVI,899679,A,G,68,2010,0.034000,3915|12|515|1076|2365,2675,bidirectional_promoter,...,YPR181C,upstream_gene_variant,12.0,SMX3,YPR182W,upstream_gene_variant,515.0,4,['YPR181C'],47


pool_num
35    115
36    115
37    115
38    115
39    115
40    115
41    115
42    115
43    115
44    115
45    115
46    116
47    116
48    116
49    116
50    116
51    116
52    116
53    116
54    116
55    116
56    116
57    116
58    116
59    116
60    116
61    116
62    116
63    116
64    116
65    116
66    116
67    116
68    114
69    114
70    114
71    114
72    114
73    114
74    114
75    114
76    114
77    114
Name: num_of_oligos, dtype: int64

Total number of oligos: 4957
Total number of variants: 2000


## Epistasis validation set

### We should be able to fit known effects variants in 6 pools
- TDH, VAL: 6 pools, 115-116 oligos each

### Variant ID prefix and which genomes they can be edited in:
- TDH: BY | --- | --- | ---
- VAL: BY | RM | YJM | YPS

In [5]:
epival_vars_file = "/home/users/rang/scratch/yeast/epistasis/epival_variants_final.txt" # also VCF file epival_variants_final.vcf
# the input table may hold several rows per var_id (presumably one per oligo - TODO confirm)
epival_vars = pd.read_csv(epival_vars_file, sep='\t')

# calculate number of oligos per variant (= number of rows sharing a var_id) and add info to column
epival_vars['num_of_oligos'] = epival_vars['var_id'].map(epival_vars.groupby('var_id').size())
# convert epival_vars to simplified dataframe aligned with the other sets, one variant per row.
# De-duplicate BEFORE moving var_id into the index: drop_duplicates() ignores the index, so
# doing it afterwards would merge distinct variant IDs whose remaining columns are identical.
epival_vars = (epival_vars[['var_id', 'chrom', 'SNP_chr_pos', 'REF', 'ALT', 'num_of_oligos']]
               .rename(columns={'chrom': 'CHROM', 'SNP_chr_pos': 'POS'})
               .drop_duplicates()
               .set_index('var_id'))
# strip the first and last character of ALT (presumably enclosing brackets - TODO confirm)
epival_vars['ALT'] = epival_vars['ALT'].apply(lambda x: x[1:-1])

# assign epival variants to pools: TDH has no pools of its own, so all TDH variants
# spill over into the first VAL pool(s)
epival_vars = sort_variants_to_pools(vars_df = epival_vars,
                                     assignment_order = ['TDH', 'VAL'],
                                     num_of_pools_per_group = {'TDH':0, 'VAL':6},
                                     spillover_assignment = {'TDH':'VAL'},
                                     max_pool_size = 118, first_pool = 78)

display(epival_vars)
# check number of oligos in each pool
display(epival_vars.groupby('pool_num').num_of_oligos.sum())
print("Total number of oligos:", epival_vars.num_of_oligos.sum())
print("Total number of variants:", len(epival_vars))


Unnamed: 0_level_0,CHROM,POS,REF,ALT,num_of_oligos,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TDH_00001,VII,884068,C,T,2,78
TDH_00002,VII,884073,G,A,2,78
TDH_00003,VII,884105,C,T,3,78
TDH_00004,VII,884107,G,A,3,78
TDH_00005,VII,884108,G,A,3,78
...,...,...,...,...,...,...
VAL_00239,XVI,501421,T,C,3,78
VAL_00240,XVI,563805,G,A,2,80
VAL_00241,XVI,798797,A,C,2,79
VAL_00242,XVI,820797,C,T,4,81


pool_num
78    116
79    115
80    115
81    115
82    115
83    116
Name: num_of_oligos, dtype: int64

Total number of oligos: 692
Total number of variants: 257


## GXE set

### We should be able to fit all GxE variants in 94 pools
Oligos will be grouped according to condition (see assoc_traits_str)
- GXE: 94 pools


In [6]:
# load the final GxE variant table (first column becomes the index) and make oligo counts integers
gxe_vars_file = "/home/users/rang/scratch/yeast/GxE/gxe_variants_final.txt"
gxe_vars = pd.read_csv(gxe_vars_file, sep='\t', index_col=0)
gxe_vars['num_of_oligos'] = gxe_vars['num_of_oligos'].astype(int)

# assign gxe variants to pools
# - groups are matched by exact equality on the 'assoc_traits_str' column (grouping_col)
# - the three tables below must use identical key strings
# - pool counts below add up to 94, so with first_pool = 84 this set occupies pools 84-177
# - order matters: a group's spillover target must be processed after the group itself
gxe_vars = sort_variants_to_pools(vars_df = gxe_vars, 
                                  assignment_order = ['Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2',
                                                      'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Fructose;;1',
                                                      'Caffeine;15mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2',
                                                      'Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg/mL;2',
                                                      'Lithium_Chloride;100mM;2_Neomycin;5mg/mL;2',
                                                      'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2',
                                                      'Caffeine;15mM;2_Cobalt_Chloride;2mM;2',
                                                      'Caffeine;15mM;2_Neomycin;5mg/mL;2',
                                                      'Cobalt_Chloride;2mM;2_Fructose;;1',
                                                      'Fluconazole;100uM;2_Fructose;;1',
                                                      'Caffeine;15mM;2_Fructose;;1',
                                                      'Lithium_Chloride;100mM;2',
                                                      'Cobalt_Chloride;2mM;2',
                                                      'Fluconazole;100uM;2',
                                                      'Neomycin;5mg/mL;2',
                                                      'Fructose;;1',
                                                      'Caffeine;15mM;2'],
                                  
                                  grouping_col = 'assoc_traits_str',
                                  
                                  # a count of 0 sends the whole group to its spillover target
                                  num_of_pools_per_group = {'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2':3,
                                                            'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Fructose;;1':0,
                                                            'Caffeine;15mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2':3,
                                                            'Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg/mL;2':1,
                                                            'Lithium_Chloride;100mM;2_Neomycin;5mg/mL;2':1,
                                                            'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2':1,
                                                            'Caffeine;15mM;2_Cobalt_Chloride;2mM;2':1,
                                                            'Caffeine;15mM;2_Neomycin;5mg/mL;2':0,
                                                            'Cobalt_Chloride;2mM;2_Fructose;;1':1,
                                                            'Fluconazole;100uM;2_Fructose;;1':0,
                                                            'Caffeine;15mM;2_Fructose;;1':0,
                                                            'Lithium_Chloride;100mM;2':14,
                                                            'Cobalt_Chloride;2mM;2':14,
                                                            'Fluconazole;100uM;2':13,
                                                            'Neomycin;5mg/mL;2':10,
                                                            'Fructose;;1':13,
                                                            'Caffeine;15mM;2':19},
                                                            
                                  # group -> group receiving its overflow; groups without an entry here
                                  # must fit entirely in their own pools, otherwise the lookup fails
                                  spillover_assignment = {'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2':'Neomycin;5mg/mL;2',
                                                          'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Fructose;;1':'Cobalt_Chloride;2mM;2',
                                                          'Caffeine;15mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2':'Neomycin;5mg/mL;2',
                                                          'Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg/mL;2':'Cobalt_Chloride;2mM;2',
                                                          'Lithium_Chloride;100mM;2_Neomycin;5mg/mL;2':'Lithium_Chloride;100mM;2',
                                                          'Cobalt_Chloride;2mM;2_Fluconazole;100uM;2':'Fluconazole;100uM;2',
                                                          'Caffeine;15mM;2_Neomycin;5mg/mL;2':'Neomycin;5mg/mL;2',
                                                          'Cobalt_Chloride;2mM;2_Fructose;;1':'Cobalt_Chloride;2mM;2',
                                                          'Fluconazole;100uM;2_Fructose;;1':'Fluconazole;100uM;2',
                                                          'Caffeine;15mM;2_Fructose;;1':'Caffeine;15mM;2',
                                                          'Lithium_Chloride;100mM;2':'Caffeine;15mM;2',
                                                          'Fructose;;1':'Caffeine;15mM;2'},
                                  max_pool_size = 118, first_pool = 84)

display(gxe_vars)
# check number of oligos in each pool
display(gxe_vars.groupby('pool_num').num_of_oligos.sum())
print("Total number of oligos:", gxe_vars.num_of_oligos.sum())
print("Total number of variants:", len(gxe_vars))


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,ACS,ADE,ADF,ADG,ADI,ADR,SACE_MAA,assoc_traits,assoc_traits_str,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...,85
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,0/0,,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...,84
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,0/0,0/0,0/0,0/0,1/1,1/1,0/0,"['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...,85
GXE_00027,I,192429,G,C,1506,1998,0.754000,12|190,4697|3618|173,bidirectional_promoter,...,1/1,,1/1,1/1,1/1,1/1,0/0,"['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...,86
GXE_00033,I,192466,AAAGGG,A,3,2020,0.001485,50|148,4735|3656|211,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17166,XVI,375077,T,C,3,2022,0.001484,425|4414,4043|2214|2156,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,109
GXE_17167,XVI,375078,C,A,8,2020,0.003960,424|4415,4042|2213|2157,stop_gained,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,109
GXE_17168,XVI,375080,G,A,31,2022,0.015000,422|4417,4040|2211|2159,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17188,XVI,375331,C,A,8,2020,0.003960,171|4668,3789|1960|162|2410,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90


pool_num
84     118
85     118
86     117
87     118
88     118
      ... 
173    117
174    117
175    117
176    118
177    118
Name: num_of_oligos, Length: 94, dtype: int64

Total number of oligos: 10983
Total number of variants: 4384


In [21]:
# inspect the GxE variants assigned to a single pool (here pool 90);
# the previous cell body was a stray undefined name that raises NameError on re-run
gxe_vars.query('pool_num==90')

Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,ACS,ADE,ADF,ADG,ADI,ADR,SACE_MAA,assoc_traits,assoc_traits_str,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_17015,XVI,373309,C,T,5,2022,0.002473,2193|484|2646|3868,3982|388,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17016,XVI,373313,C,T,6,2022,0.002967,2189|480|2650|3872,3978|392,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17017,XVI,373314,G,C,6,2018,0.002973,2188|479|2651|3873,3977|393,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17018,XVI,373318,T,A,18,2022,0.008902,2184|475|2655|3877,3973|397,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17033,XVI,373506,T,C,10,2022,0.004946,1996|287|2843|4065,3785|585,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17034,XVI,373512,A,G,35,2020,0.017,1990|281|2849|4071,3779|591,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17042,XVI,373604,C,G,6,2022,0.002967,1898|189|2941|4163,3687|683,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17053,XVI,373707,C,T,98,2020,0.049,1795|86|3044|4266,3584|786,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17055,XVI,373710,G,A,10,2010,0.004975,1792|83|3047|4269,3581|789,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90
GXE_17058,XVI,373805,T,C,5,2022,0.002473,1697|3142|4364,3486|884,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neom...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...,90


## HSP90 set

### We should be able to fit all HSP90 variants in 7 pools
- HSP+HSX: 7 pools, 111-113 oligos each


In [7]:
# load the final HSP90 variant table (first column becomes the index) with integer oligo counts
hsp90_vars_file = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_final.txt"
hsp90_vars = pd.read_csv(hsp90_vars_file, sep='\t', index_col=0).astype({'num_of_oligos': int})

# assign hsp90 variants to pools: every variant ID containing 'HS' forms one group
# spread over 7 pools starting at pool 178
hsp90_vars = sort_variants_to_pools(
    vars_df=hsp90_vars,
    assignment_order=['HS'],
    num_of_pools_per_group={'HS': 7},
    spillover_assignment={},
    max_pool_size=118,
    first_pool=178,
)

display(hsp90_vars)
# sanity check: total oligos per pool, then overall totals
display(hsp90_vars.groupby('pool_num')['num_of_oligos'].sum())
print("Total number of oligos:", hsp90_vars['num_of_oligos'].sum())
print("Total number of variants:", hsp90_vars.shape[0])


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0,2,179
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0,2,179
HSP_00006,XIII,631979,G,A,12,2018,0.005946,4171|376,3194|9|2710|4312,unidirectional_promoter,...,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,376.0,2,182
HSP_00007,XIII,631980,C,T,53,2018,0.026000,4172|375,3195|10|2709|4311,unidirectional_promoter,...,RTP1,YMR185W,downstream_gene_variant,10.0,HSC82,YMR186W,upstream_gene_variant,375.0,2,182
HSP_00008,XIII,631981,A,G,25,2020,0.012000,4173|374,3196|11|2708|4310,unidirectional_promoter,...,RTP1,YMR185W,downstream_gene_variant,11.0,HSC82,YMR186W,upstream_gene_variant,374.0,2,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSX_00327,XVI,98320,T,G,0,1996,0.000000,4864|2128|1116|2109|3258,3240,frameshift_variant,...,,,,,,,,,2,178
HSX_00329,XVI,98335,G,A,0,2012,0.000000,4819|2083|1071|2155|3304,3195,missense_variant,...,,,,,,,,,3,180
HSX_00330,XVI,98335,G,C,0,2012,0.000000,4818|2082|1070|2155|3304,3194,frameshift_variant,...,,,,,,,,,3,179
HSX_00334,XVI,98367,C,CT,0,2016,0.000000,4729|1993|981|2245|3394,3105,synonymous_variant,...,,,,,,,,,2,180


pool_num
178    113
179    111
180    111
181    111
182    111
183    112
184    112
Name: num_of_oligos, dtype: int64

Total number of oligos: 781
Total number of variants: 332


## Humanized yeast pilot

### We should be able to fit humanized yeast variants in 6 pools
Assign one pool per human gene to study
- DHFR: 185
- DPAGT1: 186
- NSDHL: 187
- PGK1: 188
- PKLR: 189
- UROS: 190

In [8]:
# one fixed pool per human gene; the gene is the part of the variant ID before the first '_'
gene_to_pool = {'DHFR':185,
                'DPAGT1':186,
                'NSDHL':187,
                'PGK1':188,
                'PKLR':189,
                'UROS':190}

# read in all_SNPs_humanized_GG_9bp_OLIGO.tab, subset to variants of interest, count oligos and assign to pools

humanized_vars_file = "/home/users/rang/crispey3/humanized/Output/all_SNPs_humanized_combined_GG_9bp_OLIGO.tab"
humanized_vars = pd.read_csv(humanized_vars_file, sep='\t')

# calculate number of oligos per variant (= number of rows sharing a var_id) and add info to column
humanized_vars['num_of_oligos'] = humanized_vars['var_id'].map(humanized_vars.groupby('var_id').size())
# convert humanized_vars to simplified dataframe aligned with the other sets, one variant per row.
# De-duplicate BEFORE moving var_id into the index: drop_duplicates() ignores the index, so
# doing it afterwards would merge distinct variant IDs whose remaining columns are identical.
humanized_vars = (humanized_vars[['var_id', 'chrom', 'SNP_chr_pos', 'REF', 'ALT', 'num_of_oligos']]
                  .rename(columns={'chrom': 'CHROM', 'SNP_chr_pos': 'POS'})
                  .drop_duplicates()
                  .set_index('var_id'))
# strip the first and last character of ALT (presumably enclosing brackets - TODO confirm)
humanized_vars['ALT'] = humanized_vars['ALT'].apply(lambda x: x[1:-1])

# assign humanized variants to pools by variant prefix, which indicates the gene of interest
# (no capacity check is done here; pool sizes are only inspected via the summary below)
humanized_vars['pool_num'] = [gene_to_pool[v.split('_')[0]] for v in humanized_vars.index]

display(humanized_vars)
# check number of oligos in each pool
display(humanized_vars.groupby('pool_num').num_of_oligos.sum())
print("Total number of oligos:", humanized_vars.num_of_oligos.sum())
print("Total number of variants:", len(humanized_vars))


Unnamed: 0_level_0,CHROM,POS,REF,ALT,num_of_oligos,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DHFR_10007,DHFR,457,A,T,2,185
DHFR_10008,DHFR,237,C,T,1,185
DHFR_10104,DHFR,186,G,C,1,185
DHFR_00019,chr5,79929796,G,C,3,185
DHFR_00059,chr5,79950223,C,T,4,185
...,...,...,...,...,...,...
UROS_00009,chr10,127477478,T,C,4,190
UROS_00010,chr10,127477480,G,C,4,190
UROS_00012,chr10,127477484,G,C,3,190
UROS_00020,chr10,127477526,G,A,4,190


pool_num
185     70
186    118
187    118
188    118
189    118
190    118
Name: num_of_oligos, dtype: int64

Total number of oligos: 660
Total number of variants: 223


## This leaves the following sets to assign to pools:
- Fitness ladder (ladder oligo, synonymous donor+guide, scramble donor+guide): 191
- Neutrals (CRISPEY1 neutral, donor+sgGFP, ladder scrambled donor+sgGFP): 192

# Write pool assignment of all variants to single file for reference

In [9]:
# columns kept from every variant set in the combined reference table
output_columns = ['CHROM', 'POS', 'REF', 'ALT', 'num_of_oligos', 'pool_num']

# stack the per-set tables in the same order as their pool number ranges
out_df = pd.concat([variant_set[output_columns]
                    for variant_set in (ergosterol_vars, gxg_vars, epival_vars,
                                        gxe_vars, hsp90_vars, humanized_vars)])
display(out_df)

# tab-separated output, written relative to the current working directory
out_df.to_csv('crispey3_vars_pool_assignment.txt', sep='\t')

Unnamed: 0_level_0,CHROM,POS,REF,ALT,num_of_oligos,pool_num
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ERG_00003,V,237122,G,A,2,13
ERG_00004,V,237136,T,C,2,21
EGE_00008,V,237191,T,TG,2,25
EGE_00009,V,237191,T,TA,2,22
EGE_00010,V,237196,A,T,2,22
...,...,...,...,...,...,...
UROS_00009,chr10,127477478,T,C,4,190
UROS_00010,chr10,127477480,G,C,4,190
UROS_00012,chr10,127477484,G,C,3,190
UROS_00020,chr10,127477526,G,A,4,190


In [10]:
# how many pools hold each total number of oligos
out_df.groupby('pool_num')['num_of_oligos'].sum().value_counts()

117    63
116    40
118    39
115    25
114    12
111     5
113     2
112     2
110     1
70      1
Name: num_of_oligos, dtype: int64

# Assemble all variants (except humanized yeast set) from separate VCF files into single VCF for oligo design pipeline

In [11]:
lib_design_vcf_file = 'all_variants_design_oligos.vcf'
# the GxG design VCF is used as the template passed to the writer
template_vcf_file = '/home/users/rang/crispey3/initial_design/Input/gxg_variants_design_oligos_initial.vcf'
template_vcf = vcf.Reader(filename=template_vcf_file)

# per-set VCFs to merge; the humanized yeast set is not included
vcf_paths_list = ['/home/users/rang/crispey3/initial_design/Input/ergosterol_variants_design_oligos_initial.vcf',
                  '/home/users/rang/crispey3/initial_design/Input/gxg_variants_design_oligos_initial.vcf',
                  '/home/users/rang/crispey3/initial_design/Input/epival_variants_final.vcf',
                  '/home/users/rang/crispey3/initial_design/Input/gxe_variants_design_oligos_initial.vcf',
                  '/home/users/rang/scratch/yeast/hsp90/hsp90_variants_final.vcf']

var_num = 0
# open output file in a context manager so the handle is closed even if reading or writing fails
with open(lib_design_vcf_file, 'w') as lib_design_vcf_handle:
    lib_design_vcf = vcf.Writer(lib_design_vcf_handle, template_vcf)

    for variants_vcf_path in vcf_paths_list:
        variants = vcf.Reader(filename=variants_vcf_path)
        for record in variants:
            # keep only records whose ID received a pool assignment
            if record.ID in out_df.index:
                var_num += 1
                lib_design_vcf.write_record(record)

                # progress report
                if var_num % 500 == 0:
                    print(var_num, 'variants written to file')

print('Total number of variants in design VCF:', var_num)

500 variants written to file
1000 variants written to file
1500 variants written to file
2000 variants written to file
2500 variants written to file
3000 variants written to file
3500 variants written to file
4000 variants written to file
4500 variants written to file
5000 variants written to file
5500 variants written to file
6000 variants written to file
6500 variants written to file
7000 variants written to file
7500 variants written to file
8000 variants written to file
8500 variants written to file
Total number of variants in design VCF: 8553


In [19]:
# Print the total number of oligos for each (pool_num, assoc_traits) combination in gxe_vars.
oligos_by_pool_and_traits = gxe_vars.groupby(['pool_num', 'assoc_traits'])['num_of_oligos'].sum()
for pool_and_traits, oligo_total in oligos_by_pool_and_traits.items():
    print(pool_and_traits, oligo_total)

(84, "['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 118
(85, "['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 118
(86, "['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 117
(87, "['Caffeine;15mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 118
(88, "['Caffeine;15mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 118
(89, "['Caffeine;15mM;2', 'Fluconazole;100uM;2', 'Neomycin;5mg/mL;2']") 117
(90, "['Cobalt_Chloride;2mM;2', 'Fructose;;1', 'Neomycin;5mg/mL;2']") 118
(91, "['Lithium_Chloride;100mM;2', 'Neomycin;5mg/mL;2']") 118
(92, "['Cobalt_Chloride;2mM;2', 'Fluconazole;100uM;2']") 117
(93, "['Caffeine;15mM;2', 'Cobalt_Chloride;2mM;2']") 117
(94, "['Cobalt_Chloride;2mM;2', 'Fructose;;1']") 118
(95, "['Lithium_Chloride;100mM;2', 'Neomycin;5mg/mL;2']") 28
(95, "['Lithium_Chloride;100mM;2']") 90
(96, "['Lithium_Chloride;100mM;2']") 118
(97, "['Lithium_Chloride;100mM;2']") 118
(98, "['Lithium_Chloride;100mM;2']")