# Variant selection for CRISPEY3 library

In [1]:
import os, vcf, re
import multiprocessing as mp
from Bio import SeqIO
from collections import OrderedDict, namedtuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# report how many CPU cores this process is allowed to run on
print(f"Number of available cores: {len(os.sched_getaffinity(0))}")

Number of available cores: 20


# Functions

In [2]:
def select_variants(genes_df, vcf_file, chrom_dict, upstream_length=500, downstream_length=500):
    '''
    given a dataframe of genes of interest, searches vcf for variants in and around that region
    can be parallelized by splitting genes and vcf by chromosome

    genes_df:          DataFrame with 'chrom', 'start', 'end' and 'strand' ('+' or '-') columns
    vcf_file:          path passed to vcf.Reader; records are retrieved with Reader.fetch()
    chrom_dict:        maps chromosome names used in genes_df to the names used in the VCF
    upstream_length:   bases to add before the gene start (relative to the gene's strand)
    downstream_length: bases to add after the gene end (relative to the gene's strand)

    returns a pandas Series of de-duplicated VCF records whose sample calls and FORMAT
    field have been cleared
    raises ValueError if any gene has a strand other than '+' or '-'
    '''
    # sort_values returns a new frame, so the caller's dataframe is never modified
    genes_df = genes_df.sort_values('start')

    on_plus = genes_df['strand'] == '+'
    on_minus = genes_df['strand'] == '-'
    if not (on_plus | on_minus).all():
        raise ValueError("genes_df['strand'] must be '+' or '-' for every gene")

    # "upstream" lies to the left of a '+' gene but to the right of a '-' gene
    left_flank = np.where(on_plus, upstream_length, downstream_length)
    right_flank = np.where(on_plus, downstream_length, upstream_length)
    # build integer Series directly: assigning through .loc[:, col] keeps the existing
    # float dtype on recent pandas, which would hand float coordinates to fetch()
    search_starts = (genes_df['start'] - left_flank).astype(int)
    search_ends = (genes_df['end'] + right_flank).astype(int)

    # read in vcf file to parse for variants
    allVars = vcf.Reader(filename=vcf_file)
    candVars = []
    for chrom, start, end in zip(genes_df['chrom'], search_starts, search_ends):
        # same start-1 offset as before, clamped so genes close to the chromosome start
        # do not produce a negative coordinate
        retrieve = allVars.fetch(chrom=chrom_dict[chrom],
                                 start=max(int(start) - 1, 0),
                                 end=int(end))
        for rec in retrieve:
            # drop per-sample genotype data; only the site-level record is kept
            rec.samples = []
            rec.FORMAT = None
            candVars.append(rec)
    # overlapping search regions can return the same record more than once
    candVars = pd.Series(candVars).drop_duplicates().reset_index(drop=True)
    return candVars

def ngg_check(sequence, threshold):
    '''
    checks a Bio.Seq object for an NGG motif around the center of the sequence

    the sequence and its reverse complement are each sliced from two positions before
    the midpoint up to `threshold` positions after it, and the slice is searched
    (case-insensitively) for any base followed by 'GG'

    threshold: size of window to search for NGG (recommended: 10)
    returns True if one or more NGG is found on either strand, otherwise False
    '''
    centre = (len(sequence) - 1) // 2
    lo, hi = centre - 2, centre + threshold
    strands = (sequence, sequence.reverse_complement())
    return any(re.search('.GG', str(strand[lo:hi]), re.I) for strand in strands)

def search_genome(sequence, genome):
    '''
    counts matches of a Bio.Seq sequence in a genome (dictionary of SeqRecords)

    the sequence and its reverse complement are both used as case-insensitive regex
    patterns against every scaffold, so hits on forward and reverse strands are counted;
    matches are counted without overlap, as re.finditer reports them
    returns the total number of occurrences across all scaffolds
    '''
    # compile both strand patterns once instead of once per scaffold
    strand_patterns = [re.compile(str(strand), re.I)
                       for strand in (sequence, sequence.reverse_complement())]
    total = 0
    for scaffold in genome.values():
        scaffold_text = str(scaffold.seq)
        for pattern in strand_patterns:
            total += sum(1 for _ in pattern.finditer(scaffold_text))
    return total

def filter_targetable_variants(candVars, genomes_list, threshold, window):
    '''
    Takes a list of VCF records, a list of genomes, a threshold for searching NGGs, and a window size for
    flanking sequences, filters and sorts records into two lists, whether they are targetable in all genomes
    or just in the reference genome.

    candVars:     iterable of VCF records (uses record.CHROM and record.POS)
    genomes_list: list of genomes (dictionaries of SeqRecords); the first entry is the reference genome
    threshold:    window size passed to ngg_check
    window:       length of the sequence extracted around the variant; should be odd, because the
                  extracted slice always has 2*(window//2)+1 bases and records whose slice length
                  differs from `window` are skipped

    returns (candVars_allStrains, candVars_refOnly)
    '''
    ref_gen = genomes_list[0]
    half_window = window // 2
    candVars_allStrains = []
    candVars_refOnly = []

    for record in candVars:
        ##### FILTERING #####
        # 1. Skip variant if near chromosome edge (the slice comes back shorter than window)
        seq_window = ref_gen[record.CHROM][record.POS - half_window - 1:record.POS + half_window]
        if len(seq_window) != window:
            continue

        # 2. is there an 'NGG' sequence around the center of the seq_window?
        if not ngg_check(sequence=seq_window.seq, threshold=threshold):
            continue

        # 3. sequence flanking the variant must be a unique hit in the reference genome
        if search_genome(seq_window.seq, ref_gen) != 1:
            continue

        # 4. store variant according to whether it is also a unique hit in every other genome
        #    (all() stops at the first genome that fails, like the original early break)
        unique_in_all = all(search_genome(seq_window.seq, gen) == 1 for gen in genomes_list[1:])
        if unique_in_all:
            candVars_allStrains.append(record)
        else:
            candVars_refOnly.append(record)

    return (candVars_allStrains, candVars_refOnly)

def find_targetable_variants(candVars, genomes_list, threshold, window):
    """
    UPGRADED FROM filter_targetable_variants
    Takes a list of VCF records, a list of genomes (at least 1), a threshold for searching NGGs, and a window size for search sequence.
    The first genome in the list is used to extract the search sequence around each variant.
    Records are dropped when the extracted sequence is not exactly `window` long or has no NGG near its center.
    Adds a list of integers (0/1) under "TGT" field in VCF INFO section, one entry per genome provided:
    1 when the search sequence has exactly one match in that genome, otherwise 0.
    Records with no unique match in any genome are reported with print() and left out.
    Returns the list of records that received a TGT code.
    NOTE: REMEMBER TO ADD "TGT" INFO TO VCF HEADER BEFORE WRITING RECORDS TO FILE
    header_info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
    header_info_tgt = header_info('TGT', '.', 'Integer', 'Targetability of variant in tested genomes', None, None)
    """
    ref_gen = genomes_list[0]
    half_window = window // 2

    record_list = []
    for record in candVars:
        ##### FILTERING #####
        # 1. variant too close to a chromosome edge gives a slice of the wrong length
        seq_window = ref_gen[record.CHROM][record.POS - half_window - 1:record.POS + half_window]
        if len(seq_window) != window:
            continue

        # 2. require an 'NGG' around the center of seq_window
        if not ngg_check(sequence=seq_window.seq, threshold=threshold):
            continue

        # 3. one flag per genome: 1 for a unique match of seq_window, else 0
        tgt_code = [1 if search_genome(seq_window.seq, gen) == 1 else 0
                    for gen in genomes_list]

        # sequence window must match at least one genome in order to be written
        if not any(tgt_code):
            print("Variant at chrom {}, pos {} skipped: No unique matches found in tested genome(s)".format(record.CHROM, record.POS))
            continue

        # 4. attach TGT code to the record and keep it
        record.INFO['TGT'] = tgt_code
        record_list.append(record)

    return record_list

##### FUNCTIONS TO ANNOTATE VARIANTS, BASED ON EILON'S OLIGO DESIGN PIPELINE #####
def get_genes_len(input_gff_filename):
    """
    Build a table of gene lengths from the 'gene' features of a GFF file.

    Lines with fewer than five tab-separated fields are ignored. The gene ID is the text
    between 'ID=' and ';Name=' and the length is end - start + 1 (columns 5 and 4).
    Returns a DataFrame with columns 'Gene' and 'len_bp', indexed by gene ID, with
    duplicate rows removed.
    """
    gene_ids = []
    gene_lengths = []
    with open(input_gff_filename) as gff:
        for line in gff:
            fields = line.split('\t')
            if len(fields) < 5 or fields[2] != 'gene':
                continue
            gene_ids.append(line.split('ID=')[1].split(';Name=')[0])
            gene_lengths.append(int(fields[4]) - int(fields[3]) + 1)

    length_table = pd.DataFrame({'Gene': gene_ids, 'len_bp': gene_lengths})
    length_table = length_table.set_index(keys='Gene', drop=False)
    length_table.index.name = ''
    length_table = length_table.drop_duplicates()

    return length_table

def get_cdss_len(input_gff_filename):
    """
    Build a table of total CDS length per gene from the 'CDS' features of a GFF file.

    Lines with fewer than five tab-separated fields are ignored. The gene is the text
    between 'Parent=' and ';Name=' and each CDS contributes end - start + 1 bases
    (columns 5 and 4); lengths are summed per gene.
    Returns a DataFrame with columns 'len_bp' and 'Gene', indexed by gene.
    """
    parent_ids = []
    cds_lengths = []
    with open(input_gff_filename) as gff:
        for line in gff:
            fields = line.split('\t')
            if len(fields) < 5 or fields[2] != 'CDS':
                continue
            parent_ids.append(line.split('Parent=')[1].split(';Name=')[0])
            cds_lengths.append(int(fields[4]) - int(fields[3]) + 1)

    per_cds = pd.DataFrame({'Gene': parent_ids, 'len_bp': cds_lengths})
    # add up every CDS segment that belongs to the same gene
    per_gene = per_cds.groupby('Gene').sum()
    per_gene['Gene'] = per_gene.index
    per_gene.index.name = ''
    per_gene = per_gene.drop_duplicates()

    return per_gene

def get_dubious_genes(input_gff_filename):
    """
    List the IDs of 'gene' features whose GFF line mentions 'dubious' (any letter case).

    Lines with fewer than five tab-separated fields are ignored. The ID is the text
    between 'ID=' and ';Name=' on the matching line.
    """
    dubious_ids = []
    with open(input_gff_filename) as gff:
        for line in gff:
            fields = line.split('\t')
            if len(fields) < 5:
                continue
            if fields[2] == 'gene' and re.search('dubious', line, flags=re.I):
                dubious_ids.append(line.split('ID=')[1].split(';Name=')[0])

    return dubious_ids

def annotate_variants_by_VEPoutput(vep_input_vcf_filename, vep_output_filename, input_gff_filename, annotated_res_output_filename, id_colname = 'var_id'):
    """
    Combine a VCF with its VEP output table into one annotated row per variant.

    vep_input_vcf_filename:        VCF that was supplied to VEP; one output row is created per record,
                                   indexed by the record ID
    vep_output_filename:           tab-separated VEP output ("-" is read as missing)
    input_gff_filename:            GFF used for gene lengths, CDS lengths and the list of dubious genes
    annotated_res_output_filename: path the annotated table is written to (tab-separated)
    id_colname:                    name given to the variant ID column/index

    For each variant ID in the VEP table (rows for dubious genes removed):
      - all upstream/downstream distances are stored as '|'-joined strings
      - if every consequence is upstream/downstream, the closest gene on each side is recorded,
        the region is classified (intergenic / unidirectional_promoter / bidirectional_promoter)
        and the closest annotation overall is selected
      - otherwise the first annotation in the highest IMPACT tier present
        (HIGH > MODERATE > LOW > MODIFIER) is selected; ValueError is raised if none is present
      - for synonymous and missense variants, cDNA/CDS/protein positions and length fractions are added

    Returns the annotated DataFrame (also written to annotated_res_output_filename).
    """
    cdss_len_df = get_cdss_len(input_gff_filename)
    genes_len_df = get_genes_len(input_gff_filename)
    dubious_genes = get_dubious_genes(input_gff_filename)
    
    vep_df = pd.read_table(vep_output_filename, sep='\t', na_values = "-", low_memory=False)
    # discard annotations that refer to dubious genes
    vep_df = vep_df.loc[~vep_df.Gene.isin(dubious_genes),]
    vep_df.rename(columns={"#Uploaded_variation" : id_colname}, inplace=True)
    
    # prepare annotated variants df using VCF supplied to VEP
    variants_annotated_df = pd.Series([rec for rec in vcf.Reader(filename=vep_input_vcf_filename)])
    var_id_list = []
    assemble = []
    columns = ['CHROM', 'POS', 'REF', 'ALT', 'AC', 'AN' ,'AF']
    for rec in variants_annotated_df:
        var_id_list.append(rec.ID)
        # only the first ALT allele (and its AC/AF entries) is kept per record
        assemble.append((rec.CHROM, rec.POS, rec.REF, rec.ALT[0], rec.INFO['AC'][0], rec.INFO['AN'], rec.INFO['AF'][0]))
    variants_annotated_df = pd.DataFrame.from_records(assemble, index=var_id_list, columns=columns)
    variants_annotated_df.index.name = id_colname
    
    # parse VEP output text file by ID for annotations
    entry_cnt = 0
    for cur_id, anno in vep_df.groupby(id_colname):
        entry_cnt +=1
        # progress message every 5000 variants
        if (entry_cnt % 5000 == 0):
            print("parsing entry: %d"% (entry_cnt))
        
        # all up/down stream genes (VEP default search range is 5kb from variant position)
        cur_upstream_all_str =  '|'.join([str(int(d)) for d in anno.DISTANCE[anno.Consequence.isin(['upstream_gene_variant' ])].values if not np.isnan(d)])
        cur_downstream_all_str =  '|'.join([str(int(d)) for d in anno.DISTANCE[anno.Consequence.isin(['downstream_gene_variant' ])].values if not np.isnan(d)])
        
        variants_annotated_df.loc[cur_id, 'upstream_distance_str'] = cur_upstream_all_str
        variants_annotated_df.loc[cur_id, 'downstream_distance_str'] = cur_downstream_all_str

        is_up_down_stream = anno.Consequence.isin(['upstream_gene_variant' ,'downstream_gene_variant'])
        
        # identify intergenic variants
        if np.all(is_up_down_stream):
            # a gene is on the "left" when the variant is upstream of a reverse-strand gene
            # or downstream of a forward-strand gene; "right" is the mirror case
            left_genes = ( ((anno.Consequence =='upstream_gene_variant') & (anno.STRAND<0)) |
                          ((anno.Consequence =='downstream_gene_variant') & (anno.STRAND>0)) )

            right_genes = ( ((anno.Consequence =='upstream_gene_variant') & (anno.STRAND>0)) |
                              ((anno.Consequence =='downstream_gene_variant') & (anno.STRAND<0)) )

            if (left_genes.sum() > 0):
                ind_of_closest_left = anno.loc[left_genes,'DISTANCE'].idxmin()
                
                variants_annotated_df.loc[cur_id, 'closest_gene1_Gene_Name'] = anno['SYMBOL'][ind_of_closest_left]
                variants_annotated_df.loc[cur_id, 'closest_gene1_Gene_ID'] = anno['Gene'][ind_of_closest_left]
                variants_annotated_df.loc[cur_id, 'closest_gene1_Annotation'] = anno['Consequence'][ind_of_closest_left]
                variants_annotated_df.loc[cur_id, 'closest_gene1_Distance'] = anno['DISTANCE'][ind_of_closest_left]

            if (right_genes.sum() > 0):
                ind_of_closest_right = anno.loc[right_genes,'DISTANCE'].idxmin()
                
                variants_annotated_df.loc[cur_id, 'closest_gene2_Gene_Name'] = anno['SYMBOL'][ind_of_closest_right]
                variants_annotated_df.loc[cur_id, 'closest_gene2_Gene_ID'] = anno['Gene'][ind_of_closest_right]
                variants_annotated_df.loc[cur_id, 'closest_gene2_Annotation'] = anno['Consequence'][ind_of_closest_right]
                variants_annotated_df.loc[cur_id, 'closest_gene2_Distance'] = anno['DISTANCE'][ind_of_closest_right]

            # determine type of noncoding variant
            # Caution: variants without genes on one side will be labelled as "intergenic"
            # both closest genes upstream-of -> bidirectional promoter; exactly one -> unidirectional;
            # neither -> stays intergenic
            cur_region = 'intergenic'
            if ((left_genes.sum() > 0) and (right_genes.sum() > 0)): 
                if (anno['Consequence'][ind_of_closest_left] == 'upstream_gene_variant'):
                    if (anno['Consequence'][ind_of_closest_right] == 'upstream_gene_variant'):
                        cur_region = 'bidirectional_promoter'
                    else:
                        cur_region = 'unidirectional_promoter'
                else:
                    if (anno['Consequence'][ind_of_closest_right] == 'upstream_gene_variant'):
                        cur_region = 'unidirectional_promoter'

            variants_annotated_df.loc[cur_id, 'region'] = cur_region

            # index of closest gene
            sel_row = anno.DISTANCE.idxmin()
        
        # for variants that land in a gene, select annotation with highest impact
        else:
            # ignore neighbouring-gene annotations once the variant hits a gene itself
            anno = anno.loc[~is_up_down_stream,:]
            if (anno.IMPACT == 'HIGH').sum()>0:
                anno = anno.loc[anno.IMPACT == 'HIGH',:]
            elif (anno.IMPACT == 'MODERATE').sum()>0:
                anno = anno.loc[anno.IMPACT == 'MODERATE',:]
            elif (anno.IMPACT == 'LOW').sum()>0:
                anno = anno.loc[anno.IMPACT == 'LOW',:]
            elif (anno.IMPACT == 'MODIFIER').sum()>0:
                anno = anno.loc[anno.IMPACT == 'MODIFIER',:] #NOTE: includes introns, noncoding transcripts within length of gene
            
            else:
                # no recognised IMPACT value among the remaining annotations
                raise ValueError(anno)

            # first remaining row is used when several annotations share the top impact tier
            sel_row = anno.index[0]
            variants_annotated_df.loc[cur_id, 'region'] = anno['Consequence'][sel_row]

        # closest annotation recorded
        variants_annotated_df.loc[cur_id, 'Annotation'] = anno['Consequence'][sel_row]
        variants_annotated_df.loc[cur_id, 'Annotation_Impact'] = anno['IMPACT'][sel_row]
        variants_annotated_df.loc[cur_id, 'Gene_Name'] = anno['SYMBOL'][sel_row]
        variants_annotated_df.loc[cur_id, 'Gene_ID'] = anno['Gene'][sel_row]
        variants_annotated_df.loc[cur_id, 'Transcript_BioType'] = anno['BIOTYPE'][sel_row]
        variants_annotated_df.loc[cur_id, 'Existing_variation'] = anno['Existing_variation'][sel_row]
        variants_annotated_df.loc[cur_id, 'HGVSc'] = anno['HGVSc'][sel_row]
        variants_annotated_df.loc[cur_id, 'HGVSp'] = anno['HGVSp'][sel_row]
        variants_annotated_df.loc[cur_id, 'SWISSPROT'] = anno['SWISSPROT'][sel_row]
        variants_annotated_df.loc[cur_id, 'UNIPARC'] = anno['UNIPARC'][sel_row]
        variants_annotated_df.loc[cur_id, 'BLOSUM62'] = anno['BLOSUM62'][sel_row]
        # NOTE(review): HGVSp is assigned a second time here with the same value; redundant
        variants_annotated_df.loc[cur_id, 'HGVSp'] = anno['HGVSp'][sel_row]
#         variants_annotated_df.set_value(index=cur_id,col='VEP_CSN',value=anno['CSN'][sel_row])
        
        # store cDNA and CDS positions
        cdna_pos = anno['cDNA_position'][sel_row]
        cds_pos = anno['CDS_position'][sel_row]
    
        # add additional info for missense and synonymous variants
        if (anno['Consequence'][sel_row] in ['synonymous_variant','missense_variant']):
            # because variants were extracted from 1011 genomes GVCF, some SNPs have trailing bases.
            # Calculate the correct cDNA and CDS positions if so 
            # NOTE(review): the `in` tests assume the positions were read as strings; if pandas
            # parsed the column as numeric these lines would raise TypeError - confirm
            if '-' in cdna_pos:
                cdna_pos = cdna_pos.split('-')[0]
            if '-' in cds_pos:
                cds_pos = cds_pos.split('-')[0]
                
            
            # cDNA_len is taken from the 'gene' feature length in the GFF
            if anno['Gene'][sel_row] in genes_len_df.index:
                cur_gene_len = int(genes_len_df.loc[anno['Gene'][sel_row],'len_bp'])
                variants_annotated_df.loc[cur_id, 'cDNA_len'] = cur_gene_len
                variants_annotated_df.loc[cur_id, 'cDNA_frac'] = int(cdna_pos)/cur_gene_len
            
            if anno['Gene'][sel_row] in cdss_len_df.index:
                cur_cds_len = int(cdss_len_df.loc[anno['Gene'][sel_row],'len_bp'])
                variants_annotated_df.loc[cur_id, 'CDS_len'] = cur_cds_len
                variants_annotated_df.loc[cur_id, 'CDS_frac'] = int(cds_pos)/cur_cds_len
                variants_annotated_df.loc[cur_id, 'AA_len'] = cur_cds_len/3
                # NOTE(review): AA_frac is computed from the CDS position and length, so it
                # always equals CDS_frac
                variants_annotated_df.loc[cur_id, 'AA_frac'] = int(cds_pos)/cur_cds_len
                
            variants_annotated_df.loc[cur_id, 'cDNA_pos'] = cdna_pos
            variants_annotated_df.loc[cur_id, 'CDS_pos'] = cds_pos
            variants_annotated_df.loc[cur_id, 'AA_pos'] = anno['Protein_position'][sel_row]
            variants_annotated_df.loc[cur_id, 'Amino_acids'] = anno['Amino_acids'][sel_row]
            variants_annotated_df.loc[cur_id, 'Codons'] = anno['Codons'][sel_row]
            variants_annotated_df.loc[cur_id, 'DOMAINS'] = anno['DOMAINS'][sel_row]

    variants_annotated_df.to_csv(annotated_res_output_filename, sep='\t', header=True, index=True)
    
    return(variants_annotated_df)

###############################################################################################################
def find_pool_size_options(num_of_oligos, min_pool_size, max_pool_size, complete_pool_size=121):
    """
    helper function to find number of pools required to fit a certain number of oligos,
    as well as the average size of the pool

    num_of_oligos:      total number of oligos to distribute
    min_pool_size:      smallest acceptable number of oligos per pool
    max_pool_size:      largest acceptable number of oligos per pool
    complete_pool_size: size of a full pool; the difference to the oligos per pool is
                        reported as technical oligos

    Prints a summary for every candidate number of pools (a count of 0 is skipped) and
    returns a numpy array of the candidate pool counts, from
    num_of_oligos // max_pool_size to num_of_oligos // min_pool_size inclusive.
    """
    print('Total number of oligos to fit:', num_of_oligos)

    # num of pools to use
    num_of_pools_lower = num_of_oligos // max_pool_size
    num_of_pools_upper = num_of_oligos // min_pool_size
    # print instead of IPython's display() so the helper also runs outside a notebook
    print((num_of_pools_lower, num_of_pools_upper))

    # explore pooling options for all possible pool configurations
    for num_of_pools in range(num_of_pools_lower, num_of_pools_upper + 1):
        if num_of_pools == 0:
            continue
        oligos_per_pool = min(num_of_oligos // num_of_pools, max_pool_size)
        print(f"Use {num_of_pools} pools:")
        print("Number of oligos per pool:", oligos_per_pool)
        print("Number of oligos leftover:", num_of_oligos - num_of_pools * oligos_per_pool)
        print("Number of technical oligos:", (complete_pool_size - oligos_per_pool) * num_of_pools)
        print()

    # return array of candidate numbers of pools
    return np.arange(num_of_pools_lower, num_of_pools_upper + 1)

# Parse Costanzo dataset to count number of genetic interactions per gene and assemble summary table

In [3]:
# directory holding the genetic interaction tables; also becomes the working directory
working_dir = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/"
# interaction tables to parse
file_list = ['SGA_ExE.txt', 'SGA_ExN_NxE.txt', 'SGA_NxN.txt']
# per-gene summary of genetic interactions is written here
out_file = f"{working_dir}genetic_interactions_summary_by_gene.txt"
os.chdir(working_dir)

In [3]:
# read every interaction table and stack them into a single dataframe
interaction_tables = []
for file in file_list:
    interaction_tables.append(pd.read_csv(working_dir+file, sep='\t', header = 0))
df = pd.concat(interaction_tables, axis=0, join='inner')
df.columns = ['query_strain', 'query_allele', 'array_strain', 'array_allele', 'arraytype', 'genetic_interaction_score', 'pval', 'query_smf', 'array_smf', 'dmf', 'dmf_std']
df['genetic_interaction_score_abs'] = df['genetic_interaction_score'].abs()

# gene ID is the strain name up to the first '_'; gene name is the allele name up to the first '-'
df['query_gene_id'] = [strain.partition('_')[0] for strain in df['query_strain']]
df['query_gene_name'] = [allele.partition('-')[0] for allele in df['query_allele']]
df['array_gene_id'] = [strain.partition('_')[0] for strain in df['array_strain']]
df['array_gene_name'] = [allele.partition('-')[0] for allele in df['array_allele']]
# order-independent key for each pair of genes
df['gene_pair'] = ['_'.join(sorted(pair)) for pair in zip(df['query_gene_id'], df['array_gene_id'])]

# get list of genes (collected before filtering, so genes without significant interactions are included)
all_genes = set(df.query_gene_id) | set(df.array_gene_id)

# filter down to significant interactions only
pval_threshold=0.05
interaction_threshold=0.08
df = df.query('genetic_interaction_score_abs>@interaction_threshold & pval<@pval_threshold')
display(df.shape)
display(df.head())

(890107, 17)

Unnamed: 0,query_strain,query_allele,array_strain,array_allele,arraytype,genetic_interaction_score,pval,query_smf,array_smf,dmf,dmf_std,genetic_interaction_score_abs,query_gene_id,query_gene_name,array_gene_id,array_gene_name,gene_pair
1,YAL001C_tsq508,tfc3-g349e,YBL026W_tsa1065,lsm2-5001,TSA30,-0.3529,3.591e-06,0.8285,0.9408,0.4266,0.079,0.3529,YAL001C,tfc3,YBL026W,lsm2,YAL001C_YBL026W
6,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa950,stu1-7,TSA30,-0.1294,0.01931,0.8285,0.669,0.4249,0.0482,0.1294,YAL001C,tfc3,YBL034C,stu1,YAL001C_YBL034C
15,YAL001C_tsq508,tfc3-g349e,YBL097W_tsa510,brn1-9,TSA30,-0.0808,5.582e-15,0.8285,0.5464,0.3719,0.0077,0.0808,YAL001C,tfc3,YBL097W,brn1,YAL001C_YBL097W
24,YAL001C_tsq508,tfc3-g349e,YBR029C_tsa1063,cds1-5001,TSA30,-0.1173,8.243e-05,0.8285,0.9007,0.6289,0.0226,0.1173,YAL001C,tfc3,YBR029C,cds1,YAL001C_YBR029C
30,YAL001C_tsq508,tfc3-g349e,YBR060C_tsa311,orc2-2,TSA30,0.2516,4.729e-12,0.8285,0.7384,0.8634,0.0362,0.2516,YAL001C,tfc3,YBR060C,orc2,YAL001C_YBR060C


In [4]:
# find strongest interaction in each pair of genes
def _strongest_interaction(pair_rows):
    # row with the largest absolute interaction score within one gene pair
    return pair_rows.sort_values('genetic_interaction_score_abs', ascending=False).iloc[0]

# keep one row (the strongest interaction) for each pair of genes
gene_pairs_df = df.groupby('gene_pair').apply(_strongest_interaction)
display(gene_pairs_df.shape)
display(gene_pairs_df.head())

(780653, 17)

Unnamed: 0_level_0,query_strain,query_allele,array_strain,array_allele,arraytype,genetic_interaction_score,pval,query_smf,array_smf,dmf,dmf_std,genetic_interaction_score_abs,query_gene_id,query_gene_name,array_gene_id,array_gene_name,gene_pair
gene_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
YAL001C_YBL010C,YAL001C_tsq508,tfc3-g349e,YBL010C_dma88,ybl010c,DMA30,-0.0818,0.0343,0.8285,0.987,0.736,0.0361,0.0818,YAL001C,tfc3,YBL010C,ybl010c,YAL001C_YBL010C
YAL001C_YBL026W,YAL001C_tsq508,tfc3-g349e,YBL026W_tsa1065,lsm2-5001,TSA30,-0.3529,3.591e-06,0.8285,0.9408,0.4266,0.079,0.3529,YAL001C,tfc3,YBL026W,lsm2,YAL001C_YBL026W
YAL001C_YBL034C,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa950,stu1-7,TSA30,-0.1294,0.01931,0.8285,0.669,0.4249,0.0482,0.1294,YAL001C,tfc3,YBL034C,stu1,YAL001C_YBL034C
YAL001C_YBL097W,YAL001C_tsq508,tfc3-g349e,YBL097W_tsa510,brn1-9,TSA30,-0.0808,5.582e-15,0.8285,0.5464,0.3719,0.0077,0.0808,YAL001C,tfc3,YBL097W,brn1,YAL001C_YBL097W
YAL001C_YBR029C,YAL001C_tsq508,tfc3-g349e,YBR029C_tsa1063,cds1-5001,TSA30,-0.1173,8.243e-05,0.8285,0.9007,0.6289,0.0226,0.1173,YAL001C,tfc3,YBR029C,cds1,YAL001C_YBR029C


In [5]:
# for each gene, record number of positive and negative interactions that meet interaction and pvalue thresholds
# for each gene, count the positive and negative interactions among the gene pairs it takes part in
gene_list = []
positive_interactions = []
negative_interactions = []
for gene in all_genes:
    scores = gene_pairs_df.query('query_gene_id==@gene | array_gene_id==@gene')['genetic_interaction_score']
    gene_list.append(gene)
    positive_interactions.append(int((scores > interaction_threshold).sum()))
    negative_interactions.append(int((scores < interaction_threshold*-1).sum()))

num_of_interactions_df = pd.DataFrame({'positive_interactions':positive_interactions, 'negative_interactions':negative_interactions}, index=gene_list)
# order genes by their total number of interactions, highest first
total_interactions = num_of_interactions_df.sum(axis=1)
num_of_interactions_df = num_of_interactions_df.loc[total_interactions.sort_values(ascending=False).index]
num_of_interactions_df.index.name = 'gene_id'
display(num_of_interactions_df.shape)
display(num_of_interactions_df)

num_of_interactions_df.to_csv(out_file, sep='\t', header=True, index=True)

(5707, 2)

Unnamed: 0_level_0,positive_interactions,negative_interactions
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
YFL039C,1324,2220
YFL034C-B,1150,1897
YJR076C,749,1342
YBL105C,841,1139
YER157W,925,967
...,...,...
YLR148W,2,2
YJL189W,0,3
YAL067W-A,0,3
YJL056C,0,2


# Read S288C GFF file and extract gene information

In [138]:
from urllib.parse import unquote

gff_file = '/home/users/rang/yeast/genomes/saccharomyces_cerevisiae_R64-1-1_20110208.gff'
# no longer used for decoding below (unquote handles every percent-escape, e.g. %3A),
# but the name is kept defined in case other cells refer to it
char_map = {'%20' : ' ',
            '%2C' : ',',
            '%2F' : '/',
            '%3B' : ';'}

# gene_id -> [chrom, start, end, strand, gene_name, description] for every 'gene' feature
gene_records = {}
with open(gff_file, 'r') as gff:
    for line in gff:
        line = line.rstrip()
        # '###' (or an explicit FASTA section) marks the end of the features to read
        if line == '###' or line == '##FASTA':
            break
        # skip blank lines and comment/directive lines
        if not line or line.startswith('#'):
            continue

        chrom, _, feature, start, end, _, strand, _, info = line.split('\t')
        if feature != 'gene':
            continue

        # attributes are ';'-separated key=value pairs; split on the first '=' only
        info = dict(field.partition('=')[::2] for field in info.split(';'))
        gene_id = info['ID']
        gene_name = info.get('gene', np.nan)
        # decode percent-escapes in the free-text description
        desc = unquote(info['Note']) if 'Note' in info else np.nan
        # store gene info
        gene_records[gene_id] = [chrom, int(start), int(end), strand, gene_name, desc]

gene_info_df = pd.DataFrame.from_dict(gene_records, orient='index', columns=['chrom', 'start', 'end', 'strand', 'gene_name', 'description'])

In [139]:
# attach gene coordinates, names and descriptions from the GFF to the per-gene interaction counts
num_of_interactions_file = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/genetic_interactions_summary_by_gene.txt"
num_of_interactions_df = (
    pd.read_csv(num_of_interactions_file, sep='\t', header = 0, index_col=0)
    .join(gene_info_df)
    .sort_values(['chrom', 'start'])
)
# reorder so that genes with the most interactions (positive + negative) come first
interaction_totals = num_of_interactions_df[['positive_interactions', 'negative_interactions']].sum(axis=1)
num_of_interactions_df = num_of_interactions_df.loc[interaction_totals.sort_values(ascending=False).index]
display(num_of_interactions_df)

num_of_interactions_df.to_csv(out_file.replace('.txt', '_annotated.txt'), sep='\t', header=True, index=True)

Unnamed: 0_level_0,positive_interactions,negative_interactions,chrom,start,end,strand,gene_name,description
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
YFL039C,1324,2220,chrVI,53260.0,54696.0,-,ACT1,"Actin, structural protein involved in cell pol..."
YFL034C-B,1150,1897,chrVI,63016.0,63993.0,-,MOB2,"Component of the RAM signaling network, that a..."
YJR076C,749,1342,chrX,575354.0,576601.0,-,CDC11,Component of the septin ring of the mother-bud...
YBL105C,841,1139,chrII,14241.0,17696.0,-,PKC1,Protein serine/threonine kinase essential for ...
YER157W,925,967,chrV,484788.0,487193.0,+,COG3,Essential component of the conserved oligomeri...
...,...,...,...,...,...,...,...,...
YFL060C,0,4,chrVI,10301.0,10969.0,-,SNO3,"Protein of unknown function, nearly identical ..."
YJL189W,0,3,chrX,75933.0,76474.0,+,RPL39,Protein component of the large (60S) ribosomal...
YAL067W-A,0,3,chrI,2480.0,2707.0,+,,Putative protein of unknown function; identifi...
YJL056C,0,2,chrX,330431.0,333073.0,-,ZAP1,Zinc-regulated transcription factor; binds to ...


# Selecting variants in ergosterol pathway genes
Survey natural variation in ergosterol genes and test for GxE/GxG effects

In [140]:
# directory for the ergosterol analysis; also becomes the working directory
working_dir = "/home/users/rang/scratch/yeast/ergosterol/"
# table listing the ergosterol genes
ergosterol_genes_file = f"{working_dir}ergosterol_biosynthetic_process_genes.txt"
# per-gene summary of genetic interactions
genetic_interactions_summary_file = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/genetic_interactions_summary_by_gene.txt"

os.chdir(working_dir)

In [141]:
# read the ergosterol genes table, index it by systematic gene ID and add the GFF annotation
# (gene_name is dropped from the GFF info because the table already carries it)
ergosterol_genes = (
    pd.read_csv(ergosterol_genes_file, sep='\t')
    .rename(columns={'Gene Systematic Name':'gene_id', 'Gene':'gene_name'})
    .set_index('gene_id')
    .merge(gene_info_df.drop(columns='gene_name'), how='left', left_index=True, right_index=True)
)

# add genetic interaction counts; genes missing from the summary get NaN
genetic_interactions = pd.read_csv(genetic_interactions_summary_file, sep='\t', index_col=0)
ergosterol_genes = ergosterol_genes.merge(genetic_interactions, how='left', left_index=True, right_index=True)

display(ergosterol_genes)

Unnamed: 0_level_0,gene_name,Details,Mutant Information,Breslow fitness,pathway,chrom,start,end,strand,description,positive_interactions,negative_interactions
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
YPL028W,ERG10,Media: minimal medium Details: Relative fitnes...,reduction of function,0.935,upstream,chrXVI,498096,499292,+,Acetyl-CoA C-acetyltransferase (acetoacetyl-Co...,152.0,263.0
YMR208W,ERG12,Media: minimal medium Details: Relative fitnes...,reduction of function,1.016,upstream,chrXIII,684467,685798,+,"Mevalonate kinase, acts in the biosynthesis of...",27.0,107.0
YML126C,ERG13,Media: minimal medium Details: Relative fitnes...,reduction of function,0.929,upstream,chrXIII,19060,20535,-,3-hydroxy-3-methylglutaryl-CoA (HMG-CoA) synth...,206.0,316.0
YJL167W,ERG20,Media: minimal medium Details: Relative fitnes...,reduction of function,0.995,upstream,chrX,105014,106072,+,"Farnesyl pyrophosphate synthetase, has both di...",,
YMR220W,ERG8,Media: minimal medium Details: Relative fitnes...,reduction of function,0.953,upstream,chrXIII,712316,713671,+,"Phosphomevalonate kinase, an essential cytosol...",102.0,150.0
YHR190W,ERG9,,,,upstream,chrVIII,484845,486179,+,Farnesyl-diphosphate farnesyl transferase (squ...,284.0,423.0
YML075C,HMG1,Media: minimal medium Details: Relative fitnes...,,0.959,upstream,chrXIII,115734,118898,-,One of two isozymes of HMG-CoA reductase that ...,108.0,143.0
YLR450W,HMG2,Media: minimal medium Details: Relative fitnes...,,0.929,upstream,chrXII,1032627,1035764,+,One of two isozymes of HMG-CoA reductase that ...,44.0,113.0
YPL117C,IDI1,,,,upstream,chrXVI,327864,328730,-,Isopentenyl diphosphate%3Adimethylallyl diphos...,78.0,168.0
YNR043W,MVD1,Media: minimal medium Details: Relative fitnes...,reduction of function,0.909,upstream,chrXIV,701895,703085,+,"Mevalonate pyrophosphate decarboxylase, essent...",176.0,364.0


In [181]:
# search ergosterol genes for variants in each gvcf
# map GFF chromosome names (chrI ... chrXVI) to the names used by the gvcf files (chromosome1 ... chromosome16)
roman_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                  'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
chromosome_dict = {'chr' + numeral: 'chromosome{}'.format(number)
                   for number, numeral in enumerate(roman_numerals, start=1)}

# one task per chromosome: the ergosterol genes on it and the matching gvcf file
genes_to_process = []
for chrom, gvcf_chrom in chromosome_dict.items():
    genes_df = ergosterol_genes.query('chrom==@chrom').sort_values('start')
    gvcf_file = "/home/users/rang/share/yeast/1011genomes/by_chrom/{}.gvcf.gz".format(gvcf_chrom)
    genes_to_process.append((genes_df, gvcf_file, chromosome_dict))

# search all chromosomes in parallel and consolidate candidate variants
num_of_cores = min(len(os.sched_getaffinity(0)), len(genes_to_process))
with mp.Pool(num_of_cores) as pool:
    results = pool.starmap(select_variants, genes_to_process)
results = pd.concat(results).reset_index(drop=True)

# write to new vcf file, using the first gvcf as header template
ergosterol_variants_file = "ergosterol_variants.vcf"
template = vcf.Reader(filename=genes_to_process[0][1])
# change chromosome naming convention to a single Roman numeral (follows ref genome, VEP formatting)
chromosome_dict_rev = {gvcf_name: gff_name for gff_name, gvcf_name in chromosome_dict.items()}
renamed_contigs = OrderedDict()
for contig_id, contig in template.contigs.items():
    roman_name = chromosome_dict_rev[contig_id][3:]  # strip the leading 'chr'
    renamed_contigs[roman_name] = contig._replace(id=roman_name)
template.contigs = renamed_contigs
# remove 1011 genomes samples data
template.samples = []

output = vcf.Writer(open(ergosterol_variants_file, 'w'), template)
for record in results:
    # update chromosome naming
    record.CHROM = chromosome_dict_rev[record.CHROM][3:]
    # write to file
    output.write_record(record)
output.close()

In [182]:
# path to genome fasta files
ref_genome_fasta = "/home/users/rang/yeast/genomes/Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa"
rm_genome_fasta = "/home/users/rang/yeast/genomes/RM11-1A_SGD_2015_JRIP00000000.fsa"
yjm_genome_fasta = "/home/users/rang/yeast/genomes/YJM789_Stanford_2007_AAFW02000000_highQuality31.fsa"
yps_genome_fasta = "/home/users/rang/yeast/genomes/YPS128.genome.fa"

# path to input and output VCF files
ergosterol_variants_file = "ergosterol_variants.vcf"
ergosterol_variants_select_file = ergosterol_variants_file.replace('.vcf', '_select.vcf')

# load genome files (each becomes a dict of sequence records keyed by record id)
ref_genome = SeqIO.to_dict(SeqIO.parse(ref_genome_fasta, "fasta"))
rm_genome = SeqIO.to_dict(SeqIO.parse(rm_genome_fasta, "fasta"))
yjm_genome = SeqIO.to_dict(SeqIO.parse(yjm_genome_fasta, "fasta"))
yps_genome = SeqIO.to_dict(SeqIO.parse(yps_genome_fasta, "fasta"))
genomes_list = [ref_genome, rm_genome, yjm_genome, yps_genome] # IMPORTANT: SET REFERENCE GENOME AS FIRST IN LIST


threshold = 10 # threshold value for searching NGGs
window = 61 # size of flanking sequences to search for unique hits

# read in all candidate variants to filter
ergosterol_variants = list(vcf.Reader(filename=ergosterol_variants_file))
# split variants into chunks for multiprocessing
# (chunk size n is chosen so that there are at most as many chunks as available cores)
available_cores = len(os.sched_getaffinity(0))
n = len(ergosterol_variants)//available_cores + 1
ergosterol_variants = [ergosterol_variants[i:i+n] for i in range(0, len(ergosterol_variants), n)]

# find targetable variants (TGT info field indicates which genomes variant is targetable in)
to_process = [(rec_list, genomes_list, threshold, window) for rec_list in ergosterol_variants]
with mp.Pool(available_cores) as pool:
    result = pool.starmap(find_targetable_variants, to_process)

# write targetable variants to file
# the input VCF is re-opened only to reuse its header for the output file
header = vcf.Reader(filename=ergosterol_variants_file)
# add "TGT" info field into VCF header
header_info = namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
header_info_tgt = header_info('TGT', '.', 'Integer', 'Targetability of variant in tested genomes', None, None)
header.infos['TGT'] = header_info_tgt
# flatten the per-chunk results into a single list of records
to_write = [rec for rec_list in result for rec in rec_list]
# write to file; the context manager closes the output even if writing a record fails
with open(ergosterol_variants_select_file, 'w') as select_handle:
    ergosterol_variants_select = vcf.Writer(select_handle, header)
    for rec in to_write:
        ergosterol_variants_select.write_record(rec)


In [200]:
# select variants by allele frequency (currently disabled singleton/doubleton filter)
### if multiple alternate alleles, SPLIT VARIANT ENTRY INTO SEPARATE ENTRIES in VCF (useful for oligo design pipeline)
### assign unique variant ID prefix and number (use format VAR_XXXXX)

# candidate variants carrying the TGT info field (per-genome targetability)
ergosterol_variants_select = vcf.Reader(filename=ergosterol_variants_select_file)

# output file path
ergosterol_variants_select_AFfilter_file = ergosterol_variants_select_file.replace('.vcf', '_AFfilter.vcf')

# running count of variants written per ID prefix; used to number the variant IDs
var_counter = {}
# the context manager closes the output file even if writing a record fails
with open(ergosterol_variants_select_AFfilter_file, 'w') as af_filter_handle:
    ergosterol_variants_select_AFfilter = vcf.Writer(af_filter_handle, ergosterol_variants_select)

    for record in ergosterol_variants_select:
        # (optional step to skip singletons and doubletons -- DISABLED by setting ac<1)
        # a record is skipped only when every alternate allele is below the minimum count
        if all(count < 1 for count in record.INFO['AC']):
            continue

        # copy the per-allele fields before the record is overwritten allele by allele
        alt = record.ALT[:]
        ac = record.INFO['AC'][:]
        af = record.INFO['AF'][:]
        mleac = record.INFO['MLEAC'][:]
        mleaf = record.INFO['MLEAF'][:]

        # split each alt allele into its own variant entry (useful if a locus has multiple possible alleles)
        for i, alt_allele in enumerate(alt):
            record.ALT = [alt_allele]
            record.INFO['AC'] = [ac[i]]
            record.INFO['AF'] = [af[i]]
            record.INFO['MLEAC'] = [mleac[i]]
            record.INFO['MLEAF'] = [mleaf[i]]

            # inspect TGT field and assign variant ID prefix
            ### if tgt field contains all 1 (i.e. targetable by all queried genomes), assign "ERG"
            ### otherwise, assign unique prefix "EG-" with the last digit determined by converting binary to hexadecimal (works well for up to 4 genomes)
            tgt_flags = record.INFO['TGT']
            if all(flag == 1 for flag in tgt_flags):
                var_id_prefix = "ERG"
            else:
                binary_str = ''.join(str(flag) for flag in tgt_flags)
                hex_digit = hex(int(binary_str, 2))[2:].upper()
                var_id_prefix = "EG{}".format(hex_digit)

            # add to counter
            var_counter[var_id_prefix] = var_counter.get(var_id_prefix, 0) + 1

            # add unique variant ID (follow format VAR_XXXXX)
            record.ID = '{}_{:05d}'.format(var_id_prefix, var_counter[var_id_prefix])

            # write to output
            ergosterol_variants_select_AFfilter.write_record(record)


### Variant ID prefix and which genomes they can be edited in:
- ERG: BY | RM | YJM | YPS
- EGE: BY | RM | YJM | ---
- EGD: BY | RM | --- | YPS
- EGC: BY | RM | --- | ---
- EGB: BY | --- | YJM | YPS
- EGA: BY | --- | YJM | ---
- EG9: BY | --- | --- | YPS
- EG8: BY | --- | --- | ---


## Submit ergosterol_variants_select_AFfilter VCF to VEP to get annotations, and download VEP output in TXT format
http://uswest.ensembl.org/Saccharomyces_cerevisiae/Tools/VEP

In [875]:
# add annotations to variants and remove variants that fall in non-ergosterol genes.

ergosterol_variants_file = "ergosterol_variants.vcf"
ergosterol_variants_select_file = ergosterol_variants_file.replace('.vcf', '_select.vcf')

# get VEP annotations and remove variants that fall in non-ergosterol pathway genes
# (all file names below are derived from the AF-filtered VCF name by suffix replacement)
ergosterol_variants_select_AFfilter_file = ergosterol_variants_select_file.replace('.vcf', '_AFfilter.vcf')
vep_output_filename = ergosterol_variants_select_AFfilter_file.replace('.vcf', '_VEPoutput.txt')
variants_annotated_filename = vep_output_filename.replace('_VEPoutput.txt', '_annotated.txt')
variants_annotated_df = annotate_variants_by_VEPoutput(ergosterol_variants_select_AFfilter_file, 
                                                       vep_output_filename, 
                                                       gff_file, 
                                                       variants_annotated_filename, 
                                                       id_colname='var_id')
display(variants_annotated_df)

# annotated file can be read in directly if annotation function has been run once before
variants_annotated_df = pd.read_csv(variants_annotated_filename, sep='\t', index_col=0)
display(variants_annotated_df)

# remove variants that fall within genes, but are not in the approved genes list
# (variants whose region is one of the intergenic/promoter classes are always kept)
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
ergosterol_design_oligos = variants_annotated_df.query('(Gene_ID.isin(@ergosterol_genes.index) | region.isin(@intergenic_regions_list))').copy()
display(ergosterol_design_oligos)

# write variants to ergosterol_design_oligos VCF as a preliminary list to design oligos for
ergosterol_design_oligos_vcf_file = ergosterol_variants_file.replace('.vcf', '_design_oligos_initial.vcf')

ergosterol_variants_select_AFfilter = vcf.Reader(filename=ergosterol_variants_select_AFfilter_file)
# the context manager closes the output file even if writing a record fails
with open(ergosterol_design_oligos_vcf_file, 'w') as design_oligos_handle:
    ergosterol_design_oligos_vcf = vcf.Writer(design_oligos_handle, ergosterol_variants_select_AFfilter)
    for record in ergosterol_variants_select_AFfilter:
        # keep only records whose variant ID survived the gene/region filter above
        if record.ID in ergosterol_design_oligos.index:
            ergosterol_design_oligos_vcf.write_record(record)

# # count variants according to which strains they can be edited in
# display(pd.Series([x[:3] for x in ergosterol_design_oligos.index]).value_counts())

Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERG_00001,V,237113,C,T,1,2022,0.000495,,2373|1622|1347|457|2919,missense_variant,...,Gct/Act,"HAMAP:MF_00563,Gene3D:3.40.50.1480,PIRSF:PIRSF...",,,,,,,,
ERG_00002,V,237121,G,A,9,2022,0.004451,2,2381|1630|1339|449|2911,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,2.0,ERG28,YER044C,downstream_gene_variant,449.0
ERG_00003,V,237122,G,A,9,2022,0.004451,3,2382|1631|1338|448|2910,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,3.0,ERG28,YER044C,downstream_gene_variant,448.0
ERG_00004,V,237136,T,C,26,2022,0.013000,17,2396|1645|1324|434|2896,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,17.0,ERG28,YER044C,downstream_gene_variant,434.0
EGE_00001,V,237149,A,C,43,2022,0.021000,30,2409|1658|1311|421|2883,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,30.0,ERG28,YER044C,downstream_gene_variant,421.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EGB_00245,XVI,499756,G,A,157,2018,0.078000,3761,4583|919|464|2037|4510,missense_variant,...,aGt/aAt,,,,,,,,,
EGB_00246,XVI,499757,T,C,2,2020,0.000990,3760,4582|918|465|2038|4511,synonymous_variant,...,agT/agC,,,,,,,,,
EGB_00247,XVI,499761,G,C,1,2020,0.000495,3756,4578|914|469|2042|4515,missense_variant,...,Ggt/Cgt,,,,,,,,,
ERG_02770,XVI,499778,A,G,151,2020,0.075000,3739,4561|897|486|2059|4532,synonymous_variant,...,aaA/aaG,,,,,,,,,


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERG_00002,V,237121,G,A,9,2022,0.004451,2,2381|1630|1339|449|2911,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,2.0,ERG28,YER044C,downstream_gene_variant,449.0
ERG_00003,V,237122,G,A,9,2022,0.004451,3,2382|1631|1338|448|2910,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,3.0,ERG28,YER044C,downstream_gene_variant,448.0
ERG_00004,V,237136,T,C,26,2022,0.013000,17,2396|1645|1324|434|2896,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,17.0,ERG28,YER044C,downstream_gene_variant,434.0
EGE_00001,V,237149,A,C,43,2022,0.021000,30,2409|1658|1311|421|2883,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,30.0,ERG28,YER044C,downstream_gene_variant,421.0
EGE_00002,V,237151,T,C,2,2022,0.000989,32,2411|1660|1309|419|2881,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,32.0,ERG28,YER044C,downstream_gene_variant,419.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EGA_00041,XVI,499655,G,A,4,2022,0.001978,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0
EGA_00042,XVI,499655,G,T,35,2022,0.017000,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0
EGA_00043,XVI,499656,A,G,26,2022,0.013000,3861|9,4683|1019|364|1937|4410,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,364.0,SMA1,YPL027W,upstream_gene_variant,9.0
EGA_00044,XVI,499659,G,A,1,2022,0.000495,3858|6,4680|1016|367|1940|4413,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,367.0,SMA1,YPL027W,upstream_gene_variant,6.0


## Design oligos for initial list, then filter down to variants with at least 2 oligos designed

In [142]:
# table of oligos designed for the initial variant list
oligo_file = os.path.expanduser("~/crispey3/initial_design/Output/all_SNPs_ergosterol_initial_GG_9bp_OLIGO.tab")
ergosterol_oligo_df = pd.read_csv(oligo_file, sep='\t')

# reload the annotated variants, then
# remove variants that fall within genes, but are not in the approved genes list
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
ergosterol_design_oligos = (
    pd.read_csv("/home/users/rang/scratch/yeast/ergosterol/ergosterol_variants_select_AFfilter_annotated.txt", sep='\t', index_col=0)
    .query('(Gene_ID.isin(@ergosterol_genes.index) | region.isin(@intergenic_regions_list))')
    .copy()
)
display(ergosterol_design_oligos)

# count oligo rows per variant ID and attach the count (aligned on the variant ID index)
oligos_per_variant = ergosterol_oligo_df.groupby('var_id').apply(len)
ergosterol_design_oligos['num_of_oligos'] = oligos_per_variant
# keep variants with more than one oligo; variants with no oligo have a missing count and drop out as well
has_multiple_oligos = ergosterol_design_oligos['num_of_oligos'] > 1
ergosterol_oligos_final = ergosterol_design_oligos[has_multiple_oligos].copy()
display(ergosterol_oligos_final)

Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERG_00002,V,237121,G,A,9,2022,0.004451,2,2381|1630|1339|449|2911,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,2.0,ERG28,YER044C,downstream_gene_variant,449.0
ERG_00003,V,237122,G,A,9,2022,0.004451,3,2382|1631|1338|448|2910,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,3.0,ERG28,YER044C,downstream_gene_variant,448.0
ERG_00004,V,237136,T,C,26,2022,0.013000,17,2396|1645|1324|434|2896,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,17.0,ERG28,YER044C,downstream_gene_variant,434.0
EGE_00001,V,237149,A,C,43,2022,0.021000,30,2409|1658|1311|421|2883,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,30.0,ERG28,YER044C,downstream_gene_variant,421.0
EGE_00002,V,237151,T,C,2,2022,0.000989,32,2411|1660|1309|419|2881,unidirectional_promoter,...,,,SAH1,YER043C,upstream_gene_variant,32.0,ERG28,YER044C,downstream_gene_variant,419.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EGA_00041,XVI,499655,G,A,4,2022,0.001978,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0
EGA_00042,XVI,499655,G,T,35,2022,0.017000,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0
EGA_00043,XVI,499656,A,G,26,2022,0.013000,3861|9,4683|1019|364|1937|4410,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,364.0,SMA1,YPL027W,upstream_gene_variant,9.0
EGA_00044,XVI,499659,G,A,1,2022,0.000495,3858|6,4680|1016|367|1940|4413,unidirectional_promoter,...,,,ERG10,YPL028W,downstream_gene_variant,367.0,SMA1,YPL027W,upstream_gene_variant,6.0


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERG_00003,V,237122,G,A,9,2022,0.004451,3,2382|1631|1338|448|2910,unidirectional_promoter,...,,SAH1,YER043C,upstream_gene_variant,3.0,ERG28,YER044C,downstream_gene_variant,448.0,2.0
ERG_00004,V,237136,T,C,26,2022,0.013000,17,2396|1645|1324|434|2896,unidirectional_promoter,...,,SAH1,YER043C,upstream_gene_variant,17.0,ERG28,YER044C,downstream_gene_variant,434.0,2.0
EGE_00008,V,237191,T,TG,661,2022,0.327000,72,2451|1700|1268|378|2840,unidirectional_promoter,...,,SAH1,YER043C,upstream_gene_variant,72.0,ERG28,YER044C,downstream_gene_variant,378.0,2.0
EGE_00009,V,237191,T,TA,6,2022,0.002967,72,2451|1700|1268|378|2840,unidirectional_promoter,...,,SAH1,YER043C,upstream_gene_variant,72.0,ERG28,YER044C,downstream_gene_variant,378.0,2.0
EGE_00010,V,237196,A,T,69,2022,0.034000,77,2456|1705|1264|374|2836,unidirectional_promoter,...,,SAH1,YER043C,upstream_gene_variant,77.0,ERG28,YER044C,downstream_gene_variant,374.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EGA_00039,XVI,499649,G,A,4,2022,0.001978,3868|16,4690|1026|357|1930|4403,unidirectional_promoter,...,,ERG10,YPL028W,downstream_gene_variant,357.0,SMA1,YPL027W,upstream_gene_variant,16.0,2.0
EGA_00040,XVI,499650,G,C,5,2022,0.002473,3867|15,4689|1025|358|1931|4404,unidirectional_promoter,...,,ERG10,YPL028W,downstream_gene_variant,358.0,SMA1,YPL027W,upstream_gene_variant,15.0,2.0
EGA_00041,XVI,499655,G,A,4,2022,0.001978,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0,2.0
EGA_00042,XVI,499655,G,T,35,2022,0.017000,3862|10,4684|1020|363|1936|4409,unidirectional_promoter,...,,ERG10,YPL028W,downstream_gene_variant,363.0,SMA1,YPL027W,upstream_gene_variant,10.0,2.0


In [143]:
# save the ergosterol_oligos_final table as a tab-separated file;
# a separate script reads this file to organize the variants into pools
final_variants_path = "/home/users/rang/scratch/yeast/ergosterol/ergosterol_variants_final.txt"
ergosterol_oligos_final.to_csv(final_variants_path, sep='\t')

## Summary of ergosterol variants

In [144]:
### SUMMARY OF ERGOSTEROL SET ###
# compute the summary statistics first, then print them section by section
total_variants = len(ergosterol_oligos_final)
total_oligos = int(sum(ergosterol_oligos_final.num_of_oligos))
# variant counts per gene, restricted to the ergosterol gene IDs
variants_per_gene = ergosterol_oligos_final.Gene_ID.value_counts().filter(ergosterol_genes.index)
# fraction of variants in each annotation class
region_proportions = ergosterol_oligos_final.region.value_counts()/total_variants

print("Number of variants:", total_variants)
print("Number of oligos:", total_oligos)
print()
print("Distribution of variants among ergosterol genes")
print(variants_per_gene.describe())
print()
print("Distribution of oligos among ergosterol variants")
print(ergosterol_oligos_final.num_of_oligos.describe())
print()
print("Proportion of variant annotation classes")
print(region_proportions)


Number of variants: 1580
Number of oligos: 3968

Distribution of variants among ergosterol genes
count     26.000000
mean      49.384615
std       26.023185
min       19.000000
25%       32.250000
50%       42.000000
75%       57.000000
max      114.000000
Name: Gene_ID, dtype: float64

Distribution of oligos among ergosterol variants
count    1580.000000
mean        2.511392
std         0.858882
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: num_of_oligos, dtype: float64

Proportion of variant annotation classes
synonymous_variant         0.372152
missense_variant           0.236076
unidirectional_promoter    0.183544
bidirectional_promoter     0.149367
intergenic                 0.053165
frameshift_variant         0.002532
stop_gained                0.001899
inframe_deletion           0.001266
Name: region, dtype: float64


## Determine number of pools to use for ergosterol oligos
If a variant set has too few oligos to form its own pool, we can group it with another variant set with similar editable genomes
- ERG --> assign excess to EGE/EGD/EGB
- EGE, EGD --> assign excess to EGC 
- EGC, EGB, EGA, EG9 --> assign excess to EG8


In [145]:
# total OLIGOS per variant prefix set (prefix = first three characters of the variant ID), largest first
oligos_by_prefix = ergosterol_oligos_final.groupby(lambda var_id: var_id[:3]).num_of_oligos.sum()
display(oligos_by_prefix.sort_values(ascending=False))

# pool size constraints
complete_pool_size = 121 # maximum oligos synthesized per pool
min_pool_size = 109 # allows 12 technical oligos
max_pool_size = 118 # allows 3 technical oligos

# total number of oligos that must be distributed across pools
num_of_oligos = int(ergosterol_oligos_final.num_of_oligos.sum())

# find number of pools to use (kept as the last expression so the notebook shows the returned value)
find_pool_size_options(num_of_oligos, min_pool_size, max_pool_size, complete_pool_size)

ERG    2479.0
EGE     469.0
EG8     253.0
EGB     209.0
EG9     196.0
EGC     193.0
EGD     134.0
EGA      35.0
Name: num_of_oligos, dtype: float64

Total number of oligos to fit: 3968


(33, 36)

Use 33 pools:
Number of oligos per pool: 118
Number of oligos leftover: 74
Number of technical oligos: 99

Use 34 pools:
Number of oligos per pool: 116
Number of oligos leftover: 24
Number of technical oligos: 170

Use 35 pools:
Number of oligos per pool: 113
Number of oligos leftover: 13
Number of technical oligos: 280

Use 36 pools:
Number of oligos per pool: 110
Number of oligos leftover: 8
Number of technical oligos: 396



array([33, 34, 35, 36])

### Variant ID prefix and which genomes they can be edited in:
- ERG: BY | RM | YJM | YPS
- EGE: BY | RM | YJM | ---
- EGD: BY | RM | --- | YPS
- EGC: BY | RM | --- | ---
- EGB: BY | --- | YJM | YPS
- EGA: BY | --- | YJM | ---
- EG9: BY | --- | --- | YPS
- EG8: BY | --- | --- | ---


### We should be able to fit all ergosterol variants in 34 pools
- ERG: 21 pools (assign spillover to EGE)
- EGE: 4 pools (assign spillover to EGC)
- EGD: 1 pool (assign spillover to EGC)
- EGC: 2 pools (assign spillover to EG8)
- EGB: 1 pool (assign spillover to EGA)
- EGA: 1 pool (assign spillover to EG8)
- EG9: 1 pool (assign spillover to EG8)
- EG8: 3 pools

### Based on the aforementioned oligo arrangement into pools, each yeast strain will be assayed by the following pools:
- BY: 34 pools (all pools)
- RM: 28 pools (21 ERG, 4 EGE, 1 EGD, 2 EGC pools)
- YJM: 27 pools (21 ERG, 4 EGE, 1 EGB, 1 EGA pools)
- YPS: 24 pools (21 ERG, 1 EGD, 1 EGB, 1 EG9 pools)

# Selecting variants in genes with high genetic interactions
Survey natural variation in "hub" genes to identify GxG effects

In [189]:
# directory holding the genetic interaction files; it becomes the current working directory
working_dir = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/"
os.chdir(working_dir)
# summary of genetic interactions output file
genetic_interactions_summary_file = os.path.join(working_dir, 'genetic_interactions_summary_by_gene.txt')


In [190]:
# examine top genes with most genetic interactions
genes_cutoff = 800

# load the annotated per-gene interaction summary, dropping rows with any missing value
annotated_summary_file = genetic_interactions_summary_file.replace('.txt', '_annotated.txt')
num_of_interactions_df = (
    pd.read_csv(annotated_summary_file, sep='\t', header = 0, index_col=0)
    .dropna()
    .astype({'start':'int', 'end':'int'})
)

# remove dubious orfs
dubious_orfs_file = working_dir+'dubious_orfs.tsv'
dubious_orfs = pd.read_csv(dubious_orfs_file, sep='\t', header=None, names=['gene_id', 'description'])
num_of_interactions_df = num_of_interactions_df.query('~gene_id.isin(@dubious_orfs.gene_id)')


# remove genes in excluded list
genes_to_exclude = ['YDL227C', # HO
                    'YOR202W', # HIS3
                    'YEL021W', # URA3
                    'YCL018W', # LEU2
                    'YBR115C', # LYS2
                    'YBR020W', # GAL1
                    'YDR009W', # GAL3
                    'YPL248C', # GAL4
                    'YBR018C', # GAL7
                    'YBR019C', # GAL10
                    'YML051W', # GAL80
                    'YLR256W'] # HAP1
# exclude ergosterol pathway genes that already have oligos synthesized in separate set
genes_to_exclude += ergosterol_genes.index.tolist()
num_of_interactions_df = num_of_interactions_df.query('~gene_id.isin(@genes_to_exclude)')


# find top genes with most genetic interactions
# (rank genes by positive + negative interaction counts, then keep the first genes_cutoff rows)
interaction_totals = num_of_interactions_df[['positive_interactions', 'negative_interactions']].sum(axis=1)
ranked_gene_ids = interaction_totals.sort_values(ascending=False).index
interaction_rich_genes = num_of_interactions_df.loc[ranked_gene_ids].head(genes_cutoff)

# annotate genes by essentiality
essential_genes_file = working_dir+"S288C_essential_genes.txt"
essential_genes_df = pd.read_csv(essential_genes_file, sep='\t')
is_essential = interaction_rich_genes.index.isin(essential_genes_df['Gene Systematic Name'])
interaction_rich_genes.loc[:,'essential'] = is_essential

# annotate genes by paralogy
paralogs_file = working_dir+"S288C_paralogs.txt"
paralogs_df = pd.read_csv(paralogs_file, sep='\t', index_col='Byrne')
# stack both members of each paralog pair into one gene-indexed table of KA values
gene1_ka = paralogs_df[['Gene 1', 'KA']].rename(columns={'Gene 1':'Gene'})
gene2_ka = paralogs_df[['Gene 2', 'KA']].rename(columns={'Gene 2':'Gene'})
paralogs_ka_list = pd.concat([gene1_ka, gene2_ka], axis=0, ignore_index=True).set_index('Gene')
interaction_rich_genes.loc[:,'paralog'] = interaction_rich_genes.index.isin(paralogs_ka_list.index)
# copy the KA value over for genes flagged as paralogs; other genes keep a missing KA
paralog_gene_ids = interaction_rich_genes.index[interaction_rich_genes['paralog']]
interaction_rich_genes.loc[interaction_rich_genes['paralog'],'KA'] = paralogs_ka_list.loc[paralog_gene_ids, 'KA']

display(interaction_rich_genes)


Unnamed: 0_level_0,positive_interactions,negative_interactions,chrom,start,end,strand,gene_name,description,essential,paralog,KA
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
YFL039C,1324,2220,chrVI,53260,54696,-,ACT1,"Actin, structural protein involved in cell pol...",True,False,
YFL034C-B,1150,1897,chrVI,63016,63993,-,MOB2,"Component of the RAM signaling network, that a...",True,False,
YJR076C,749,1342,chrX,575354,576601,-,CDC11,Component of the septin ring of the mother-bud...,True,False,
YBL105C,841,1139,chrII,14241,17696,-,PKC1,Protein serine/threonine kinase essential for ...,True,False,
YER157W,925,967,chrV,484788,487193,+,COG3,Essential component of the conserved oligomeri...,True,False,
...,...,...,...,...,...,...,...,...,...,...,...
YLR038C,178,352,chrXII,224921,225172,-,COX12,"Subunit VIb of cytochrome c oxidase, which is ...",False,False,
YOR038C,200,330,chrXV,402761,405388,-,HIR2,"Subunit of the HIR complex, a nucleosome assem...",False,False,
YHR200W,200,330,chrVIII,499079,499885,+,RPN10,Non-ATPase base subunit of the 19S regulatory ...,False,False,
YOR083W,223,306,chrXV,479533,480420,+,WHI5,Repressor of G1 transcription that binds to SC...,False,True,1.484


In [10]:
# search hub genes for variants in each gvcf
# keys: chromosome names used in the gene table; values: contig names used by the per-chromosome gVCF files
chromosome_dict = {'chrI' : 'chromosome1',
                   'chrII': 'chromosome2',
                   'chrIII': 'chromosome3',
                   'chrIV': 'chromosome4',
                   'chrV': 'chromosome5',
                   'chrVI': 'chromosome6',
                   'chrVII': 'chromosome7',
                   'chrVIII': 'chromosome8',
                   'chrIX': 'chromosome9',
                   'chrX': 'chromosome10',
                   'chrXI': 'chromosome11',
                   'chrXII': 'chromosome12',
                   'chrXIII': 'chromosome13',
                   'chrXIV': 'chromosome14',
                   'chrXV': 'chromosome15',
                   'chrXVI': 'chromosome16',}

# build one task per chromosome: (genes on that chromosome, that chromosome's gVCF, name mapping)
genes_to_process = []
for chrom, contig_name in chromosome_dict.items():
    genes_df = interaction_rich_genes.query('chrom==@chrom').sort_values('start')
    gvcf_file = "/home/users/rang/share/yeast/1011genomes/by_chrom/{}.gvcf.gz".format(contig_name)
    genes_to_process.append((genes_df, gvcf_file, chromosome_dict))

# consolidate candidate variants
# (never start more workers than there are per-chromosome tasks)
num_of_cores = min(len(os.sched_getaffinity(0)), len(genes_to_process))
with mp.Pool(num_of_cores) as pool:
    results = pool.starmap(select_variants, genes_to_process)
results = pd.concat(results).reset_index(drop=True)

# write to new vcf file
gxg_variants_file = "gxg_variants.vcf"
# the first chromosome's gVCF supplies the header used as the output template
template = vcf.Reader(filename=genes_to_process[0][1])
# change chromosome naming convention to a single Roman numeral (follows ref genome, VEP formatting)
chromosome_dict_rev = {v: k for k, v in chromosome_dict.items()}
renamed_contigs = OrderedDict()
for old_name, contig in template.contigs.items():
    roman_name = chromosome_dict_rev[old_name][3:]  # drop the leading "chr"
    renamed_contigs[roman_name] = contig._replace(id=roman_name)
template.contigs = renamed_contigs
# remove 1011 genomes samples data
template.samples = []

# the context manager closes the output file even if writing a record fails
with open(gxg_variants_file, 'w') as vcf_handle:
    output = vcf.Writer(vcf_handle, template)
    for record in results:
        # update chromosome naming
        record.CHROM = chromosome_dict_rev[record.CHROM][3:]
        # write to file
        output.write_record(record)


In [11]:
# path to genome fasta files
ref_genome_fasta = "/home/users/rang/yeast/genomes/Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa"
rm_genome_fasta = "/home/users/rang/yeast/genomes/RM11-1A_SGD_2015_JRIP00000000.fsa"
yjm_genome_fasta = "/home/users/rang/yeast/genomes/YJM789_Stanford_2007_AAFW02000000_highQuality31.fsa"
yps_genome_fasta = "/home/users/rang/yeast/genomes/YPS128.genome.fa"

# path to input and output VCF files
gxg_variants_file = "gxg_variants.vcf"
gxg_variants_select_file = gxg_variants_file.replace('.vcf', '_select.vcf')

# load genome files (each becomes a dict of sequence id -> SeqRecord)
ref_genome = SeqIO.to_dict(SeqIO.parse(ref_genome_fasta, "fasta"))
rm_genome = SeqIO.to_dict(SeqIO.parse(rm_genome_fasta, "fasta"))
yjm_genome = SeqIO.to_dict(SeqIO.parse(yjm_genome_fasta, "fasta"))
yps_genome = SeqIO.to_dict(SeqIO.parse(yps_genome_fasta, "fasta"))
genomes_list = [ref_genome, rm_genome, yjm_genome, yps_genome] # IMPORTANT: SET REFERENCE GENOME AS FIRST IN LIST


threshold = 10 # threshold value for searching NGGs
window = 61 # size of flanking sequences to search for unique hits

# number of CPU cores usable by this process; drives both the chunk size and the pool size
available_cores = len(os.sched_getaffinity(0))

# read in all candidate variants to filter
gxg_variants = vcf.Reader(filename=gxg_variants_file)
gxg_variants = list(gxg_variants)
# split variants into chunks for multiprocessing
# (+1 rounds the chunk size up, so at most `available_cores` chunks are produced)
n = len(gxg_variants)//available_cores + 1
gxg_variants = [gxg_variants[i:i+n] for i in range(0, len(gxg_variants), n)]

# find targetable variants (TGT info field indicates which genomes variant is targetable in)
to_process = [(rec_list, genomes_list, threshold, window) for rec_list in gxg_variants]
with mp.Pool(available_cores) as pool:
    result = pool.starmap(find_targetable_variants, to_process)

# write targetable variants to file
header = vcf.Reader(filename=gxg_variants_file)
# add "TGT" info field into VCF header
header_info = namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
header_info_tgt = header_info('TGT', '.', 'Integer', 'Targetability of variant in tested genomes', None, None)
header.infos['TGT'] = header_info_tgt
# write to file
gxg_variants_select = vcf.Writer(open(gxg_variants_select_file, 'w'), header)
# flatten the per-chunk result lists into a single list of records
to_write = [rec for rec_list in result for rec in rec_list]
try:
    for rec in to_write:
        gxg_variants_select.write_record(rec)
finally:
    # release the output file even if a record fails to write
    gxg_variants_select.close()


Variant at chrom IV, pos 434292 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 434294 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 434307 skipped: No unique matches found in tested genome(s)
Variant at chrom XIV, pos 574011 skipped: No unique matches found in tested genome(s)
Variant at chrom XIV, pos 574012 skipped: No unique matches found in tested genome(s)
Variant at chrom XIV, pos 574018 skipped: No unique matches found in tested genome(s)
Variant at chrom V, pos 86643 skipped: No unique matches found in tested genome(s)
Variant at chrom V, pos 86648 skipped: No unique matches found in tested genome(s)
Variant at chrom XI, pos 162514 skipped: No unique matches found in tested genome(s)
Variant at chrom VI, pos 117522 skipped: No unique matches found in tested genome(s)
Variant at chrom VI, pos 117533 skipped: No unique matches found in tested genome(s)
Variant at chrom VI, pos 117538 skipped: No unique matches found i

Variant at chrom IV, pos 2033 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2035 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2036 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2040 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2041 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2043 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2044 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2045 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2046 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2047 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2049 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 2052 skipped: No unique matches found in tested genome(s)
Vari

Variant at chrom XII, pos 941425 skipped: No unique matches found in tested genome(s)
Variant at chrom XII, pos 941431 skipped: No unique matches found in tested genome(s)
Variant at chrom XII, pos 941477 skipped: No unique matches found in tested genome(s)
Variant at chrom XII, pos 941478 skipped: No unique matches found in tested genome(s)
Variant at chrom XII, pos 941479 skipped: No unique matches found in tested genome(s)
Variant at chrom XII, pos 941481 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 543689 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 543691 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 543697 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 543698 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 543704 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 1150870 skipped: No unique matches foun

Variant at chrom IV, pos 759494 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759495 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759500 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759512 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759524 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759526 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759632 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759633 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 759638 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 153588 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 153741 skipped: No unique matches found in tested genome(s)
Variant at chrom X, pos 153885 skipped: No unique matches found in 

In [12]:
# select variants by allele frequency (note difference from ergosterol variant selection step)
### remove singletons and doubletons
### if multiple alternate alleles, select most common alternate allele
### assign unique variant ID prefix and number (use format VAR_XXXXX)

# candidate variants carrying the TGT info field; the TGT check below keeps only
# those flagged as targetable in every tested genome
gxg_variants_select = vcf.Reader(filename=gxg_variants_select_file)

# open output file
gxg_variants_select_AFfilter_file = gxg_variants_select_file.replace('.vcf', '_AFfilter.vcf')
gxg_variants_select_AFfilter = vcf.Writer(open(gxg_variants_select_AFfilter_file, 'w'), gxg_variants_select)

# running count of written variants per ID prefix; used to number the variant IDs
var_counter = {}
try:
    for record in gxg_variants_select:
        # skip variant if all possible alternate allele count is less than 3
        if all([ac<3 for ac in record.INFO['AC']]):
            continue

        # at least one alternate allele meets minimum count: select most common alternate allele to study
        i = np.argmax(record.INFO['AC']) # most common alt allele
        # update record entry so that every per-allele field describes only the selected allele
        record.ALT = [record.ALT[i]]
        record.INFO['AC'] = [record.INFO['AC'][i]]
        record.INFO['AF'] = [record.INFO['AF'][i]]
        record.INFO['MLEAC'] = [record.INFO['MLEAC'][i]]
        record.INFO['MLEAF'] = [record.INFO['MLEAF'][i]]

        # inspect TGT field and assign variant ID prefix
        ### if tgt field contains all 1 (i.e. targetable by all queried genomes), assign "GXG"
        ### otherwise, skip variant
        if not all([tgt==1 for tgt in record.INFO['TGT']]):
#             print('variant does not pass TGT check: skipping')
            continue
#             # assign unique prefix "GG-" with the last digit determined by converting binary to hexadecimal (works well for up to 4 genomes)
#             binary_str = ''.join([str(x) for x in record.INFO['TGT']])
#             hex_digit = hex(int(binary_str, 2))[2:].upper()
#             var_id_prefix = "GG{}".format(hex_digit)
        var_id_prefix = "GXG"

        # add to counter
        var_counter[var_id_prefix] = var_counter.get(var_id_prefix, 0) + 1

        # add unique variant ID (follow format VAR_XXXXX)
        record.ID='{}_{:05d}'.format(var_id_prefix, var_counter[var_id_prefix])

        # write to output
        gxg_variants_select_AFfilter.write_record(record)
finally:
    # release the output file even if a record is missing an expected INFO field
    gxg_variants_select_AFfilter.close()


### Variant ID prefix and which genomes they can be edited in:
- GXG: BY | RM | YJM | YPS
- GGE: BY | RM | YJM | ---
- GGD: BY | RM | --- | YPS
- GGC: BY | RM | --- | ---
- GGB: BY | --- | YJM | YPS
- GGA: BY | --- | YJM | ---
- GG9: BY | --- | --- | YPS
- GG8: BY | --- | --- | ---

Only variants targetable in all 4 yeast strains (i.e. those with the GXG prefix) will be included in this library.

## Submit gxg_variants_select_AFfilter VCF to VEP to get annotations, and download VEP output in TXT format
http://uswest.ensembl.org/Saccharomyces_cerevisiae/Tools/VEP

In [14]:
# add annotations to variants and remove variants that fall in genes not listed in interaction_rich_genes
gxg_variants_file = "gxg_variants.vcf"
gxg_variants_select_file = gxg_variants_file.replace('.vcf', '_select.vcf')

# get VEP annotations and remove variants that fall within genes not in approved list
# (all file names below are derived from the AF-filtered VCF name)
gxg_variants_select_AFfilter_file = gxg_variants_select_file.replace('.vcf', '_AFfilter.vcf')
vep_output_filename = gxg_variants_select_AFfilter_file.replace('.vcf', '_VEPoutput.txt')
variants_annotated_filename = vep_output_filename.replace('_VEPoutput.txt', '_annotated.txt')
variants_annotated_df = annotate_variants_by_VEPoutput(gxg_variants_select_AFfilter_file,
                                                       vep_output_filename,
                                                       gff_file,
                                                       variants_annotated_filename,
                                                       id_colname='var_id')

# annotated file can be read in directly if annotation function has been run once before
variants_annotated_df = pd.read_csv(variants_annotated_filename, sep='\t', index_col=0)
display(variants_annotated_df)

# remove variants that fall within genes, but are not in the approved genes list
# (variants in promoter/intergenic regions are kept regardless of Gene_ID)
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
gxg_design_oligos = variants_annotated_df.query('(Gene_ID.isin(@interaction_rich_genes.index) | region.isin(@intergenic_regions_list))').copy()

##### (step not required if variants pre-filtered) ADDITIONALLY, since this set is focused on GxG, only include variants editable in at least 3 out of 4 genomes #####
# gxg_design_oligos = gxg_design_oligos.filter(regex='GXG|GGE|GGD|GGB', axis=0)

display(gxg_design_oligos)
# count variants according to which strains they can be edited in
# (the first three characters of the variant ID are its prefix)
display(pd.Series([x[:3] for x in gxg_design_oligos.index]).value_counts())


# write variants to gxg_design_oligos VCF as a preliminary list to design oligos for
gxg_design_oligos_vcf_file = gxg_variants_file.replace('.vcf', '_design_oligos_initial.vcf')
gxg_variants_select_AFfilter = vcf.Reader(filename=gxg_variants_select_AFfilter_file)
gxg_design_oligos_vcf = vcf.Writer(open(gxg_design_oligos_vcf_file, 'w'), gxg_variants_select_AFfilter)
try:
    for record in gxg_variants_select_AFfilter:
        # keep only records whose ID survived the gene/region filter above
        if record.ID in gxg_design_oligos.index:
            gxg_design_oligos_vcf.write_record(record)
finally:
    # release the output file even if a record fails to write
    gxg_design_oligos_vcf.close()


parsing entry: 5000
parsing entry: 10000
parsing entry: 15000
parsing entry: 20000
parsing entry: 25000
parsing entry: 30000
parsing entry: 35000
parsing entry: 40000
parsing entry: 45000
parsing entry: 50000


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00001,I,41691,C,T,12,2022,0.005935,4208|486,1190|2719|4544,synonymous_variant,...,tcC/tcT,PANTHER:PTHR23244,,,,,,,,
GXG_00002,I,41705,A,C,4,2022,0.001978,4194|472,1176|2733|4558,missense_variant,...,gAt/gCt,PANTHER:PTHR23244,,,,,,,,
GXG_00003,I,41847,T,C,4,2022,0.001978,4052|330,1034|2875|4700,synonymous_variant,...,taT/taC,,,,,,,,,
GXG_00004,I,41850,A,G,45,2022,0.022000,4049|327,1031|2878|4703,synonymous_variant,...,ttA/ttG,,,,,,,,,
GXG_00005,I,42019,A,C,61,2022,0.030000,3880|158,862|118|3047|4872,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,118.0,PEX22,YAL055W,upstream_gene_variant,158.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50559,XVI,920406,C,A,4,2018,0.001982,1365|1454,3444|2503|3898,synonymous_variant,...,gcC/gcA,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50560,XVI,920425,G,A,18,2018,0.008920,1384|1435,3463|2484|3879,missense_variant,...,Gga/Aga,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50561,XVI,920442,A,G,90,2016,0.045000,1401|1418,3480|2467|3862,synonymous_variant,...,gtA/gtG,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50562,XVI,920446,G,A,16,2018,0.007929,1405|1414,3484|2463|3858,missense_variant,...,Gtc/Atc,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00005,I,42019,A,C,61,2022,0.030000,3880|158,862|118|3047|4872,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,118.0,PEX22,YAL055W,upstream_gene_variant,158.0
GXG_00006,I,42023,C,T,12,2020,0.005941,3876|154,858|122|3051|4876,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,122.0,PEX22,YAL055W,upstream_gene_variant,154.0
GXG_00007,I,42033,A,AT,60,2012,0.030000,3865|143,847|132|3061|4886,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,132.0,PEX22,YAL055W,upstream_gene_variant,143.0
GXG_00008,I,42036,TTGG,CTGG,6,2016,0.002976,3860|138,842|135|3064|4889,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,135.0,PEX22,YAL055W,upstream_gene_variant,138.0
GXG_00009,I,42037,TGG,T,30,2022,0.015000,3860|138,842|137|3066|4891,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,137.0,PEX22,YAL055W,upstream_gene_variant,138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50559,XVI,920406,C,A,4,2018,0.001982,1365|1454,3444|2503|3898,synonymous_variant,...,gcC/gcA,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50560,XVI,920425,G,A,18,2018,0.008920,1384|1435,3463|2484|3879,missense_variant,...,Gga/Aga,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50561,XVI,920442,A,G,90,2016,0.045000,1401|1418,3480|2467|3862,synonymous_variant,...,gtA/gtG,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50562,XVI,920446,G,A,16,2018,0.007929,1405|1414,3484|2463|3858,missense_variant,...,Gtc/Atc,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,


GXG    44835
dtype: int64

## Design oligos for initial list, then filter down to variants with at least 2 oligos designed

In [192]:
# table of designed oligos; rows are grouped by their 'var_id' column below
gxg_oligo_file = os.path.expanduser("~/crispey3/initial_design/Output/all_SNPs_gxg_initial_GG_9bp_OLIGO.tab")
gxg_oligo_df = pd.read_csv(gxg_oligo_file, sep='\t')

gxg_design_oligos = pd.read_csv("/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/gxg_variants_select_AFfilter_annotated.txt", sep='\t', index_col=0)
# remove variants that fall within genes, but are not in the approved genes list
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
gxg_design_oligos = gxg_design_oligos.query('(Gene_ID.isin(@interaction_rich_genes.index) | region.isin(@intergenic_regions_list))').copy()
display(gxg_design_oligos)

# number of oligo rows per variant; assignment aligns on the variant ID index,
# so variants with no designed oligo get NaN and are dropped by the filter below
gxg_design_oligos['num_of_oligos'] = gxg_oligo_df.groupby('var_id').size()
# select variants that have at least 2 oligos designed
gxg_oligos_final = gxg_design_oligos.query('num_of_oligos>1').copy()
# select only variants targetable in all 4 yeast strains
gxg_oligos_final = gxg_oligos_final.filter(regex='GXG', axis=0)

display(gxg_oligos_final)


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00005,I,42019,A,C,61,2022,0.030000,3880|158,862|118|3047|4872,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,118.0,PEX22,YAL055W,upstream_gene_variant,158.0
GXG_00006,I,42023,C,T,12,2020,0.005941,3876|154,858|122|3051|4876,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,122.0,PEX22,YAL055W,upstream_gene_variant,154.0
GXG_00007,I,42033,A,AT,60,2012,0.030000,3865|143,847|132|3061|4886,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,132.0,PEX22,YAL055W,upstream_gene_variant,143.0
GXG_00008,I,42036,TTGG,CTGG,6,2016,0.002976,3860|138,842|135|3064|4889,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,135.0,PEX22,YAL055W,upstream_gene_variant,138.0
GXG_00009,I,42037,TGG,T,30,2022,0.015000,3860|138,842|137|3066|4891,unidirectional_promoter,...,,,GPB2,YAL056W,downstream_gene_variant,137.0,PEX22,YAL055W,upstream_gene_variant,138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50559,XVI,920406,C,A,4,2018,0.001982,1365|1454,3444|2503|3898,synonymous_variant,...,gcC/gcA,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50560,XVI,920425,G,A,18,2018,0.008920,1384|1435,3463|2484|3879,missense_variant,...,Gga/Aga,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50561,XVI,920442,A,G,90,2016,0.045000,1401|1418,3480|2467|3862,synonymous_variant,...,gtA/gtG,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,
GXG_50562,XVI,920446,G,A,16,2018,0.007929,1405|1414,3484|2463|3858,missense_variant,...,Gtc/Atc,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00006,I,42023,C,T,12,2020,0.005941,3876|154,858|122|3051|4876,unidirectional_promoter,...,,GPB2,YAL056W,downstream_gene_variant,122.0,PEX22,YAL055W,upstream_gene_variant,154.0,2.0
GXG_00007,I,42033,A,AT,60,2012,0.030000,3865|143,847|132|3061|4886,unidirectional_promoter,...,,GPB2,YAL056W,downstream_gene_variant,132.0,PEX22,YAL055W,upstream_gene_variant,143.0,2.0
GXG_00012,I,42184,C,T,5,2022,0.002473,3715,697|283|3212,missense_variant,...,,,,,,,,,,2.0
GXG_00013,I,42186,C,T,59,2022,0.029000,3713,695|285|3214,missense_variant,...,,,,,,,,,,3.0
GXG_00014,I,42452,C,T,8,2022,0.003956,3447,429|551|3480,synonymous_variant,...,"Pfam:PF12827,Gene3D:3.40.50.11730",,,,,,,,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50554,XVI,920332,A,G,29,2014,0.014000,1291|1528,3370|2577|3972,missense_variant,...,"PANTHER:PTHR11851:SF191,PANTHER:PTHR11851,Gene...",,,,,,,,,3.0
GXG_50556,XVI,920361,C,T,6,2018,0.002973,1320|1499,3399|2548|3943,synonymous_variant,...,"PANTHER:PTHR11851:SF191,PANTHER:PTHR11851,Gene...",,,,,,,,,2.0
GXG_50558,XVI,920384,C,A,4,2018,0.001982,1343|1476,3422|2525|3920,missense_variant,...,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,,2.0
GXG_50559,XVI,920406,C,A,4,2018,0.001982,1365|1454,3444|2503|3898,synonymous_variant,...,"Gene3D:3.30.830.10,Superfamily:SSF63411",,,,,,,,,2.0


## From this list, I will select the final set of 1000 common and 1000 rare variants (~5k oligos) based on the following criteria:
1. Determine max number of oligos allowable in library (e.g. 5000 oligos)
2. Estimate number of variants allowed (5000/2.5 = 2000 variants)
3. Set AF cutoff for what variants classify as "common" (e.g. 0.02, which is ~40 allele counts in pool of 2000).
4. Rank genes by number of common variants, followed by number of rare variants (so among genes with an equal number of common variants, we select genes which have "ultra-rare" variants to sample).
5. For each gene, sort the variants by allele frequency, then allele count, in descending order. Select and count the common variants that meet the allele frequency cutoff. Then select an equal number of the rarest variants (those at the bottom of the sorted list) to match the common variants.
6. Consolidate selected variants and associated genes, check final number of genes, variants and oligos to ensure they fit library specs

In [193]:
# create a dataframe of hub genes with annotations and number of variants (common, rare)

left_limit=500 # distance from left of gene to search
right_limit=500 # distance from right of gene to search

genes_df = interaction_rich_genes.copy()
# drop the "chr" prefix so chromosome names can be compared with the CHROM column of the variant table
genes_df.loc[:,'chrom'] = genes_df['chrom'].str.replace('chr', '')
genes_df.loc[:,'left_limit'] = genes_df['start'] - left_limit
genes_df.loc[:,'right_limit'] = genes_df['end'] + right_limit
genes_df.loc[:, 'region_length'] = genes_df['right_limit'] - genes_df['left_limit'] + 1

# associate each variant with genes of interest if it falls within gene limits
# (each variant gets a list of gene IDs, which may be empty or hold several genes)
gxg_oligos_final.loc[:,'assoc_gene'] = gxg_oligos_final.apply(lambda x: genes_df.query('chrom==@x.CHROM & left_limit<=@x.POS & right_limit>=@x.POS').index.tolist(), axis=1)
# remove variants without gene associations
# (.copy() so that the .loc assignment below modifies an independent frame, not a slice)
gxg_oligos_final = gxg_oligos_final[gxg_oligos_final.assoc_gene.apply(len)>0].copy()
# for coding variants with multiple associations (i.e. genes of interest adjacent to each other),
# adjust to associate only gene whose coding region contains variant
selection = (gxg_oligos_final.assoc_gene.apply(len)>1) & (~gxg_oligos_final.region.isin(intergenic_regions_list))
gxg_oligos_final.loc[selection, 'assoc_gene'] = gxg_oligos_final.loc[selection, 'Gene_ID'].apply(lambda x: [x])

# # finally, exclude remaining variants that are associated with more than one gene (should be only noncoding variants)
# gxg_oligos_final = gxg_oligos_final[gxg_oligos_final['assoc_gene'].apply(lambda x: len(x)==1)]

# count the number of variants associated with each gene
af_cutoff_common=0.02
af_cutoff_rare=0.002
# the AF filters do not depend on the gene, so apply them once instead of once per gene
assoc_all = gxg_oligos_final['assoc_gene']
assoc_common = gxg_oligos_final.query('AF>=@af_cutoff_common')['assoc_gene']
assoc_rare = gxg_oligos_final.query('AF<=@af_cutoff_rare')['assoc_gene']
genes_df.loc[:, 'num_of_vars'] = [assoc_all.apply(lambda x: gene in x).sum() for gene in genes_df.index]
genes_df.loc[:, 'num_of_vars_common'] = [assoc_common.apply(lambda x: gene in x).sum() for gene in genes_df.index]
genes_df.loc[:, 'num_of_vars_rare'] = [assoc_rare.apply(lambda x: gene in x).sum() for gene in genes_df.index]

# estimate density of variants (num of variants per 100bp) surveyed in each gene_region
genes_df.loc[:, 'var_density'] = genes_df['num_of_vars'] / genes_df['region_length'] * 100

# sort by number of common variants, breaking ties by number of rare variants
genes_df = genes_df.sort_values(['num_of_vars_common', 'num_of_vars_rare'], ascending=False)

display(genes_df)

Unnamed: 0_level_0,positive_interactions,negative_interactions,chrom,start,end,strand,gene_name,description,essential,paralog,KA,left_limit,right_limit,region_length,num_of_vars,num_of_vars_common,num_of_vars_rare,var_density
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
YNL262W,252,319,XIV,148212,154880,+,POL2,Catalytic subunit of DNA polymerase (II) epsil...,True,False,,147712,155380,7669,99,24,15,1.290911
YPL231W,348,548,XVI,108652,114315,+,FAS2,"Alpha subunit of fatty acid synthetase, which ...",True,False,,108152,114815,6664,99,21,21,1.485594
YMR207C,242,298,XIII,677193,683564,-,HFA1,"Mitochondrial acetyl-coenzyme A carboxylase, c...",False,True,0.366,676693,684064,7372,81,19,22,1.098752
YGR097W,159,370,VII,678695,682135,+,ASK10,"Component of RNA polymerase II holoenzyme, pho...",False,True,0.561,678195,682635,4441,52,19,12,1.170907
YFL033C,276,396,VI,69115,74427,-,RIM15,Glucose-repressible protein kinase involved in...,False,False,,68615,74927,6313,61,18,12,0.966260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YDL248W,300,344,IV,1802,2953,+,COS7,"Protein of unknown function, member of the DUP...",False,False,,1302,3453,2152,0,0,0,0.000000
YBR181C,305,338,II,591712,592774,-,RPS6B,Protein component of the small (40S) ribosomal...,False,True,0.000,591212,593274,2063,0,0,0,0.000000
YIR005W,275,346,IX,364889,365335,+,IST3,"Component of the U2 snRNP, required for the fi...",False,False,,364389,365835,1447,2,0,0,0.138217
YMR256C,176,355,XIII,778945,779127,-,COX7,"Subunit VII of cytochrome c oxidase, which is ...",False,False,,778445,779627,1183,1,0,0,0.084531


In [194]:
# maximum number of oligos to design for: 5000
# let's aim for 2000 variants (1000 common, 1000 rare), in as few genes as possible
cutoff = 2000

var_list = []
gene_list = []
# genes that share selected variants with an already-processed gene; a set keeps
# membership tests cheap and avoids storing the same gene more than once
exclude = set()
for gene in genes_df.index:
    # skip blacklisted genes in exclusion list
    if gene in exclude:
        print(gene, 'is excluded')
        continue

    # variants associated with this gene, most frequent first
    df = gxg_oligos_final[gxg_oligos_final['assoc_gene'].apply(lambda x: gene in x)].sort_values(['AF', 'AC'], ascending=False)
    com_var = df.query('AF>=@af_cutoff_common').index.tolist()
    # check if number of common variants makes up >50% of all targetable variants in gene (unlikely, but possible)
    # if so, reduce common variants selected to 50% of all targetable variants in this gene -- ensures balanced representation of common vs rare alleles
    if len(com_var) > len(df)//2:
        print("Common variants ({}) are >50% of all variants ({}). Capping common variants to {}".format(len(com_var), len(df), len(df)//2))
        com_var = com_var[:len(df)//2]
    # df is sorted in descending order, so its tail holds the rarest variants;
    # take as many of them as there are common variants
    rar_var = df.tail(len(com_var)).index.tolist()
    com_rar_var = com_var + rar_var

    # check if selected variants are associated with genes other than the one being parsed
    # if so, add those other genes to the exclude list to reduce overlapping variants
    other_genes = {g for l in df.loc[com_rar_var, 'assoc_gene'] for g in l if g != gene}
    if other_genes:
        print(other_genes, "added to exclude list")
        exclude.update(other_genes)

    # add selected variants to var_list
    var_list += com_rar_var
    # add gene to gene_list
    gene_list.append(gene)

    # stop as soon as enough variants have been collected
    if len(var_list)>= cutoff:
        break

# number of genes selected
hub_genes = genes_df.loc[gene_list]
print('Number of hub genes selected:', len(hub_genes))

# select final list of variants that will be targeted
gxg_oligos_final = gxg_oligos_final.query('var_id.isin(@var_list)')
display(gxg_oligos_final)

# distribution of variant annotation types (as fractions of all selected variants)
display((gxg_oligos_final.region.value_counts()/len(gxg_oligos_final)))

# how many oligos are we expecting from this list of variants?
print("Number of oligos from gxg gene set:", int(gxg_oligos_final.num_of_oligos.sum()))


{'YLR086W'} added to exclude list
{'YOL005C'} added to exclude list
{'YDR126W'} added to exclude list
{'YCR094W'} added to exclude list
{'YJR060W'} added to exclude list
YJR060W is excluded
{'YPR162C'} added to exclude list
{'YLR086W'} added to exclude list
{'YCR034W'} added to exclude list
{'YBL008W'} added to exclude list
{'YDR335W'} added to exclude list
{'YFL037W', 'YFL039C'} added to exclude list
{'YML127W'} added to exclude list
{'YML071C'} added to exclude list
YDR335W is excluded
YML127W is excluded
{'YLL036C'} added to exclude list
{'YDL035C'} added to exclude list
YDL035C is excluded
{'YDL002C'} added to exclude list
{'YAL025C'} added to exclude list
YLL036C is excluded
{'YPR182W'} added to exclude list
YCR094W is excluded
Number of hub genes selected: 103


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos,assoc_gene
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXG_00321,I,101530,G,C,4,2022,0.001978,385|1833,1662|4742|35,unidirectional_promoter,...,MAK16,YAL025C,upstream_gene_variant,385.0,LTE1,YAL024C,downstream_gene_variant,35.0,3.0,"[YAL024C, YAL025C]"
GXG_00329,I,101816,C,T,4,2020,0.001980,671|2119,1948|4456,missense_variant,...,,,,,,,,,4.0,[YAL024C]
GXG_00332,I,101901,C,A,90,2022,0.045000,756|2204,2033|4371,synonymous_variant,...,,,,,,,,,2.0,[YAL024C]
GXG_00339,I,102261,C,T,78,2020,0.039000,1116|2564,2393|4011,synonymous_variant,...,,,,,,,,,2.0,[YAL024C]
GXG_00346,I,102699,G,A,310,2016,0.154000,1554|3002,2831|3573,synonymous_variant,...,,,,,,,,,3.0,[YAL024C]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXG_50437,XVI,899389,G,A,6,2018,0.002973,3625|805|1366|2655,2385,synonymous_variant,...,,,,,,,,,2.0,[YPR181C]
GXG_50440,XVI,899419,A,G,196,2014,0.097000,3655|775|1336|2625,2415,synonymous_variant,...,,,,,,,,,3.0,[YPR181C]
GXG_50446,XVI,899521,G,C,3,2018,0.001487,3757|673|1234|2523,2517,synonymous_variant,...,,,,,,,,,2.0,[YPR181C]
GXG_50447,XVI,899679,A,G,68,2010,0.034000,3915|12|515|1076|2365,2675,bidirectional_promoter,...,SEC23,YPR181C,upstream_gene_variant,12.0,SMX3,YPR182W,upstream_gene_variant,515.0,4.0,[YPR181C]


synonymous_variant         0.4395
missense_variant           0.3470
unidirectional_promoter    0.1020
bidirectional_promoter     0.0835
intergenic                 0.0175
inframe_insertion          0.0030
intron_variant             0.0030
inframe_deletion           0.0025
stop_gained                0.0010
frameshift_variant         0.0010
Name: region, dtype: float64

Number of oligos from gxg gene set: 4957


In [197]:
# Save the final GxG variant table as a tab-separated text file.
# A separate script reads this file to organize the oligos into pools.
gxg_oligos_final.to_csv(
    path_or_buf="/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/gxg_variants_final.txt",
    sep='\t',
)

In [199]:
# show the row(s) of interaction_rich_genes whose gene_name is LTE1
interaction_rich_genes.query('gene_name=="LTE1"')

Unnamed: 0_level_0,positive_interactions,negative_interactions,chrom,start,end,strand,gene_name,description,essential,paralog,KA
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
YAL024C,189,412,chrI,101565,105872,-,LTE1,Protein similar to GDP/GTP exchange factors bu...,False,False,


## Summary of GXG variants

In [195]:
### SUMMARY OF GXG SET ###
# every row of gxg_oligos_final is counted as one variant
print("Number of variants:", len(gxg_oligos_final))
# vectorized pandas sum (skips NaN) instead of the Python builtin sum() over the Series
print("Number of oligos:", int(gxg_oligos_final.num_of_oligos.sum()))
print()
print("Distribution of (common) variants among hub genes")
# print(gxg_oligos_final.Gene_ID.value_counts().filter(hub_genes.index).describe())
print(hub_genes.num_of_vars_common.describe())
print()
print("Distribution of oligos among GxG variants")
print(gxg_oligos_final.num_of_oligos.describe())
print()
print("Proportion of variant annotation classes")
# counts per 'region' class divided by the total number of rows
print(gxg_oligos_final.region.value_counts()/len(gxg_oligos_final))


Number of variants: 2000
Number of oligos: 4957

Distribution of (common) variants among hub genes
count    103.000000
mean       9.708738
std        3.285831
min        7.000000
25%        8.000000
50%        9.000000
75%       11.000000
max       24.000000
Name: num_of_vars_common, dtype: float64

Distribution of oligos among GxG variants
count    2000.000000
mean        2.478500
std         0.740143
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: num_of_oligos, dtype: float64

Proportion of variant annotation classes
synonymous_variant         0.4395
missense_variant           0.3470
unidirectional_promoter    0.1020
bidirectional_promoter     0.0835
intergenic                 0.0175
inframe_insertion          0.0030
intron_variant             0.0030
inframe_deletion           0.0025
stop_gained                0.0010
frameshift_variant         0.0010
Name: region, dtype: float64


## Determine number of pools to use for GxG oligos

In [196]:
# total OLIGOS per variant-ID prefix (first 3 characters of each index label), largest first
display(
    gxg_oligos_final
    .groupby(lambda var_label: var_label[:3])
    .num_of_oligos
    .sum()
    .sort_values(ascending=False)
)

# pool size limits used when choosing how many pools to order
complete_pool_size = 121 # maximum oligos synthesized per pool
max_pool_size = 118 # allows 3 technical oligos
min_pool_size = 109 # allows 12 technical oligos

# number of oligos that need to be distributed over the pools
num_of_oligos = int(gxg_oligos_final['num_of_oligos'].sum())

# list the pool-count options that satisfy these limits
find_pool_size_options(
    num_of_oligos,
    min_pool_size,
    max_pool_size,
    complete_pool_size,
)

GXG    4957.0
Name: num_of_oligos, dtype: float64

Total number of oligos to fit: 4957


(42, 45)

Use 42 pools:
Number of oligos per pool: 118
Number of oligos leftover: 1
Number of technical oligos: 126

Use 43 pools:
Number of oligos per pool: 115
Number of oligos leftover: 12
Number of technical oligos: 258

Use 44 pools:
Number of oligos per pool: 112
Number of oligos leftover: 29
Number of technical oligos: 396

Use 45 pools:
Number of oligos per pool: 110
Number of oligos leftover: 7
Number of technical oligos: 495



array([42, 43, 44, 45])

### Variant ID prefix and which genomes they can be edited in:
- GXG: BY | RM | YJM | YPS

### We should be able to fit GxG variants in 42 pools
- GXG: 42 pools, 117-118 oligos each - 4,946 oligos


# Selecting variants from previous CRISPEY screen with known effects
These oligos showed effects in the initial screen in 4 yeast strains and are good candidates for demonstrating GxG fitness effects (e.g. the GAT2 frameshift mutation).

In [127]:
# working directory for the known-effects variant selection; also made the current directory
working_dir = "/home/users/rang/scratch/yeast/epistasis/"
os.chdir(working_dir)

# oligo-variant info table (taken from CRISPEY library creation sheet)
oligo_var_path = os.path.join(working_dir, "library_oligo_to_variant_v2_withAnno.txt")

# VCF files of the epistasis set variants and of the promoter set variants
epi_vcf_file = '/home/users/rang/tmp/crispr/forLibDesign_170918.vcf'
prom_vcf_file = '/home/users/rang/tmp/crispr/forLibDesign_promoter625.vcf'

# final GxG variant set (checked against to avoid duplication)
gxg_oligos_final_file = "/home/users/rang/scratch/yeast/genetic_interactions/costanzo_2016/gxg_variants_final.txt"

In [128]:
# read the oligo-variant info table and move its original index into an 'oligo_num' column
oligo_info_df = (
    pd.read_csv(oligo_var_path, sep='\t', header=0, index_col=0, low_memory=False)
    .reset_index()
    .rename(columns={'oligo_id': 'oligo_num'})
)
# new row labels have the form "<oligo_num>#<set_name>"
oligo_info_df.index = pd.Index(
    oligo_info_df['oligo_num'].astype('str') + "#" + oligo_info_df['set_name'],
    name='oligo_id',
)

# one row per distinct variant annotation, labelled by the variant's SNP_id
var_info_df = (
    oligo_info_df[[
        'SNP_id', 'chrom', 'SNP_chr_pos', 'REF', 'ALT',
        'SNPEff_Annotation', 'SNPEff_Annotation_Impact',
        'SNPEff_Gene_Name', 'SNPEff_Gene_ID', 'SNPEff_region',
    ]]
    .drop_duplicates()
    .set_index('SNP_id')
    .rename_axis('var_id')
)
display(var_info_df)

Unnamed: 0_level_0,chrom,SNP_chr_pos,REF,ALT,SNPEff_Annotation,SNPEff_Annotation_Impact,SNPEff_Gene_Name,SNPEff_Gene_ID,SNPEff_region
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
RAR_13-C1,I,42183,C,[T],missense_variant,MODERATE,PEX22,YAL055W,missense_variant
RAR_13,I,42184,C,[T],missense_variant,MODERATE,PEX22,YAL055W,missense_variant
RAR_21-C1,I,50078,C,[T],synonymous_variant,LOW,OAF1,YAL051W,synonymous_variant
RAR_21,I,50081,C,[T],synonymous_variant,LOW,OAF1,YAL051W,synonymous_variant
COM_8,I,57455,C,[T],downstream_gene_variant,MODIFIER,,YAL045C,unidirectional promoter
...,...,...,...,...,...,...,...,...,...
PROM_1871,XII,819307,T,[C],upstream_gene_variant,MODIFIER,RPL26A,YLR344W,unidirectional promoter
PROM_1872,XII,819308,C,[T],upstream_gene_variant,MODIFIER,RPL26A,YLR344W,unidirectional promoter
PROM_1873,XII,819309,A,[G],upstream_gene_variant,MODIFIER,RPL26A,YLR344W,unidirectional promoter
PROM_1874,XII,819310,G,[A],upstream_gene_variant,MODIFIER,RPL26A,YLR344W,unidirectional promoter


### Select TDH3 promoter oligos targeting variants with existing data
These variants have data published by the Wittkopp lab and are useful for calibrating CRISPEY fitness readouts
- fitness effects
- 3 TFBS locations
- expression data

Ensure each variant is targeted by 2 or more oligos whose edit position in guide is between -9 and +2.

In [129]:
# load Wittkopp lab expression and fitness data on TDH3 promoter variants
# Wittkopp fitness data (its Position column is matched against relative_pos below)
wittkopp_fitness_df = pd.read_csv('tdh3_fitness_wittkopp.txt', sep='\t', index_col=0)

# Wittkopp expression data
wittkopp_expr_df = pd.read_csv('tdh3_expression_wittkopp.txt', sep='\t')
# keep variants that reduced expression by >2% (relative YFP mean below 0.98);
# the two columns are picked by position - POSITION must be one of them for the query below
wittkopp_expr_df = wittkopp_expr_df[wittkopp_expr_df['YFP.MEAN.RELATIVE']<0.98].iloc[:,[6,3]]


# get TDH3 oligos from previous screen
promoter_name = 'TDH3'
promoter_chrom = 'VII'
promoter_cds_start = 883810
# sign multiplier applied to relative_pos
# NOTE(review): annotated as "positive strand", yet the selected SNPs sit at coordinates above
# promoter_cds_start and therefore get negative relative_pos values - confirm the strand convention
promoter_strand = 1 # positive strand
tdh3_oligo_info = oligo_info_df.loc[(oligo_info_df.SNP_id.str.contains('PROM_')) & (oligo_info_df.chrom==promoter_chrom)].copy() #['SNP_id', 'chrom', 'SNP_chr_pos', 'REF', 'ALT', 'SNPEff_Gene_Name', 'SNPEff_Gene_ID', 'SNPEff_region', 'SNP_pos_in_guide', 'guide_strand']
tdh3_oligo_info['relative_pos'] = (promoter_cds_start - tdh3_oligo_info.SNP_chr_pos)*promoter_strand
# tdh3_oligo_info.rename(columns={'SNP_id':'var_id'}, inplace=True)


# filter out oligos where SNP_pos_in_guide is below -9
tdh3_oligo_info = tdh3_oligo_info.query('SNP_pos_in_guide>=-9')

# keep only variants targeted by 2 or more (remaining) oligos
tdh3_oligo_info = tdh3_oligo_info.groupby('SNP_id').filter(lambda x: len(x)>1)

# select oligos targeting variants in known TFBS and/or have fitness data and/or have >2% reduction in TDH3 expression
tdh3_oligo_info = tdh3_oligo_info.query('(relative_pos<=-441 & relative_pos>=-449) | \
                               (relative_pos<=-479 & relative_pos>=-487) | \
                               (relative_pos<=-502 & relative_pos>=-510) | \
                               (relative_pos.isin(@wittkopp_fitness_df.Position)) | \
                               (relative_pos.isin(@wittkopp_expr_df.POSITION))')
# relative_pos was only needed for the selection above
tdh3_oligo_info = tdh3_oligo_info.drop('relative_pos', axis=1)
display(tdh3_oligo_info)
display(tdh3_oligo_info.shape)

# retrieve variants from vcf file
prom_vcf = vcf.Reader(filename=prom_vcf_file)
# IDs of the selected variants, collected once so each record is checked with a set lookup
tdh3_snp_ids = set(tdh3_oligo_info.SNP_id)
tdh3_var_list = []
tdh_num=0
for record in prom_vcf:
    if record.ID not in tdh3_snp_ids:
        continue
    tdh_num+=1
    # append old record ID into INFO
    record.INFO['OVID'] = record.ID
    # add new variant ID to tdh3_oligo_info and rename record ID
    new_var_id = 'TDH_{:05d}'.format(tdh_num)
    tdh3_oligo_info.loc[tdh3_oligo_info.SNP_id==record.ID,'var_id'] = new_var_id
    record.ID = new_var_id
    # add VCF record to list
    tdh3_var_list.append(record)
tdh3_var_list = pd.Series(tdh3_var_list)

Unnamed: 0_level_0,oligo_num,oligo_seq,set_name,set_priority,chrom,SNP_chr_pos,REF,ALT,SNP_id,guide_chr_pos,...,SNPEff_CDS_pos,SNPEff_AA_pos,VEP_Amino_acids,VEP_Codons,VEP_HGVSc,VEP_HGVSp,VEP_SWISSPROT,VEP_UNIPARC,VEP_DOMAINS,VEP_BLOSUM62
oligo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28474#promoterMut,28474,GAGTTACTGTCTGTTTTCCTCAGAGAGAGCAGAAGGTAATAGAAGG...,promoterMut,5,VII,884068,C,[T],PROM_883,884073,...,,,,,,,P00359,UPI000004F91A,,
28475#promoterMut,28475,GAGTTACTGTCTGTTTTCCTAGAGAGAGCAGAAGGTAATAGAAGGT...,promoterMut,5,VII,884068,C,[T],PROM_883,884074,...,,,,,,,P00359,UPI000004F91A,,
28484#promoterMut,28484,GAGTTACTGTCTGTTTTCCTCAGAGAGAGCAGAAGGTAATAGAAGG...,promoterMut,5,VII,884073,G,[A],PROM_888,884073,...,,,,,,,P00359,UPI000004F91A,,
28485#promoterMut,28485,GAGTTACTGTCTGTTTTCCTAGAGAGAGCAGAAGGTAATAGAAGGT...,promoterMut,5,VII,884073,G,[A],PROM_888,884074,...,,,,,,,P00359,UPI000004F91A,,
28539#promoterMut,28539,GAGTTACTGTCTGTTTTCCTAAAATGAGATAGATACATGCGTGGGT...,promoterMut,5,VII,884105,C,[T],PROM_920,884105,...,,,,,,,P00359,UPI000004F91A,,
28540#promoterMut,28540,GAGTTACTGTCTGTTTTCCTTGAGATAGATACATGCGTGGGTCAAT...,promoterMut,5,VII,884105,C,[T],PROM_920,884109,...,,,,,,,P00359,UPI000004F91A,,
28541#promoterMut,28541,GAGTTACTGTCTGTTTTCCTGGCACAAACAGGCAAAAAACGGGCAC...,promoterMut,5,VII,884105,C,[T],PROM_920,884105,...,,,,,,,P00359,UPI000004F91A,,
28544#promoterMut,28544,GAGTTACTGTCTGTTTTCCTAAAATGAGATAGATACATGCGTGGGT...,promoterMut,5,VII,884107,G,[A],PROM_922,884105,...,,,,,,,P00359,UPI000004F91A,,
28545#promoterMut,28545,GAGTTACTGTCTGTTTTCCTTGAGATAGATACATGCGTGGGTCAAT...,promoterMut,5,VII,884107,G,[A],PROM_922,884109,...,,,,,,,P00359,UPI000004F91A,,
28546#promoterMut,28546,GAGTTACTGTCTGTTTTCCTGGCACAAACAGGCAAAAAACGGGCAC...,promoterMut,5,VII,884107,G,[A],PROM_922,884105,...,,,,,,,P00359,UPI000004F91A,,


(42, 42)

### Select variants from previous screen that show fitness effects (validation sets)
These variants are potentially useful for detecting GxG effects. Fitness effect was calculated from stacked barcode counts fitted into limma-voom (limma ran on 4 strains together, gen logFC estimated for 18.5k oligos meeting 10k read cutoff across 4 strains)

Selection criteria:
- Significant hit in at least 3 strains (adjusted p-value<0.05) - test set for measuring GxG
- Very large effect in at least one strain (abs(logFC)>0.0405), for variants not already in the first set - strain-specific effect validation

Ensure each variant is targeted by 2 or more oligos whose edit position in guide is between -9 and +2.

In [130]:
# strains with a limma result table
g_list = ['BY', 'RM', 'YJM', 'YPS']

# collect, per strain, the significant oligos of the epistasis main set
hits_df = []
for g in g_list:
    limma_output_file = working_dir + "limma/4-indiv-comp-oct2019/limma_results_{}_stacked_070820.txt".format(g)
    var_fitness_df = pd.read_csv(limma_output_file, sep='\t', index_col=0, header=0)
    var_fitness_df.index.name = 'oligo_id'
    # attach the oligo annotations and switch to shorter column names
    var_fitness_df = (
        var_fitness_df
        .merge(
            oligo_info_df[['set_name', 'SNP_id', 'chrom', 'SNP_chr_pos', 'SNP_pos_in_guide', 'guide_strand', 'SNPEff_region']],
            left_on='oligo_id',
            right_index=True,
            how='inner',
        )
        .sort_index()
        .rename(columns={'SNP_id':'var_id', 'P.Value':'pval', 'adj.P.Val':'padj'})
    )
    # hits: adjusted p-value below 0.05 and an oligo label containing the main set name;
    # logFC and padj columns get a strain suffix so the strains can sit side by side
    df = (
        var_fitness_df
        .query('padj<0.05 & oligo_id.str.contains("SNP_avoidExcludedSeqs")')
        .sort_values('logFC', ascending=True)
        [['var_id', 'logFC', 'padj']]
        .rename(columns={'logFC':'logFC_{}'.format(g), 'padj':'padj_{}'.format(g)})
        .reset_index()
    )
    hits_df.append(df.set_index(['oligo_id', 'var_id']))

# outer-join the per-strain tables on (oligo_id, var_id); var_id then becomes a regular column again
hits_df = pd.concat(hits_df, axis=1, join='outer').reset_index(level=1)

In [131]:
# Validation set 1: significant hits in at least 3 strains
# (a strain without a hit leaves 2 NaNs - its logFC and padj - so <=2 NaNs means at most one strain is missing)
set1 = hits_df[hits_df.isna().sum(axis=1)<=2]
# filter out oligos where SNP_pos_in_guide <= -10
sig_hits_set = oligo_info_df.query('SNP_id.isin(@set1.var_id) & SNP_pos_in_guide>-10').copy()
# keep only variants targeted by 2 or more oligos
sig_hits_set = sig_hits_set.groupby('SNP_id').filter(lambda x: len(x)>1)

# display(set1)
# display(sig_hits_set)

# Validation set 2: large effects observed in at least one strain NOT IN SET 1
# effect size cut off is arbitrary - picked to keep final number of oligos manageable
set2 = hits_df[(hits_df.filter(regex=("logFC_*")).abs()>0.0405).any(axis=1)].query('~var_id.isin(@set1.var_id)')
# filter out oligos where SNP_pos_in_guide <= -10
large_effect_set = oligo_info_df.query('SNP_id.isin(@set2.var_id) & SNP_pos_in_guide>-10').copy()
# keep only variants targeted by 2 or more oligos
large_effect_set = large_effect_set.groupby('SNP_id').filter(lambda x: len(x)>1)
# display(set2)
# display(large_effect_set)

# set 1 and set 2 oligos
epi_oligo_df = pd.concat([sig_hits_set, large_effect_set])


# read in gxg_oligos_final to check for dups
gxg_oligos_final = pd.read_csv(gxg_oligos_final_file, sep='\t', index_col=0)

# retrieve variants from vcf file
epi_vcf = vcf.Reader(filename=epi_vcf_file)
# IDs still present in epi_oligo_df; kept in sync with the removals below so that each
# record is checked with a set lookup instead of a scan over the SNP_id column
epi_snp_ids = set(epi_oligo_df.SNP_id)
epi_var_list = []
val_num=0
for record in epi_vcf:
    if record.ID not in epi_snp_ids:
        continue
    # add variant if it is not in main GxG set
    if gxg_oligos_final.query('CHROM==@record.CHROM & POS==@record.POS & ALT==@record.ALT[0]').empty:
        val_num+=1
        # append old record ID into INFO
        record.INFO['OVID'] = record.ID
        # add new variant ID to epi_oligo_df and rename record ID
        new_var_id = 'VAL_{:05d}'.format(val_num)
        epi_oligo_df.loc[epi_oligo_df.SNP_id==record.ID,'var_id'] = new_var_id
        record.ID = new_var_id
        # add VCF record to list
        epi_var_list.append(record)
    # skip variant if already in GxG set
    else:
        print('Skipping variant {}: already in GxG set'.format(record))
        # remove oligos targeting this variant; take an explicit copy so that later
        # .loc assignments do not write into a slice (source of SettingWithCopyWarning)
        epi_oligo_df = epi_oligo_df.loc[epi_oligo_df['SNP_id']!=record.ID,:].copy()
        epi_snp_ids.discard(record.ID)

epi_var_list = pd.Series(epi_var_list)

Skipping variant Record(CHROM=IV, POS=319482, REF=G, ALT=[A]): already in GxG set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Skipping variant Record(CHROM=V, POS=88751, REF=C, ALT=[T]): already in GxG set
Skipping variant Record(CHROM=VI, POS=74406, REF=C, ALT=[T]): already in GxG set
Skipping variant Record(CHROM=VII, POS=619256, REF=G, ALT=[A]): already in GxG set
Skipping variant Record(CHROM=IX, POS=112910, REF=C, ALT=[T]): already in GxG set
Skipping variant Record(CHROM=X, POS=519926, REF=G, ALT=[A]): already in GxG set
Skipping variant Record(CHROM=XIII, POS=682727, REF=C, ALT=[T]): already in GxG set


In [132]:
# combine selected TDH3 promoter oligos and epi set oligos
known_effects_oligo_df = pd.concat([tdh3_oligo_info, epi_oligo_df])
display(known_effects_oligo_df)

# write selected TDH3 promoter variants and epi set variants to VCF, run on oligo design pipeline
known_effects_var_list = pd.concat([tdh3_var_list, epi_var_list]).reset_index(drop=True)
known_effects_vcf_file = '/home/users/rang/scratch/yeast/epistasis/epival_variants_final.vcf'
# a fresh reader of the epi set VCF serves as the template for the output header
# NOTE(review): the records carry an added 'OVID' INFO key and no INFO definition is added to the
# template here - confirm the template header already declares it or that downstream tools accept this
epi_vcf = vcf.Reader(filename=epi_vcf_file)
# the context manager closes the output file even if writing a record fails
with open(known_effects_vcf_file, 'w') as known_effects_vcf_handle:
    known_effects_vcf = vcf.Writer(known_effects_vcf_handle, epi_vcf)
    for record in known_effects_var_list:
        known_effects_vcf.write_record(record)

Unnamed: 0_level_0,oligo_num,oligo_seq,set_name,set_priority,chrom,SNP_chr_pos,REF,ALT,SNP_id,guide_chr_pos,...,SNPEff_AA_pos,VEP_Amino_acids,VEP_Codons,VEP_HGVSc,VEP_HGVSp,VEP_SWISSPROT,VEP_UNIPARC,VEP_DOMAINS,VEP_BLOSUM62,var_id
oligo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28474#promoterMut,28474,GAGTTACTGTCTGTTTTCCTCAGAGAGAGCAGAAGGTAATAGAAGG...,promoterMut,5,VII,884068,C,[T],PROM_883,884073,...,,,,,,P00359,UPI000004F91A,,,TDH_00001
28475#promoterMut,28475,GAGTTACTGTCTGTTTTCCTAGAGAGAGCAGAAGGTAATAGAAGGT...,promoterMut,5,VII,884068,C,[T],PROM_883,884074,...,,,,,,P00359,UPI000004F91A,,,TDH_00001
28484#promoterMut,28484,GAGTTACTGTCTGTTTTCCTCAGAGAGAGCAGAAGGTAATAGAAGG...,promoterMut,5,VII,884073,G,[A],PROM_888,884073,...,,,,,,P00359,UPI000004F91A,,,TDH_00002
28485#promoterMut,28485,GAGTTACTGTCTGTTTTCCTAGAGAGAGCAGAAGGTAATAGAAGGT...,promoterMut,5,VII,884073,G,[A],PROM_888,884074,...,,,,,,P00359,UPI000004F91A,,,TDH_00002
28539#promoterMut,28539,GAGTTACTGTCTGTTTTCCTAAAATGAGATAGATACATGCGTGGGT...,promoterMut,5,VII,884105,C,[T],PROM_920,884105,...,,,,,,P00359,UPI000004F91A,,,TDH_00003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24211#SNP_avoidExcludedSeqs,24211,GAGTTACTGTCTGTTTTCCTGATGTCATCCTATAATTGTTTCCGTA...,SNP_avoidExcludedSeqs,0,XVI,563805,G,[A],RAR_14853-C1,563813,...,,,,,,P53394,UPI0000136210,,,VAL_00240
24791#SNP_avoidExcludedSeqs,24791,GAGTTACTGTCTGTTTTCCTTGTCATGATTTACTAAAGCTAACCAT...,SNP_avoidExcludedSeqs,0,XVI,820797,C,[T],COM_4811,820797,...,209,Q,caG/caA,YPR144C_mRNA.:c.627G>A,YPR144C.:p.Gln209%3D,Q06512,UPI0000053033,PANTHER:PTHR12455,,VAL_00242
24792#SNP_avoidExcludedSeqs,24792,GAGTTACTGTCTGTTTTCCTATGATTTACTAAAGCTAACCATTTAC...,SNP_avoidExcludedSeqs,0,XVI,820797,C,[T],COM_4811,820801,...,209,Q,caG/caA,YPR144C_mRNA.:c.627G>A,YPR144C.:p.Gln209%3D,Q06512,UPI0000053033,PANTHER:PTHR12455,,VAL_00242
24793#SNP_avoidExcludedSeqs,24793,GAGTTACTGTCTGTTTTCCTTTTACTAAAGCTAACCATTTACCAAC...,SNP_avoidExcludedSeqs,0,XVI,820797,C,[T],COM_4811,820805,...,209,Q,caG/caA,YPR144C_mRNA.:c.627G>A,YPR144C.:p.Gln209%3D,Q06512,UPI0000053033,PANTHER:PTHR12455,,VAL_00242


In [133]:
# load the oligos that were redesigned for the selected validation variants and show them
known_effects_oligo_updated_file = '/home/users/rang/crispey3/initial_design/Output/all_SNPs_epival_GG_9bp_OLIGO.tab'
known_effects_oligo_df = pd.read_csv(
    known_effects_oligo_updated_file,
    sep='\t',
    index_col=0,
)
display(known_effects_oligo_df)

# count OLIGOS in each set (rows whose index label contains the set prefix)
print("Number of TDH3 promoter oligos:", known_effects_oligo_df.filter(like='TDH', axis=0).shape[0])
print("Number of validation set oligos:", known_effects_oligo_df.filter(like='VAL', axis=0).shape[0])

# pool size limits used when choosing how many pools to order
complete_pool_size = 121 # maximum oligos synthesized per pool
max_pool_size = 118 # allows 3 technical oligos
min_pool_size = 109 # allows 12 technical oligos
# every row of the table is one oligo
num_of_oligos = known_effects_oligo_df.shape[0]

# list the pool-count options that satisfy these limits
find_pool_size_options(
    num_of_oligos,
    min_pool_size,
    max_pool_size,
    complete_pool_size,
)

# write known_effects_oligo_df to file for pooling assignment
known_effects_oligo_df.to_csv(
    '/home/users/rang/scratch/yeast/epistasis/epival_variants_final.txt',
    sep='\t',
)


Unnamed: 0_level_0,chrom,SNP_chr_pos,REF,ALT,guide_id,guide_strand,guide0_chr_pos,guide_cut_chr_pos,SNP_pos_in_guide,guide_index_in_snp,...,donor_seq_shift,donor_mut_pos_in_guide,donor_info_str,set_name,contains_excluded_seqs,pool,barcode_seq,barcode_id,oligo_seq,oligo_id
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TDH_00001,VII,884068,C,[T],guide_1,+,884073,884070,-6,1,...,14,-6,REF2ALT:C>T,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_1#guide_1:REF2ALT:offset14:donorI...
TDH_00001,VII,884068,C,[T],guide_2,+,884074,884071,-7,2,...,14,-7,REF2ALT:C>T,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_2#guide_2:REF2ALT:offset14:donorI...
TDH_00002,VII,884073,G,[A],guide_3,+,884073,884070,-1,1,...,14,-1,REF2ALT:G>A,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_3#guide_3:REF2ALT:offset14:donorI...
TDH_00002,VII,884073,G,[A],guide_4,+,884074,884071,-2,2,...,14,-2,REF2ALT:G>A,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_4#guide_4:REF2ALT:offset14:donorI...
TDH_00003,VII,884105,C,[T],guide_5,+,884105,884102,-1,1,...,14,-1,REF2ALT:C>T,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_5#guide_5:REF2ALT:offset14:donorI...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VAL_00243,XVI,868939,A,[G],guide_704,+,868941,868938,-3,1,...,14,-3,REF2ALT:A>G,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_704#guide_704:REF2ALT:offset14:do...
VAL_00243,XVI,868939,A,[G],guide_705,-,868936,868940,-2,2,...,-14,-2,REF2ALT:A>G,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_705#guide_705:REF2ALT:offset-14:d...
VAL_00243,XVI,868939,A,[G],guide_706,-,868931,868935,-7,3,...,-14,-7,REF2ALT:A>G,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_706#guide_706:REF2ALT:offset-14:d...
VAL_00243,XVI,868939,A,[G],guide_707,-,868930,868934,-8,4,...,-14,-8,REF2ALT:A>G,epival,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,epival#guide_707#guide_707:REF2ALT:offset-14:d...


Number of TDH3 promoter oligos: 42
Number of validation set oligos: 650
Total number of oligos to fit: 692


(5, 6)

Use 5 pools:
Number of oligos per pool: 118
Number of oligos leftover: 102
Number of technical oligos: 15

Use 6 pools:
Number of oligos per pool: 115
Number of oligos leftover: 2
Number of technical oligos: 36



### Variant ID prefix and which genomes they can be edited in:
- TDH: BY | --- | --- | ---
- VAL: BY | RM | YJM | YPS

### We should be able to fit known effects variants in 6 pools
- TDH, VAL: 6 pools


# Selecting variants for GxE screen
Shi-An provided a list of QTLs identified with condition-specific fitness effects. Search each QTL (and the gene(s) within it) for variants to screen in the GxE project.

In [179]:
def select_variants_from_qtls(qtls_df, vcf_file, chrom_dict, upstream_length=500, downstream_length=500):
    '''
    modified version of select_variants function that accepts a dataframe of QTLs
    (requires qtl_left and qtl_right columns defined, and assumes each QTL is associated with one gene)
    defines search boundaries for each QTL and then searches vcf for variants within it
    can be parallelized by splitting genes and vcf by chromosome

    parameters:
      qtls_df: dataframe with chrom, strand, start, end, qtl_left and qtl_right columns (left unmodified)
      vcf_file: path passed to vcf.Reader(filename=...); the reader's fetch() is used for region lookups
      chrom_dict: maps the chrom values of qtls_df to the contig names used in vcf_file
      upstream_length, downstream_length: bases added before / after the gene, relative to its strand

    returns a pandas Series of the de-duplicated VCF records found, with samples and FORMAT cleared
    '''
    # work on a private copy so the caller's dataframe does not gain helper columns
    qtls_df = qtls_df.copy()
    on_plus = qtls_df['strand'] == '+'
    on_minus = qtls_df['strand'] == '-'

    # find gene upstream and downstream boundaries ('-' strand genes have upstream on the right);
    # rows whose strand is neither '+' nor '-' get NaN here, so only their QTL boundary is used below
    qtls_df['gene_left'] = (qtls_df['start'] - upstream_length).where(
        on_plus, (qtls_df['start'] - downstream_length).where(on_minus))
    qtls_df['gene_right'] = (qtls_df['end'] + downstream_length).where(
        on_plus, (qtls_df['end'] + upstream_length).where(on_minus))

    # define search boundaries as the widest span covered by either the gene or the QTL;
    # plain column assignment is used so the integer dtype actually replaces the float column
    qtls_df['search_left'] = qtls_df[['gene_left', 'qtl_left']].min(axis=1).astype(int)
    qtls_df['search_right'] = qtls_df[['gene_right', 'qtl_right']].max(axis=1).astype(int)

    # from these search boundaries, remove duplicates
    search_boundaries_df = qtls_df[['chrom', 'search_left', 'search_right']].drop_duplicates()

    # read in vcf file to parse for variants
    allVars = vcf.Reader(filename=vcf_file)
    candVars = []
    for region in search_boundaries_df.itertuples(index=False):
        # shift the left boundary by one for the fetch call and never let it drop below 0
        # (a gene close to the chromosome start would otherwise give a negative start)
        retrieve = allVars.fetch(chrom=chrom_dict[region.chrom],
                                 start=max(int(region.search_left) - 1, 0),
                                 end=int(region.search_right))
        for rec in retrieve:
            # drop per-sample genotype data; only the site-level record is kept
            rec.samples = []
            rec.FORMAT = None
            candVars.append(rec)
    # explicit object dtype keeps the empty-result case well defined
    candVars = pd.Series(candVars, dtype=object).drop_duplicates().reset_index(drop=True)
    return candVars

In [174]:
# GxE project directory; also made the current working directory
working_dir = "/home/users/rang/scratch/yeast/GxE/"
os.chdir(working_dir)

# list of QTLs to parse for GxE variants
gxe_qtls_file = os.path.join(working_dir, "GxE_gene_selection_20201130.tsv")

In [175]:
# read in GxE QTL table, switch to the column names used in this notebook,
# and add gene strand and description annotations from gene_info_df (matched on gene_id)
gxe_qtls = (
    pd.read_csv(gxe_qtls_file, sep='\t', index_col=0)
    .rename(columns={'Systematic Name':'gene_id', 'Gene ID':'gene_name', 'Chr':'chrom'})
    .merge(gene_info_df[['strand', 'description']], how='left', left_on='gene_id', right_index=True)
)

# remove QTLs that involve blacklisted genes
# note: ERG1 and ERG11 oligos will be duplicated in GxE set, has ~180 oligos
genes_to_exclude = [
    'YDL227C', # HO
    'YOR202W', # HIS3
    'YEL021W', # URA3
    'YCL018W', # LEU2
    'YBR115C', # LYS2
    'YBR020W', # GAL1
    'YDR009W', # GAL3
    'YPL248C', # GAL4
    'YBR018C', # GAL7
    'YBR019C', # GAL10
    'YML051W', # GAL80
    'YLR256W', # HAP1
]
gxe_qtls = gxe_qtls.query('~gene_id.isin(@genes_to_exclude)')


# get list of 1,011 yeast strains used in QTL mapping
yeast_strains_id = gxe_qtls['Strain ID of Parent 1 in Peter et al. 2018'].dropna().unique()
# map the first space-separated word of each 'Parent 1' entry to that row's strain ID
yeast_strains_name_to_id = {
    parent_label.split(' ')[0]: strain_id
    for parent_label, strain_id in zip(gxe_qtls['Parent 1'],
                                       gxe_qtls['Strain ID of Parent 1 in Peter et al. 2018'])
}
display(gxe_qtls)


Unnamed: 0,trait,chr,pos,cross,forward_scan_marker,index,p-value,q-value,LOD,relocalized_peak,...,Beta_abs,qtl_genes,num_qtl_genes,gene_id,gene_name,chrom,start,end,strand,description
0,Caffeine;15mM;2,chrXV,624258,B,chrXV_620334_A_G_61508,18833,0.0001,0.0001,74.281303,chrXV_624258_G_A_61750,...,0.909699,['YOR153W'],1,YOR153W,PDR5,chrXV,619840,624375,+,Plasma membrane ATP-binding cassette (ABC) tra...
1,Fluconazole;100uM;2,chrXV,621444,B,chrXV_621444_A_G_61540,18839,0.0001,0.0001,117.931452,chrXV_621444_A_G_61540,...,0.920556,['YOR153W'],1,YOR153W,PDR5,chrXV,619840,624375,+,Plasma membrane ATP-binding cassette (ABC) tra...
2,Fluconazole;100uM;2,chrXV,619941,377,chrXV_619941_G_T_47212,23734,0.0001,0.0001,168.252715,chrXV_619941_G_T_47212,...,0.711397,['YOR153W'],1,YOR153W,PDR5,chrXV,619840,624375,+,Plasma membrane ATP-binding cassette (ABC) tra...
3,Fluconazole;100uM;2,chrXV,622413,3049,chrXV_622413_A_G_72894,25133,0.0001,0.0001,120.009983,chrXV_622413_A_G_72894,...,0.607320,['YOR153W'],1,YOR153W,PDR5,chrXV,619840,624375,+,Plasma membrane ATP-binding cassette (ABC) tra...
4,Fluconazole;100uM;2,chrXV,624320,377,chrXV_474477_C_A_46484,23322,0.0009,0.0003,7.932991,chrXV_624320_TTTTC_T_47455,...,0.107987,['YOR153W'],1,YOR153W,PDR5,chrXV,619840,624375,+,Plasma membrane ATP-binding cassette (ABC) tra...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Neomycin;5mg/mL;2,chrXV,74126,381,chrXV_74126_C_T_49092,19288,0.0001,0.0001,79.558526,chrXV_74126_C_T_49092,...,0.345004,['YOL130W'],1,YOL130W,ALR1,chrXV,74400,76979,+,"Plasma membrane Mg(2%2B) transporter, expressi..."
95,Neomycin;5mg/mL;2,chrIV,545729,3000,chrIV_545729_C_A_11919,4657,0.0001,0.0001,41.519803,chrIV_545729_C_A_11919,...,0.329880,['YDR044W'],1,YDR044W,HEM13,chrIV,546642,547628,+,"Coproporphyrinogen III oxidase, an oxygen requ..."
96,Neomycin;5mg/mL;2,chrIV,590540,B,chrIV_590540_T_C_12080,3384,0.0001,0.0001,44.270147,chrIV_590540_T_C_12080,...,0.301252,['YDR072C'],1,YDR072C,IPT1,chrIV,589761,591344,-,"Inositolphosphotransferase, involved in synthe..."
97,Neomycin;5mg/mL;2,chrXII,256508,393,chrXII_256370_C_G_49854,18962,0.0001,0.0001,47.917048,chrXII_256508_A_G_49856,...,0.282061,['YLR057W'],1,YLR057W,MNL2,chrXII,255306,257855,+,Putative protein of unknown function; YLR050W ...


In [182]:
# search GxE QTLs for variants in each gvcf
# maps the chrom labels of the QTL table to the contig / file names of the per-chromosome gvcfs
chromosome_dict = {'chrI' : 'chromosome1',
                   'chrII': 'chromosome2',
                   'chrIII': 'chromosome3',
                   'chrIV': 'chromosome4',
                   'chrV': 'chromosome5',
                   'chrVI': 'chromosome6',
                   'chrVII': 'chromosome7',
                   'chrVIII': 'chromosome8',
                   'chrIX': 'chromosome9',
                   'chrX': 'chromosome10',
                   'chrXI': 'chromosome11',
                   'chrXII': 'chromosome12',
                   'chrXIII': 'chromosome13',
                   'chrXIV': 'chromosome14',
                   'chrXV': 'chromosome15',
                   'chrXVI': 'chromosome16'}



# one work item per chromosome: (QTLs on that chromosome, its gvcf file, chromosome name mapping)
qtls_to_process = []
for chrom in chromosome_dict:
    qtls_df = gxe_qtls.query('chrom==@chrom').sort_values('pos')
    gvcf_file = "/home/users/rang/share/yeast/1011genomes/by_chrom/{}.gvcf.gz".format(chromosome_dict[chrom])
    qtls_to_process.append((qtls_df, gvcf_file, chromosome_dict))

# consolidate candidate variants (never start more workers than there are work items)
num_of_cores = min(len(os.sched_getaffinity(0)), len(qtls_to_process))
with mp.Pool(num_of_cores) as pool:
    results = pool.starmap(select_variants_from_qtls, qtls_to_process)
results = pd.concat(results).reset_index(drop=True)

# write to new vcf file
gxe_variants_file = "gxe_variants.vcf"
# the first chromosome's gvcf provides the header template
template = vcf.Reader(filename=qtls_to_process[0][1])
# change chromosome naming convention to a single Roman numeral (follows ref genome, VEP formatting)
chromosome_dict_rev = {v:k for k, v in chromosome_dict.items()}
# [3:] strips the leading 'chr' from the reversed-mapping value
template.contigs = OrderedDict((chromosome_dict_rev[k][3:], v._replace(id=chromosome_dict_rev[k][3:])) for k, v in template.contigs.items())
# remove 1011 genomes strain allele info
template.samples = []

# the context manager closes the output file even if writing a record fails
with open(gxe_variants_file, 'w') as gxe_variants_handle:
    output = vcf.Writer(gxe_variants_handle, template)
    for record in results:
        # update chromosome naming
        record.CHROM = chromosome_dict_rev[record.CHROM][3:]
        # write to file
        output.write_record(record)

In [183]:
# Keep only the candidate variants that find_targetable_variants() reports as
# targetable, and write them to a new VCF carrying a "TGT" INFO field.

# input genome FASTA
ref_genome_fasta = "/home/users/rang/yeast/genomes/Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa"

# input and output VCF paths
gxe_variants_file = "gxe_variants.vcf"
gxe_variants_select_file = gxe_variants_file.replace('.vcf', '_select.vcf')

# genomes to test, keyed by FASTA record id
ref_genome = SeqIO.to_dict(SeqIO.parse(ref_genome_fasta, "fasta"))
genomes_list = [ref_genome] # IMPORTANT: SET REFERENCE GENOME AS FIRST IN LIST

threshold = 10 # threshold value for searching NGGs
window = 61 # size of flanking sequences to search for unique hits

# load every candidate record into memory
gxe_variants = list(vcf.Reader(filename=gxe_variants_file))
# cut the records into chunks of n so there is at most one chunk per available core
n = len(gxe_variants)//len(os.sched_getaffinity(0)) + 1
gxe_variants = [gxe_variants[i:i+n] for i in range(0, len(gxe_variants), n)]

# each task receives one chunk plus the shared search settings
# (TGT info field indicates which genomes variant is targetable in)
to_process = [(rec_list, genomes_list, threshold, window) for rec_list in gxe_variants]
with mp.Pool(len(os.sched_getaffinity(0))) as pool:
    result = pool.starmap(find_targetable_variants, to_process)

# reuse the input header and declare the new "TGT" INFO field in it
header = vcf.Reader(filename=gxe_variants_file)
header_info = namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
header_info_tgt = header_info(id='TGT',
                              num='.',
                              type='Integer',
                              desc='Targetability of variant in tested genomes',
                              source=None,
                              version=None)
header.infos['TGT'] = header_info_tgt

# flatten the per-chunk results and write each record
gxe_variants_select = vcf.Writer(open(gxe_variants_select_file, 'w'), header)
to_write = [rec for rec_list in result for rec in rec_list]
for rec in to_write:
    gxe_variants_select.write_record(rec)
gxe_variants_select.close()


Variant at chrom V, pos 86643 skipped: No unique matches found in tested genome(s)
Variant at chrom V, pos 86648 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146273 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146275 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146276 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146413 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146415 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146416 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146429 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146430 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146483 skipped: No unique matches found in tested genome(s)
Variant at chrom VIII, pos 146492 skipped: No uniqu

Variant at chrom IV, pos 537889 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 537892 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 537897 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 537913 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 537915 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 537934 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538008 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538009 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538024 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538030 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538063 skipped: No unique matches found in tested genome(s)
Variant at chrom IV, pos 538066 skipped: No unique matches found 

In [184]:
# Select variants by allele count (the singleton/doubleton filter is currently disabled)
### records with several alternate alleles are SPLIT INTO SEPARATE ENTRIES in the VCF (useful for oligo design pipeline)
### every written entry gets a unique variant ID made of a prefix and a number (format VAR_XXXXX)

# candidate variants together with their TGT (targetability) annotation
gxe_variants_select = vcf.Reader(filename=gxe_variants_select_file)

# output VCF reuses the input header
gxe_variants_select_AFfilter_file = gxe_variants_select_file.replace('.vcf', '_AFfilter.vcf')
gxe_variants_select_AFfilter = vcf.Writer(open(gxe_variants_select_AFfilter_file, 'w'), gxe_variants_select)

# running count of IDs issued per prefix
var_counter = {}
for record in gxe_variants_select:
    # optional minimum allele-count filter -- DISABLED by using the threshold ac<1;
    # a record is dropped only when none of its alternate alleles reaches the threshold
    if all(count < 1 for count in record.INFO['AC']):
        continue

    # snapshot the per-allele fields before the record is overwritten below
    alt_alleles = record.ALT[:]
    allele_counts = record.INFO['AC'][:]
    allele_freqs = record.INFO['AF'][:]
    mle_counts = record.INFO['MLEAC'][:]
    mle_freqs = record.INFO['MLEAF'][:]

    # emit one entry per alternate allele (useful if a locus has multiple possible alleles)
    for idx, alt_allele in enumerate(alt_alleles):
        record.ALT = [alt_allele]
        record.INFO['AC'] = [allele_counts[idx]]
        record.INFO['AF'] = [allele_freqs[idx]]
        record.INFO['MLEAC'] = [mle_counts[idx]]
        record.INFO['MLEAF'] = [mle_freqs[idx]]

        # only variants whose TGT flags are all 1 (targetable in every queried genome)
        # are kept; they receive the "GXE" prefix.
        # NOTE: an earlier, disabled alternative gave partially targetable variants an
        # "EG<hex>" prefix derived from the TGT bits instead of skipping them.
        if not all(flag == 1 for flag in record.INFO['TGT']):
            print('variant does not pass TGT check: skipping')
            continue
        var_id_prefix = "GXE"

        # advance the counter for this prefix
        var_counter[var_id_prefix] = var_counter.get(var_id_prefix, 0) + 1

        # unique variant ID, e.g. GXE_00001
        record.ID = '{}_{:05d}'.format(var_id_prefix, var_counter[var_id_prefix])

        gxe_variants_select_AFfilter.write_record(record)

gxe_variants_select_AFfilter.close()


## Submit gxe_variants_select_AFfilter VCF to VEP to get annotations, and download VEP output in TXT format
http://uswest.ensembl.org/Saccharomyces_cerevisiae/Tools/VEP

In [185]:
# Annotate the AF-filtered variants with the VEP output, drop variants that lie in
# non-gxe genes, and write the survivors to a preliminary oligo-design VCF.

# file names, each derived from the previous one
gxe_variants_file = "gxe_variants.vcf"
gxe_variants_select_file = gxe_variants_file.replace('.vcf', '_select.vcf')
gxe_variants_select_AFfilter_file = gxe_variants_select_file.replace('.vcf', '_AFfilter.vcf')
vep_output_filename = gxe_variants_select_AFfilter_file.replace('.vcf', '_VEPoutput.txt')
variants_annotated_filename = vep_output_filename.replace('_VEPoutput.txt', '_annotated.txt')
gxe_design_oligos_vcf_file = gxe_variants_file.replace('.vcf', '_design_oligos_initial.vcf')

# build the annotation table from the VEP output
variants_annotated_df = annotate_variants_by_VEPoutput(
    gxe_variants_select_AFfilter_file,
    vep_output_filename,
    gff_file,
    variants_annotated_filename,
    id_colname='var_id',
)
display(variants_annotated_df)

# annotated file can be read in directly if annotation function has been run once before
variants_annotated_df = pd.read_csv(variants_annotated_filename, sep='\t', index_col=0)
display(variants_annotated_df)

# keep a variant when its gene is one of the gxe_qtls genes, or when it lies in an
# intergenic/promoter region; other genic variants are removed
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
gxe_design_oligos = variants_annotated_df.query('(Gene_ID.isin(@gxe_qtls.gene_id.unique()) | region.isin(@intergenic_regions_list))').copy()
display(gxe_design_oligos)

# copy the retained records (matched by variant ID) into the preliminary design VCF
gxe_variants_select_AFfilter = vcf.Reader(filename=gxe_variants_select_AFfilter_file)
gxe_design_oligos_vcf = vcf.Writer(open(gxe_design_oligos_vcf_file, 'w'), gxe_variants_select_AFfilter)
for record in gxe_variants_select_AFfilter:
    if record.ID not in gxe_design_oligos.index:
        continue
    gxe_design_oligos_vcf.write_record(record)
gxe_design_oligos_vcf.close()


parsing entry: 5000
parsing entry: 10000
parsing entry: 15000


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00001,I,192136,G,A,2,2018,0.000991,483,4404|3325|201,synonymous_variant,...,cgG/cgA,"Low_complexity_(Seg):seg,PANTHER:PTHR22589,PAN...",,,,,,,,
GXE_00002,I,192158,CCCGCTGCCAGGA,C,114,2020,0.056000,449,4427|3348|167,inframe_deletion,...,,,,,,,,,,
GXE_00003,I,192162,C,T,2,2020,0.000990,457,4430|3351|175,missense_variant,...,gCt/gTt,"Low_complexity_(Seg):seg,PANTHER:PTHR22589,PAN...",,,,,,,,
GXE_00004,I,192164,G,C,5,2020,0.002475,455,4432|3353|173,missense_variant,...,Gcc/Ccc,"Low_complexity_(Seg):seg,PANTHER:PTHR22589,PAN...",,,,,,,,
GXE_00005,I,192167,AGG,A,1,2020,0.000495,450,4436|3357|168,frameshift_variant,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17204,XVI,375568,C,G,16,2022,0.007913,4905,3552|1723|399|2647,missense_variant,...,Cat/Gat,"PIRSF:PIRSF000350,PANTHER:PTHR42737,Superfamil...",,,,,,,,
GXE_17205,XVI,375603,G,A,4,2022,0.001978,4940,3517|1688|434|2682,synonymous_variant,...,ggG/ggA,"Gene3D:3.50.50.60,Pfam:PF07992,PIRSF:PIRSF0003...",,,,,,,,
GXE_17206,XVI,375623,G,A,2,2022,0.000989,4960,3497|1668|454|2702,missense_variant,...,aGa/aAa,"Gene3D:3.50.50.60,Pfam:PF07992,PIRSF:PIRSF0003...",,,,,,,,
GXE_17207,XVI,375637,G,A,6,2022,0.002967,4974,3483|1654|468|2716,missense_variant,...,Ggt/Agt,"Gene3D:3.50.50.60,Pfam:PF07992,PIRSF:PIRSF0003...",,,,,,,,


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00017,I,192257,G,A,8,2006,0.003988,362,4525|3446|80|1,intergenic,...,,,YAT1,YAR035W,downstream_gene_variant,1.0,,YAR035C-A,downstream_gene_variant,80.0
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,2.0,SWH1,YAR042W,upstream_gene_variant,200.0
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,7.0,SWH1,YAR042W,upstream_gene_variant,181.0
GXE_00025,I,192424,GTTTGGATTACCTCT,G,2,2014,0.000993,8|181,4693|3614|169,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,8.0,SWH1,YAR042W,upstream_gene_variant,181.0
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,10.0,SWH1,YAR042W,upstream_gene_variant,192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17195,XVI,375403,C,T,26,2022,0.013000,99|4740,3717|1888|234|2482,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,234.0,GLR1,YPL091W,upstream_gene_variant,99.0
GXE_17196,XVI,375403,C,G,2,2022,0.000989,99|4740,3717|1888|234|2482,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,234.0,GLR1,YPL091W,upstream_gene_variant,99.0
GXE_17197,XVI,375405,C,T,12,2022,0.005935,97|4742,3715|1886|236|2484,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,236.0,GLR1,YPL091W,upstream_gene_variant,97.0
GXE_17198,XVI,375411,G,T,33,2022,0.016000,91|4748,3709|1880|242|2490,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,242.0,GLR1,YPL091W,upstream_gene_variant,91.0


## Design oligos for initial list, then filter down to variants with at least 2 oligos designed

In [177]:
# Load the designed oligos and keep only variants for which at least 2 oligos were designed.

# oligo design output: one row per designed oligo, with a var_id column
gxe_oligo_file = os.path.expanduser("~/crispey3/initial_design/Output/all_SNPs_gxe_initial_GG_9bp_OLIGO.tab")
gxe_oligo_df = pd.read_csv(gxe_oligo_file, sep='\t')

# annotated variant table, indexed by its first column
gxe_design_oligos = pd.read_csv("/home/users/rang/scratch/yeast/GxE/gxe_variants_select_AFfilter_annotated.txt", sep='\t', index_col=0)
# remove variants that fall within genes, but are not in the approved genes list
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
gxe_design_oligos = gxe_design_oligos.query('(Gene_ID.isin(@gxe_qtls.gene_id.unique()) | region.isin(@intergenic_regions_list))').copy()
display(gxe_design_oligos)

# number of oligos per variant; GroupBy.size() is the direct group count (replaces
# apply(len)). The assignment aligns on the index, so variants with no designed oligo
# get NaN and are dropped by the filter below.
gxe_design_oligos['num_of_oligos'] = gxe_oligo_df.groupby('var_id').size()
gxe_oligos_final = gxe_design_oligos.query('num_of_oligos>1').copy()
display(gxe_oligos_final)

Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00017,I,192257,G,A,8,2006,0.003988,362,4525|3446|80|1,intergenic,...,,,YAT1,YAR035W,downstream_gene_variant,1.0,,YAR035C-A,downstream_gene_variant,80.0
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,2.0,SWH1,YAR042W,upstream_gene_variant,200.0
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,7.0,SWH1,YAR042W,upstream_gene_variant,181.0
GXE_00025,I,192424,GTTTGGATTACCTCT,G,2,2014,0.000993,8|181,4693|3614|169,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,8.0,SWH1,YAR042W,upstream_gene_variant,181.0
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,,,,YAR035C-A,upstream_gene_variant,10.0,SWH1,YAR042W,upstream_gene_variant,192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17195,XVI,375403,C,T,26,2022,0.013000,99|4740,3717|1888|234|2482,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,234.0,GLR1,YPL091W,upstream_gene_variant,99.0
GXE_17196,XVI,375403,C,G,2,2022,0.000989,99|4740,3717|1888|234|2482,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,234.0,GLR1,YPL091W,upstream_gene_variant,99.0
GXE_17197,XVI,375405,C,T,12,2022,0.005935,97|4742,3715|1886|236|2484,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,236.0,GLR1,YPL091W,upstream_gene_variant,97.0
GXE_17198,XVI,375411,G,T,33,2022,0.016000,91|4748,3709|1880|242|2490,unidirectional_promoter,...,,,SSU1,YPL092W,downstream_gene_variant,242.0,GLR1,YPL091W,upstream_gene_variant,91.0


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,,,YAR035C-A,upstream_gene_variant,2.0,SWH1,YAR042W,upstream_gene_variant,200.0,3.0
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,,,YAR035C-A,upstream_gene_variant,7.0,SWH1,YAR042W,upstream_gene_variant,181.0,3.0
GXE_00025,I,192424,GTTTGGATTACCTCT,G,2,2014,0.000993,8|181,4693|3614|169,bidirectional_promoter,...,,,YAR035C-A,upstream_gene_variant,8.0,SWH1,YAR042W,upstream_gene_variant,181.0,3.0
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,,,YAR035C-A,upstream_gene_variant,10.0,SWH1,YAR042W,upstream_gene_variant,192.0,2.0
GXE_00027,I,192429,G,C,1506,1998,0.754000,12|190,4697|3618|173,bidirectional_promoter,...,,,YAR035C-A,upstream_gene_variant,12.0,SWH1,YAR042W,upstream_gene_variant,190.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17168,XVI,375080,G,A,31,2022,0.015000,422|4417,4040|2211|2159,missense_variant,...,"PANTHER:PTHR31686,PANTHER:PTHR31686:SF1",,,,,,,,,2.0
GXE_17188,XVI,375331,C,A,8,2020,0.003960,171|4668,3789|1960|162|2410,unidirectional_promoter,...,,SSU1,YPL092W,downstream_gene_variant,162.0,GLR1,YPL091W,upstream_gene_variant,171.0,2.0
GXE_17189,XVI,375333,A,AT,8,2022,0.003956,168|4670,3786|1957|164|2412,unidirectional_promoter,...,,SSU1,YPL092W,downstream_gene_variant,164.0,GLR1,YPL091W,upstream_gene_variant,168.0,3.0
GXE_17192,XVI,375357,C,T,2,2022,0.000989,145|4694,3763|1934|188|2436,unidirectional_promoter,...,,SSU1,YPL092W,downstream_gene_variant,188.0,GLR1,YPL091W,upstream_gene_variant,145.0,2.0


### Remove singletons & doubletons, but preserve those that appear in at least one of the 15 QTL parental strains

In [182]:
# Look up every final variant in the 1011 genomes GVCF and add, per variant:
#   - all_alt_alleles: comma-joined list of every alternate allele at the site
#   - one column per strain in yeast_strains_id holding its genotype call ("a/b")
# The augmented table is written to gxe_variants_with_parents.txt.

# variant-table chromosome name -> name used for the per-chromosome GVCF files/contigs
chromosome_dict = {'chrI' : 'chromosome1',
                   'chrII': 'chromosome2',
                   'chrIII': 'chromosome3',
                   'chrIV': 'chromosome4',
                   'chrV': 'chromosome5',
                   'chrVI': 'chromosome6',
                   'chrVII': 'chromosome7',
                   'chrVIII': 'chromosome8',
                   'chrIX': 'chromosome9',
                   'chrX': 'chromosome10',
                   'chrXI': 'chromosome11',
                   'chrXII': 'chromosome12',
                   'chrXIII': 'chromosome13',
                   'chrXIV': 'chromosome14',
                   'chrXV': 'chromosome15',
                   'chrXVI': 'chromosome16'}

gxe_variants_file = "gxe_variants.vcf"
i=0
for chromosome, df in gxe_oligos_final.groupby('CHROM'):
    # rows are grouped by CHROM, so one contig name and one reader serve the whole group
    gvcf_contig = chromosome_dict['chr'+chromosome]
    gvcf_file = "/home/users/rang/share/yeast/1011genomes/by_chrom/{}.gvcf.gz".format(gvcf_contig)
    allVars = vcf.Reader(filename=gvcf_file)

    for v, row in df.iterrows():
        # find variant in 1011 genomes GVCF
        retrieve = allVars.fetch(chrom=gvcf_contig,
                                 start=row.POS-1,
                                 end=row.POS)
        # select the record that matches position and reference allele.
        # record is reset for every variant so that a failed lookup can never
        # silently reuse the record found for the previous variant.
        record = None
        for r in retrieve:
            if r.POS == row.POS and r.REF == row.REF:
                record = r
                break
        if record is None:
            # leave this variant's allele/genotype columns empty rather than wrong
            print("Variant {} ({}:{} {}) not found in GVCF: skipping".format(v, row.CHROM, row.POS, row.REF))
            continue

        # get all alternate alleles at this position
        gxe_oligos_final.loc[v, 'all_alt_alleles'] = ','.join([str(a) for a in record.ALT])
        # get allele calls for strains in yeast_strains_id (calls with a missing allele are left empty)
        for s in record.samples:
            if (s.sample in yeast_strains_id) and (None not in s.gt_alleles):
                gxe_oligos_final.loc[v, s.sample] = '/'.join(s.gt_alleles)

        # progress report
        i+=1
        if i % 500 == 0:
            print(i, "variants processed")

# write updated gxe_oligos_final table
gxe_oligos_final.to_csv(gxe_variants_file.replace('.vcf', '_with_parents.txt'), sep='\t')

500 variants processed
1000 variants processed
1500 variants processed
2000 variants processed
2500 variants processed
3000 variants processed
3500 variants processed
4000 variants processed
4500 variants processed
5000 variants processed
5500 variants processed
6000 variants processed


In [183]:
# Remove singletons and doubletons (AC<3), except those whose alternate allele is
# carried by at least one of the parental strains in yeast_strains_id.

# singletons/doubletons where at least one parental strain has a call other than '0/0'
parent_calls = gxe_oligos_final.loc[:, yeast_strains_id]
no_alt_in_parents = ((parent_calls == '0/0') | parent_calls.isna()).all(axis=1)
df = gxe_oligos_final[~no_alt_in_parents].query('AC<3')

# check if parental strains carry the alt allele that matches this variant entry
singletons_doubletons_include_list = []
for v, row in df.iterrows():
    # 1-based index of this entry's ALT among all alternate alleles at the site,
    # i.e. the number used for it in the genotype strings
    alt_allele_num = row.all_alt_alleles.split(',').index(row.ALT)+1
    allele_label = str(alt_allele_num)
    # compare against whole allele indices: a substring test would let allele 1
    # match genotypes such as '0/12' or '10/10' at sites with many alternate alleles
    parent_genotypes = row[yeast_strains_id].dropna()
    if any(allele_label in gt.split('/') for gt in parent_genotypes):
        singletons_doubletons_include_list.append(v)

# finally, subset gxe_oligos_final to remove singletons and doubletons, unless variant is in
# singletons_doubletons_include_list; .copy() makes the result an independent frame so that
# columns can be added to it later without a SettingWithCopyWarning
gxe_oligos_final_no_singletons = gxe_oligos_final.query('AC>=3 | var_id.isin(@singletons_doubletons_include_list)').copy()
display(gxe_oligos_final_no_singletons)


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,ACG,ACI,ACQ,ACS,ADE,ADF,ADG,ADI,ADR,SACE_MAA
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,0/0,0/0,0/0,0/0,,0/0,0/0,0/0,0/0,0/0
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,1/1,0/0,0/0,0/0,0/0,0/0,0/0,1/1,1/1,0/0
GXE_00027,I,192429,G,C,1506,1998,0.754000,12|190,4697|3618|173,bidirectional_promoter,...,1/1,1/1,1/1,1/1,,1/1,1/1,1/1,1/1,0/0
GXE_00033,I,192466,AAAGGG,A,3,2020,0.001485,50|148,4735|3656|211,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17166,XVI,375077,T,C,3,2022,0.001484,425|4414,4043|2214|2156,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
GXE_17167,XVI,375078,C,A,8,2020,0.003960,424|4415,4042|2213|2157,stop_gained,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
GXE_17168,XVI,375080,G,A,31,2022,0.015000,422|4417,4040|2211|2159,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
GXE_17188,XVI,375331,C,A,8,2020,0.003960,171|4668,3789|1960|162|2410,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0


In [218]:
# Sanity check for chromosome XVI: show the QTL/gene coordinates from gxe_qtls next to
# the matching rows of qtls_df (whatever qtls_df currently holds in the session).
xvi_coordinate_columns = ['trait', 'qtl_left','qtl_right', 'start', 'end']
xvi_qtls = gxe_qtls.query('chr=="chrXVI"')
display(xvi_qtls[xvi_coordinate_columns])
# all qtls_df columns are shown; no column subset is applied here
display(qtls_df.query('chrom=="XVI"'))

Unnamed: 0,trait,qtl_left,qtl_right,start,end
20,Caffeine;15mM;2,207664,208615,208157,209806
21,Cobalt_Chloride;2mM;2,208615,209367,208157,209806
28,Cobalt_Chloride;2mM;2,373834,375203,373793,375169
29,Cobalt_Chloride;2mM;2,373834,375203,373793,375169
30,Fructose;;1,373674,374422,373793,375169
31,Neomycin;5mg/mL;2,373699,374377,373793,375169
98,Neomycin;5mg/mL;2,181440,181552,181403,182548


Unnamed: 0,trait,chrom,left_limit,right_limit
20,Caffeine;15mM;2,XVI,207657,210306
21,Cobalt_Chloride;2mM;2,XVI,207657,210306
28,Cobalt_Chloride;2mM;2,XVI,373293,375669
30,Fructose;;1,XVI,373293,375669
31,Neomycin;5mg/mL;2,XVI,373293,375669
98,Neomycin;5mg/mL;2,XVI,180903,183048


In [184]:
# Associate each variant with the traits in gxe_qtls whose search window contains it.
# A search window spans from the smaller of (gene start - 500, QTL left) to the
# larger of (gene end + 500, QTL right).

qtls_df = gxe_qtls[['trait', 'chrom', 'start', 'end', 'qtl_left', 'qtl_right']].copy()
# drop the 'chr' text so chromosome names can be compared with the variants' CHROM values
qtls_df.loc[:,'chrom'] = qtls_df['chrom'].str.replace('chr', '')

# gene boundaries extended by 500 on each side
qtls_df.loc[:, 'gene_left'] = qtls_df['start']-500
qtls_df.loc[:, 'gene_right'] = qtls_df['end']+500

# search boundaries: gene or QTL boundary, whichever reaches further out
qtls_df.loc[:,'left_limit'] = qtls_df[['gene_left', 'qtl_left']].min(axis=1)
qtls_df.loc[:,'right_limit'] = qtls_df[['gene_right', 'qtl_right']].max(axis=1)
for limit_col in ('left_limit', 'right_limit'):
    qtls_df.loc[:, limit_col] = qtls_df[limit_col].astype(int)
# NOTE: region_length is not retained -- the column selection below discards it
qtls_df.loc[:, 'region_length'] = qtls_df['right_limit'] - qtls_df['left_limit'] + 1

# one row per distinct (trait, chromosome, search window)
qtls_df = qtls_df[['trait', 'chrom', 'left_limit', 'right_limit']].drop_duplicates()


def _traits_overlapping_variant(x):
    # list the unique traits whose search window on the variant's chromosome contains its position
    return qtls_df.query('chrom==@x.CHROM & left_limit<=@x.POS & right_limit>=@x.POS').trait.unique().tolist()


# per variant: the list of associated traits, plus the same list joined with '_'
gxe_oligos_final_no_singletons.loc[:,'assoc_traits'] = gxe_oligos_final_no_singletons.apply(_traits_overlapping_variant, axis=1)
gxe_oligos_final_no_singletons.loc[:,'assoc_traits_str'] = gxe_oligos_final_no_singletons['assoc_traits'].apply('_'.join)

display(gxe_oligos_final_no_singletons)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,ACQ,ACS,ADE,ADF,ADG,ADI,ADR,SACE_MAA,assoc_traits,assoc_traits_str
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GXE_00023,I,192419,C,A,6,2020,0.002970,2|200,4687|3608|163,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fluconazole;100uM;2, N...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...
GXE_00024,I,192424,GTTTGGATTACCTCT,ATTTGGATTACCTCT,39,2014,0.019000,7|181,4692|3613|168,bidirectional_promoter,...,0/0,0/0,,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fluconazole;100uM;2, N...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...
GXE_00026,I,192427,T,C,902,2000,0.451000,10|192,4695|3616|171,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,1/1,1/1,0/0,"[Cobalt_Chloride;2mM;2, Fluconazole;100uM;2, N...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...
GXE_00027,I,192429,G,C,1506,1998,0.754000,12|190,4697|3618|173,bidirectional_promoter,...,1/1,1/1,,1/1,1/1,1/1,1/1,0/0,"[Cobalt_Chloride;2mM;2, Fluconazole;100uM;2, N...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...
GXE_00033,I,192466,AAAGGG,A,3,2020,0.001485,50|148,4735|3656|211,bidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fluconazole;100uM;2, N...",Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neom...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GXE_17166,XVI,375077,T,C,3,2022,0.001484,425|4414,4043|2214|2156,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fructose;;1, Neomycin;...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...
GXE_17167,XVI,375078,C,A,8,2020,0.003960,424|4415,4042|2213|2157,stop_gained,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fructose;;1, Neomycin;...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...
GXE_17168,XVI,375080,G,A,31,2022,0.015000,422|4417,4040|2211|2159,missense_variant,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fructose;;1, Neomycin;...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...
GXE_17188,XVI,375331,C,A,8,2020,0.003960,171|4668,3789|1960|162|2410,unidirectional_promoter,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,"[Cobalt_Chloride;2mM;2, Fructose;;1, Neomycin;...",Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg...


In [185]:
# Save the final variant table as a tab-separated file next to the input VCF name
# ("gxe_variants.vcf" -> "gxe_variants_final.txt" when gxe_variants_file is unchanged).
gxe_final_table_file = gxe_variants_file.replace('.vcf', '_final.txt')
gxe_oligos_final_no_singletons.to_csv(gxe_final_table_file, sep='\t')

# this file is the input of a separate script that organizes the variants into pools

## Summary of GxE variants

In [186]:
### SUMMARY OF GxE SET ###
# totals
n_variants = len(gxe_oligos_final_no_singletons)
n_oligos = int(sum(gxe_oligos_final_no_singletons.num_of_oligos))
print("Number of variants:", n_variants)
print("Number of oligos:", n_oligos)
print()

# variants per gene, restricted to the genes listed in gxe_qtls
print("Distribution of variants among gxe genes")
variants_per_gene = gxe_oligos_final_no_singletons.Gene_ID.value_counts()
print(variants_per_gene.filter(gxe_qtls.gene_id.unique()).describe())
print()

# oligos per variant
print("Distribution of oligos among gxe variants")
print(gxe_oligos_final_no_singletons.num_of_oligos.describe())
print()

# fraction of variants in each annotation class
print("Proportion of variant annotation classes")
region_counts = gxe_oligos_final_no_singletons.region.value_counts()
print(region_counts/n_variants)


Number of variants: 4384
Number of oligos: 10983

Distribution of variants among gxe genes
count     53.000000
mean      73.245283
std       44.006951
min       10.000000
25%       39.000000
50%       68.000000
75%       98.000000
max      189.000000
Name: Gene_ID, dtype: float64

Distribution of oligos among gxe variants
count    4384.000000
mean        2.505246
std         0.808900
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: num_of_oligos, dtype: float64

Proportion of variant annotation classes
synonymous_variant         0.377965
missense_variant           0.288549
bidirectional_promoter     0.140511
unidirectional_promoter    0.123859
intergenic                 0.055201
frameshift_variant         0.008212
stop_gained                0.003650
inframe_insertion          0.001369
inframe_deletion           0.000456
stop_lost                  0.000228
Name: region, dtype: float64


## Determine number of pools to use for GxE oligos

In [187]:
# Tally variants and oligos for every distinct assoc_traits_str value
# (each value is one combination of associated conditions).
gxe_by_condition_set = gxe_oligos_final_no_singletons.groupby('assoc_traits_str')
print('Number of variants per condition(s)')
print(gxe_by_condition_set.size())
print()
print('Number of oligos per condition(s)')
print(gxe_by_condition_set.num_of_oligos.sum())
print()

# Per-trait totals: a variant counts toward a trait whenever that trait
# appears in the variant's assoc_traits entry.
for trait in gxe_qtls.trait.unique():
    print("Trait:", trait)
    has_trait = gxe_oligos_final_no_singletons.assoc_traits.apply(lambda traits: trait in traits)
    trait_variants = gxe_oligos_final_no_singletons[has_trait]
    print('Number of variants:', len(trait_variants))
    print('Number of oligos:', int(trait_variants['num_of_oligos'].sum()))
    print()
    

Number of variants per condition(s)
assoc_traits_str
Caffeine;15mM;2                                                879
Caffeine;15mM;2_Cobalt_Chloride;2mM;2                           41
Caffeine;15mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2          163
Caffeine;15mM;2_Fructose;;1                                      1
Caffeine;15mM;2_Neomycin;5mg/mL;2                                5
Cobalt_Chloride;2mM;2                                          611
Cobalt_Chloride;2mM;2_Fluconazole;100uM;2                       74
Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Fructose;;1            4
Cobalt_Chloride;2mM;2_Fluconazole;100uM;2_Neomycin;5mg/mL;2    172
Cobalt_Chloride;2mM;2_Fructose;;1                               70
Cobalt_Chloride;2mM;2_Fructose;;1_Neomycin;5mg/mL;2             48
Fluconazole;100uM;2                                            531
Fluconazole;100uM;2_Fructose;;1                                 32
Fructose;;1                                                    632
Lithium_C

In [188]:
# Oligo totals grouped by the first three characters of each row label
# (the variant ID prefix), largest group first.
oligos_by_prefix = gxe_oligos_final_no_singletons.groupby(lambda var_id: var_id[:3]).num_of_oligos.sum()
display(oligos_by_prefix.sort_values(ascending=False))

# Inputs for choosing the number of pools.
complete_pool_size = 121 # maximum oligos synthesized per pool
max_pool_size = 118 # allows 3 technical oligos
min_pool_size = 109 # allows 12 technical oligos
num_of_oligos = int(gxe_oligos_final_no_singletons.num_of_oligos.sum())

# list the pool-count options that fit within the size limits above
find_pool_size_options(num_of_oligos, min_pool_size, max_pool_size, complete_pool_size)


GXE    10983.0
Name: num_of_oligos, dtype: float64

Total number of oligos to fit: 10983


(93, 100)

Use 93 pools:
Number of oligos per pool: 118
Number of oligos leftover: 9
Number of technical oligos: 279

Use 94 pools:
Number of oligos per pool: 116
Number of oligos leftover: 79
Number of technical oligos: 470

Use 95 pools:
Number of oligos per pool: 115
Number of oligos leftover: 58
Number of technical oligos: 570

Use 96 pools:
Number of oligos per pool: 114
Number of oligos leftover: 39
Number of technical oligos: 672

Use 97 pools:
Number of oligos per pool: 113
Number of oligos leftover: 22
Number of technical oligos: 776

Use 98 pools:
Number of oligos per pool: 112
Number of oligos leftover: 7
Number of technical oligos: 882

Use 99 pools:
Number of oligos per pool: 110
Number of oligos leftover: 93
Number of technical oligos: 1089

Use 100 pools:
Number of oligos per pool: 109
Number of oligos leftover: 83
Number of technical oligos: 1200



array([ 93,  94,  95,  96,  97,  98,  99, 100])

### We should be able to fit all GxE variants in 94 pools
- GXE: 94 pools


# Create a small list of HSP90 variants as a potential rotation project

In [150]:
# All HSP90 files in the following cells are resolved relative to this directory.
working_dir = "/home/users/rang/scratch/yeast/hsp90/"
os.chdir(working_dir)

# gene_info_df must already be loaded from the GFF file (see code above)

# Rows for the two HSP90 genes (HSP82 = YPL240C, HSC82 = YMR186W),
# ordered by chromosome and then by start coordinate.
hsp90_gene_ids = ['YPL240C', 'YMR186W']
hsp90_genes = gene_info_df.loc[hsp90_gene_ids, :].sort_values(['chrom', 'start'])

In [17]:
# Map gene-table chromosome names (chrI .. chrXVI) to the contig names used
# by the per-chromosome gVCF files (chromosome1 .. chromosome16).
roman_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                  'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
chromosome_dict = {'chr' + numeral: 'chromosome{}'.format(number)
                   for number, numeral in enumerate(roman_numerals, start=1)}

# Collect every record within 500 bases of each HSP90 gene.
results = []
for gene, row in hsp90_genes.iterrows():
    contig = chromosome_dict[row['chrom']]
    gvcf_file = "/home/users/rang/share/yeast/1011genomes/by_chrom/{}.gvcf.gz".format(contig)

    # open this chromosome's gvcf and pull the records in the padded gene interval
    gvcf_reader = vcf.Reader(filename=gvcf_file)
    region_start = int(row['start']-500-1)
    region_end = int(row['end']+500)
    for rec in gvcf_reader.fetch(chrom=contig, start=region_start, end=region_end):
        # strip per-sample data; only the site-level fields are kept
        rec.samples=[]
        rec.FORMAT=None
        results.append(rec)

results = pd.Series(results).drop_duplicates().reset_index(drop=True)

# Write the collected records to a new VCF, using the header of the last gvcf opened.
hsp90_variants_file = "hsp90_variants.vcf"
template = vcf.Reader(filename=gvcf_file)
# contigs and records are renamed to a bare Roman numeral (follows ref genome, VEP formatting)
chromosome_dict_rev = {contig_name: chr_name for chr_name, contig_name in chromosome_dict.items()}
renamed_contigs = OrderedDict()
for contig_name, contig_info in template.contigs.items():
    numeral = chromosome_dict_rev[contig_name][3:]
    renamed_contigs[numeral] = contig_info._replace(id=numeral)
template.contigs = renamed_contigs
# drop the 1011 genomes sample columns from the header
template.samples=[]

output = vcf.Writer(open(hsp90_variants_file, 'w'), template)
for record in results:
    # switch the record to the Roman-numeral chromosome name before writing
    record.CHROM = chromosome_dict_rev[record.CHROM][3:]
    output.write_record(record)
output.close()


In [18]:
# FASTA files of the genomes to test
ref_genome_fasta = "/home/users/rang/yeast/genomes/Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa"
rm_genome_fasta = "/home/users/rang/yeast/genomes/RM11-1A_SGD_2015_JRIP00000000.fsa"
yjm_genome_fasta = "/home/users/rang/yeast/genomes/YJM789_Stanford_2007_AAFW02000000_highQuality31.fsa"
yps_genome_fasta = "/home/users/rang/yeast/genomes/YPS128.genome.fa"

# input VCF of candidate variants and output VCF of targetable variants
hsp90_variants_file = "hsp90_variants.vcf"
hsp90_variants_select_file = hsp90_variants_file.replace('.vcf', '_select.vcf')

# Load every genome into a dictionary of sequence records.
# IMPORTANT: THE REFERENCE GENOME MUST STAY FIRST IN THE LIST
genomes_list = [SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
                for fasta_path in (ref_genome_fasta, rm_genome_fasta, yjm_genome_fasta, yps_genome_fasta)]
ref_genome, rm_genome, yjm_genome, yps_genome = genomes_list

threshold = 10 # threshold value for searching NGGs
window = 61 # size of flanking sequences to search for unique hits

# Read all candidate records, then cut them into roughly equal chunks,
# at most one chunk per available core.
n_workers = len(os.sched_getaffinity(0))
all_records = list(vcf.Reader(filename=hsp90_variants_file))
n = len(all_records)//n_workers + 1
hsp90_variants = [all_records[offset:offset+n] for offset in range(0, len(all_records), n)]

# find targetable variants (TGT info field indicates which genomes variant is targetable in)
to_process = [(rec_list, genomes_list, threshold, window) for rec_list in hsp90_variants]
with mp.Pool(n_workers) as pool:
    result = pool.starmap(find_targetable_variants, to_process)

# Output header = input header plus a declaration of the "TGT" info field.
header = vcf.Reader(filename=hsp90_variants_file)
header_info = namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
header.infos['TGT'] = header_info('TGT', '.', 'Integer', 'Targetability of variant in tested genomes', None, None)

# write the records returned by every worker, chunk by chunk
hsp90_variants_select = vcf.Writer(open(hsp90_variants_select_file, 'w'), header)
for rec_list in result:
    for rec in rec_list:
        hsp90_variants_select.write_record(rec)
hsp90_variants_select.close()


Variant at chrom XIII, pos 634136 skipped: No unique matches found in tested genome(s)
Variant at chrom XIII, pos 634148 skipped: No unique matches found in tested genome(s)
Variant at chrom XIII, pos 634157 skipped: No unique matches found in tested genome(s)
Variant at chrom XIII, pos 634169 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 96797 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 96803 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 96820 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 97542 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 98281 skipped: No unique matches found in tested genome(s)
Variant at chrom XIII, pos 632705 skipped: No unique matches found in tested genome(s)
Variant at chrom XVI, pos 98284 skipped: No unique matches found in tested genome(s)


In [25]:
# Allele-based selection for the HSP90 set (note difference from ergosterol variant selection step)
### the allele-count filter only drops records whose alternate alleles all have AC < 1,
###   so it is effectively disabled and singletons/doubletons are retained
### if a record has multiple alternate alleles, only the most common one is kept
### each written record gets a unique ID: prefix plus running number (format VAR_XXXXX)

# candidate variants produced by the targetability step
hsp90_variants_select = vcf.Reader(filename=hsp90_variants_select_file)

# output VCF reuses the header of the input reader
hsp90_variants_select_AFfilter_file = hsp90_variants_select_file.replace('.vcf', '_AFfilter.vcf')
hsp90_variants_select_AFfilter = vcf.Writer(open(hsp90_variants_select_AFfilter_file, 'w'), hsp90_variants_select)

var_counter = {}
for record in hsp90_variants_select:
    # (optional) allele-count filter - disabled in practice by the ac < 1 cutoff
    if all(ac < 1 for ac in record.INFO['AC']):
        continue

    # reduce the record to its most common alternate allele
    top = np.argmax(record.INFO['AC'])
    record.ALT = [record.ALT[top]]
    for info_key in ('AC', 'AF', 'MLEAC', 'MLEAF'):
        record.INFO[info_key] = [record.INFO[info_key][top]]

    # only records whose first TGT entry is 1 (targetable in the reference genome) are kept
    # (disabled alternative: instead of skipping, assign a prefix "HS" + one hex digit derived
    #  from the TGT bits, hex(int(''.join(str(x) for x in record.INFO['TGT']), 2))[2:].upper();
    #  works well for up to 4 genomes)
    if record.INFO['TGT'][0] != 1:
        continue
    var_id_prefix = "HSP"

    # running count per prefix provides the numeric part of the ID
    var_counter[var_id_prefix] = var_counter.get(var_id_prefix, 0) + 1
    record.ID = '{}_{:05d}'.format(var_id_prefix, var_counter[var_id_prefix])

    hsp90_variants_select_AFfilter.write_record(record)
hsp90_variants_select_AFfilter.close()


## Submit hsp90_variants_select_AFfilter VCF to VEP to get annotations, and download VEP output in TXT format
http://uswest.ensembl.org/Saccharomyces_cerevisiae/Tools/VEP

In [26]:
# File names for this step, each derived from the previous one.
hsp90_variants_file = "hsp90_variants.vcf"
hsp90_variants_select_file = hsp90_variants_file.replace('.vcf', '_select.vcf')
hsp90_variants_select_AFfilter_file = hsp90_variants_select_file.replace('.vcf', '_AFfilter.vcf')
vep_output_filename = hsp90_variants_select_AFfilter_file.replace('.vcf', '_VEPoutput.txt')
variants_annotated_filename = vep_output_filename.replace('_VEPoutput.txt', '_annotated.txt')

# combine the AF-filtered VCF with its VEP output into an annotated table
variants_annotated_df = annotate_variants_by_VEPoutput(hsp90_variants_select_AFfilter_file,
                                                       vep_output_filename,
                                                       gff_file,
                                                       variants_annotated_filename,
                                                       id_colname='var_id')

# annotated file can be read in directly if annotation function has been run once before
variants_annotated_df = pd.read_csv(variants_annotated_filename, sep='\t', index_col=0)
display(variants_annotated_df)

# Keep a variant if its gene is one of the HSP90 genes or if its region is intergenic/promoter;
# variants inside any other gene are dropped.
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
in_hsp90_gene = variants_annotated_df['Gene_ID'].isin(hsp90_genes.index)
in_intergenic_region = variants_annotated_df['region'].isin(intergenic_regions_list)
hsp90_design_oligos = variants_annotated_df[in_hsp90_gene | in_intergenic_region].copy()

# (disabled) optional extra restriction to variants editable in at least 3 out of 4 genomes:
# hsp90_design_oligos = hsp90_design_oligos.filter(regex='HSP|HSE|HSD|HSB', axis=0)

display(hsp90_design_oligos)
# variant counts per ID prefix (the prefix encodes which strains a variant can be edited in)
display(pd.Series([var_id[:3] for var_id in hsp90_design_oligos.index]).value_counts())


# Copy the retained records into a preliminary VCF that oligos will be designed from.
hsp90_design_oligos_vcf_file = hsp90_variants_file.replace('.vcf', '_design_oligos_initial.vcf')
hsp90_variants_select_AFfilter = vcf.Reader(filename=hsp90_variants_select_AFfilter_file)
hsp90_design_oligos_vcf = vcf.Writer(open(hsp90_design_oligos_vcf_file, 'w'), hsp90_variants_select_AFfilter)
retained_records = (record for record in hsp90_variants_select_AFfilter
                    if record.ID in hsp90_design_oligos.index)
for record in retained_records:
    hsp90_design_oligos_vcf.write_record(record)
hsp90_design_oligos_vcf.close()


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00001,XIII,631917,T,C,1,2022,0.000495,4109|438,3132|2772|4374,missense_variant,...,Tac/Cac,"Pfam:PF10304,PANTHER:PTHR20959,Superfamily:SSF...",,,,,,,,
HSP_00002,XIII,631956,C,T,342,2020,0.169000,4148|399,3171|2733|4335,synonymous_variant,...,Cta/Tta,"Pfam:PF10304,PANTHER:PTHR20959",,,,,,,,
HSP_00003,XIII,631959,G,C,2,2022,0.000989,4151|396,3174|2730|4332,missense_variant,...,Gat/Cat,"Pfam:PF10304,PANTHER:PTHR20959",,,,,,,,
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSP_00425,XVI,99064,T,C,47,2014,0.023000,4168|1432|420|439|2806|3955,2544,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,439.0,YAR1,YPL239W,upstream_gene_variant,420.0
HSP_00426,XVI,99066,G,A,7,2020,0.003465,4166|1430|418|441|2808|3957,2542,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,441.0,YAR1,YPL239W,upstream_gene_variant,418.0
HSP_00427,XVI,99082,A,G,2,2022,0.000989,4150|1414|402|457|2824|3973,2526,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,457.0,YAR1,YPL239W,upstream_gene_variant,402.0
HSP_00428,XVI,99086,A,G,2,2022,0.000989,4146|1410|398|461|2828|3977,2522,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,461.0,YAR1,YPL239W,upstream_gene_variant,398.0


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0
HSP_00006,XIII,631979,G,A,12,2018,0.005946,4171|376,3194|9|2710|4312,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,376.0
HSP_00007,XIII,631980,C,T,53,2018,0.026000,4172|375,3195|10|2709|4311,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,10.0,HSC82,YMR186W,upstream_gene_variant,375.0
HSP_00008,XIII,631981,A,G,25,2020,0.012000,4173|374,3196|11|2708|4310,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,11.0,HSC82,YMR186W,upstream_gene_variant,374.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSP_00425,XVI,99064,T,C,47,2014,0.023000,4168|1432|420|439|2806|3955,2544,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,439.0,YAR1,YPL239W,upstream_gene_variant,420.0
HSP_00426,XVI,99066,G,A,7,2020,0.003465,4166|1430|418|441|2808|3957,2542,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,441.0,YAR1,YPL239W,upstream_gene_variant,418.0
HSP_00427,XVI,99082,A,G,2,2022,0.000989,4150|1414|402|457|2824|3973,2526,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,457.0,YAR1,YPL239W,upstream_gene_variant,402.0
HSP_00428,XVI,99086,A,G,2,2022,0.000989,4146|1410|398|461|2828|3977,2522,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,461.0,YAR1,YPL239W,upstream_gene_variant,398.0


HSP    403
dtype: int64

## Design oligos for initial list, then filter down to variants with at least 2 oligos designed

In [164]:
# Table of designed oligos; its var_id column links each oligo to a variant.
hsp90_oligo_file = os.path.expanduser("~/crispey3/initial_design/Output/all_SNPs_hsp90_initial_GG_9bp_OLIGO.tab")
hsp90_oligo_df = pd.read_csv(hsp90_oligo_file, sep='\t')

# Reload the annotated variants and keep those in an HSP90 gene or in an
# intergenic/promoter region (variants inside other genes are dropped).
hsp90_annotated_path = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_select_AFfilter_annotated.txt"
hsp90_design_oligos = pd.read_csv(hsp90_annotated_path, sep='\t', index_col=0)
intergenic_regions_list = ['bidirectional_promoter', 'unidirectional_promoter', 'intergenic']
region_filter = '(Gene_ID.isin(@hsp90_genes.index) | region.isin(@intergenic_regions_list))'
hsp90_design_oligos = hsp90_design_oligos.query(region_filter).copy()
display(hsp90_design_oligos)

# Number of designed oligos per variant; variants absent from the oligo table get NaN.
oligos_per_variant = hsp90_oligo_df.groupby('var_id').size()
hsp90_design_oligos['num_of_oligos'] = oligos_per_variant
# keep only variants with at least 2 oligos designed
has_multiple_oligos = hsp90_design_oligos['num_of_oligos'] > 1
hsp90_oligos_final = hsp90_design_oligos[has_multiple_oligos].copy()

display(hsp90_oligos_final)


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,Codons,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0
HSP_00006,XIII,631979,G,A,12,2018,0.005946,4171|376,3194|9|2710|4312,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,376.0
HSP_00007,XIII,631980,C,T,53,2018,0.026000,4172|375,3195|10|2709|4311,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,10.0,HSC82,YMR186W,upstream_gene_variant,375.0
HSP_00008,XIII,631981,A,G,25,2020,0.012000,4173|374,3196|11|2708|4310,unidirectional_promoter,...,,,RTP1,YMR185W,downstream_gene_variant,11.0,HSC82,YMR186W,upstream_gene_variant,374.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSP_00425,XVI,99064,T,C,47,2014,0.023000,4168|1432|420|439|2806|3955,2544,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,439.0,YAR1,YPL239W,upstream_gene_variant,420.0
HSP_00426,XVI,99066,G,A,7,2020,0.003465,4166|1430|418|441|2808|3957,2542,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,441.0,YAR1,YPL239W,upstream_gene_variant,418.0
HSP_00427,XVI,99082,A,G,2,2022,0.000989,4150|1414|402|457|2824|3973,2526,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,457.0,YAR1,YPL239W,upstream_gene_variant,402.0
HSP_00428,XVI,99086,A,G,2,2022,0.000989,4146|1410|398|461|2828|3977,2522,bidirectional_promoter,...,,,HSP82,YPL240C,upstream_gene_variant,461.0,YAR1,YPL239W,upstream_gene_variant,398.0


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0,2.0
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0,2.0
HSP_00006,XIII,631979,G,A,12,2018,0.005946,4171|376,3194|9|2710|4312,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,376.0,2.0
HSP_00007,XIII,631980,C,T,53,2018,0.026000,4172|375,3195|10|2709|4311,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,10.0,HSC82,YMR186W,upstream_gene_variant,375.0,2.0
HSP_00008,XIII,631981,A,G,25,2020,0.012000,4173|374,3196|11|2708|4310,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,11.0,HSC82,YMR186W,upstream_gene_variant,374.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSP_00415,XVI,99017,AGGG,GGGG,209,1914,0.109000,4212|1476|464|392|2759|3908,2588,bidirectional_promoter,...,,HSP82,YPL240C,upstream_gene_variant,392.0,YAR1,YPL239W,upstream_gene_variant,464.0,5.0
HSP_00416,XVI,99018,G,A,243,1896,0.128000,4214|1478|466|393|2760|3909,2590,bidirectional_promoter,...,,HSP82,YPL240C,upstream_gene_variant,393.0,YAR1,YPL239W,upstream_gene_variant,466.0,5.0
HSP_00417,XVI,99019,G,GGGGGGGA,2,1998,0.001001,4212|1476|464|394|2761|3910,2588,bidirectional_promoter,...,,HSP82,YPL240C,upstream_gene_variant,394.0,YAR1,YPL239W,upstream_gene_variant,464.0,5.0
HSP_00418,XVI,99021,G,A,1,2014,0.000496,4211|1475|463|396|2763|3912,2587,bidirectional_promoter,...,,HSP82,YPL240C,upstream_gene_variant,396.0,YAR1,YPL239W,upstream_gene_variant,463.0,3.0


## We can also design synthetic variants for natural HSP90 variants. Take every coding variant and design alternative ALT alleles in its place

In [166]:
# Input: the preliminary natural-variant VCF. Output: a VCF of synthetic variants
# that reuses the input header.
hsp90_design_oligos_vcf_file = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_design_oligos_initial.vcf"
hsp90_synthetic_vars_vcf_file = hsp90_design_oligos_vcf_file.replace('.vcf', '_synthetic_vars.vcf')

hsp90_design_oligos_vcf = vcf.Reader(filename=hsp90_design_oligos_vcf_file)
hsp90_synthetic_vars_vcf = vcf.Writer(open(hsp90_synthetic_vars_vcf_file, 'w'), hsp90_design_oligos_vcf)

syn_var_counter = 0
syn_var_prefix = 'HSX'
syn_alt_alleles = ['A', 'C', 'T', 'G']
# base appended after the reference base to form a 1bp insertion: A->C, C->T, T->G, G->A
syn_insert_dict = dict(zip('ACTG', 'CTGA'))


def _write_synthetic_record(natural_record, ref_base, alt_sequence, number):
    # Write one synthetic record at natural_record's position; AC and AF are set to 0,
    # AN and TGT are copied from the natural record.
    synthetic = vcf.model._Record(CHROM=natural_record.CHROM, POS=natural_record.POS,
                                  ID="{}_{:05d}".format(syn_var_prefix, number),
                                  REF=ref_base, ALT=[vcf.model._Substitution(alt_sequence)],
                                  QUAL=1, FILTER='.',
                                  INFO={'AC':[0], 'AF':[0],
                                        'AN':natural_record.INFO['AN'],
                                        'TGT':natural_record.INFO['TGT']},
                                  FORMAT=None, sample_indexes=None)
    hsp90_synthetic_vars_vcf.write_record(synthetic)


# IDs of the final natural variants whose region is not intergenic/promoter
non_intergenic_ids = hsp90_oligos_final.query('~region.isin(@intergenic_regions_list)').index

for record in hsp90_design_oligos_vcf:
    if record.ID not in non_intergenic_ids:
        continue

    # single-base substitutions of the first reference base, skipping the
    # reference base itself and the natural alt allele
    ref_allele = record.REF[0]
    natural_alt = str(record.ALT[0])
    for alt_allele in syn_alt_alleles:
        if alt_allele == ref_allele or alt_allele == natural_alt:
            continue
        syn_var_counter += 1
        _write_synthetic_record(record, ref_allele, alt_allele, syn_var_counter)

    # plus one 1bp insertion when REF and ALT have equal length
    # NOTE(review): this length test also passes for equal-length multi-base
    # substitutions, not only single-base SNPs - confirm that is intended
    if len(record.REF) == len(record.ALT[0]):
        insertion = ref_allele + syn_insert_dict[ref_allele]
        syn_var_counter += 1
        _write_synthetic_record(record, ref_allele, insertion, syn_var_counter)

hsp90_synthetic_vars_vcf.close()


## Annotate synthetic variants with VEP, select subset to complement natural variants in hsp90_oligos_final

In [167]:
# Derive the VEP-output and annotated-table file names from the synthetic-variant VCF,
# then build the annotated table for the synthetic variants.
hsp90_synthetic_vars_vcf_file = hsp90_design_oligos_vcf_file.replace('.vcf', '_synthetic_vars.vcf')
vep_output_filename = hsp90_synthetic_vars_vcf_file.replace('.vcf', '_VEPoutput.txt')
variants_annotated_filename = vep_output_filename.replace('_VEPoutput.txt', '_annotated.txt')
variants_annotated_df = annotate_variants_by_VEPoutput(hsp90_synthetic_vars_vcf_file, 
                                                       vep_output_filename, 
                                                       gff_file, 
                                                       variants_annotated_filename, 
                                                       id_colname='var_id')

# annotated file can be read in directly if annotation function has been run once before
variants_annotated_df = pd.read_csv(variants_annotated_filename, sep='\t', index_col=0)
# give every synthetic variant the oligo count of the natural variant at the same CHROM/POS
for v, row in hsp90_oligos_final.iterrows():
    variants_annotated_df.loc[(variants_annotated_df.CHROM==row['CHROM']) & (variants_annotated_df.POS==row['POS']), 'num_of_oligos'] = row['num_of_oligos']

display(variants_annotated_df)


# Pick synthetic variants to complement each natural variant whose region is not
# intergenic/promoter. Candidates are the synthetic variants at the same CHROM/POS.
syn_vars_to_keep = []
for v, row in hsp90_oligos_final.iterrows():
    if row['region'] not in intergenic_regions_list:
        df = variants_annotated_df.query('CHROM==@row.CHROM & POS==@row.POS')
        # natural synonymous variant: keep the first MODERATE-impact and the first
        # HIGH-impact synthetic variant (up to two per position)
        if row['region'] == 'synonymous_variant':
            syn_vars_to_keep += df.query('Annotation_Impact=="MODERATE"').head(1).index.tolist()
            syn_vars_to_keep += df.query('Annotation_Impact=="HIGH"').head(1).index.tolist()
        # any other natural variant: keep the first LOW-impact synthetic variant
        else:
            syn_vars_to_keep += df.query('Annotation_Impact=="LOW"').head(1).index.tolist()

# display(variants_annotated_df.loc[syn_vars_to_keep])
            
# Append the selected synthetic variants to the final list.
# sort=False is passed explicitly: the column order produced by concat does not matter
# because the columns are re-selected in hsp90_oligos_final order right away, and being
# explicit avoids the pandas FutureWarning about the changing default of `sort`.
hsp90_oligos_final = pd.concat([hsp90_oligos_final, variants_annotated_df.loc[syn_vars_to_keep]], sort=False)[hsp90_oligos_final.columns]
display(hsp90_oligos_final)


Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,CDS_frac,AA_len,AA_frac,cDNA_pos,CDS_pos,AA_pos,Amino_acids,Codons,DOMAINS,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSX_00001,XIII,632471,C,A,0,2018,0.0,4663,3686|501|2218|3820,synonymous_variant,...,0.055241,706.0,0.055241,117.0,117.0,39.0,S,tcC/tcA,"CDD:cd16927,PIRSF:PIRSF002583,Pfam:PF02518,Gen...",2.0
HSX_00002,XIII,632471,C,G,0,2018,0.0,4663,3686|501|2218|3820,synonymous_variant,...,0.055241,706.0,0.055241,117.0,117.0,39.0,S,tcC/tcG,"CDD:cd16927,PIRSF:PIRSF002583,Pfam:PF02518,Gen...",2.0
HSX_00003,XIII,632471,C,CT,0,2018,0.0,4663,3686|501|2217|3819,frameshift_variant,...,,,,,,,,,,2.0
HSX_00004,XIII,632477,T,A,0,2022,0.0,4669,3692|507|2212|3814,synonymous_variant,...,0.058074,706.0,0.058074,123.0,123.0,41.0,A,gcT/gcA,"CDD:cd16927,PIRSF:PIRSF002583,Pfam:PF02518,Gen...",2.0
HSX_00005,XIII,632477,T,C,0,2022,0.0,4669,3692|507|2212|3814,synonymous_variant,...,0.058074,706.0,0.058074,123.0,123.0,41.0,A,gcT/gcC,"CDD:cd16927,PIRSF:PIRSF002583,Pfam:PF02518,Gen...",2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSX_00339,XVI,98464,C,G,0,2002,0.0,,,,...,,,,,,,,,,2.0
HSX_00340,XVI,98464,C,CT,0,2002,0.0,,,,...,,,,,,,,,,2.0
HSX_00341,XVI,98503,C,T,0,2002,0.0,,,,...,,,,,,,,,,2.0
HSX_00342,XVI,98503,C,G,0,2002,0.0,,,,...,,,,,,,,,,2.0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0_level_0,CHROM,POS,REF,ALT,AC,AN,AF,upstream_distance_str,downstream_distance_str,region,...,DOMAINS,closest_gene1_Gene_Name,closest_gene1_Gene_ID,closest_gene1_Annotation,closest_gene1_Distance,closest_gene2_Gene_Name,closest_gene2_Gene_ID,closest_gene2_Annotation,closest_gene2_Distance,num_of_oligos
var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSP_00004,XIII,631977,A,G,56,2022,0.028000,4169|378,3192|7|2712|4314,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,7.0,HSC82,YMR186W,upstream_gene_variant,378.0,2.0
HSP_00005,XIII,631978,CGCATTTGATTATAATTTGCTTCTTAGGCAAAATTAATATTTACGT...,C,6,2022,0.002967,4171|259,3194|9|2593|4195,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,259.0,2.0
HSP_00006,XIII,631979,G,A,12,2018,0.005946,4171|376,3194|9|2710|4312,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,9.0,HSC82,YMR186W,upstream_gene_variant,376.0,2.0
HSP_00007,XIII,631980,C,T,53,2018,0.026000,4172|375,3195|10|2709|4311,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,10.0,HSC82,YMR186W,upstream_gene_variant,375.0,2.0
HSP_00008,XIII,631981,A,G,25,2020,0.012000,4173|374,3196|11|2708|4310,unidirectional_promoter,...,,RTP1,YMR185W,downstream_gene_variant,11.0,HSC82,YMR186W,upstream_gene_variant,374.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSX_00327,XVI,98320,T,G,0,1996,0.000000,4864|2128|1116|2109|3258,3240,frameshift_variant,...,,,,,,,,,,2.0
HSX_00329,XVI,98335,G,A,0,2012,0.000000,4819|2083|1071|2155|3304,3195,missense_variant,...,"Gene3D:3.30.565.10,HAMAP:MF_00505,Pfam:PF02518...",,,,,,,,,3.0
HSX_00330,XVI,98335,G,C,0,2012,0.000000,4818|2082|1070|2155|3304,3194,frameshift_variant,...,,,,,,,,,,3.0
HSX_00334,XVI,98367,C,CT,0,2016,0.000000,4729|1993|981|2245|3394,3105,synonymous_variant,...,"Gene3D:3.30.565.10,HAMAP:MF_00505,Pfam:PF02518...",,,,,,,,,2.0


In [171]:
# Save the final HSP90 table as a tab-separated file
# (a separate script uses this file to organize the oligos into pools).
hsp90_final_table_path = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_final.txt"
hsp90_oligos_final.to_csv(hsp90_final_table_path, sep='\t')

# Merge natural and synthetic variants into one VCF: records are copied from the
# natural-variant VCF first, then from the synthetic-variant VCF, keeping only
# those whose ID is in the final table. The natural VCF also supplies the header.
natural_vcf_path = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_design_oligos_initial.vcf"
synthetic_vcf_path = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_design_oligos_initial_synthetic_vars.vcf"
final_vcf_path = "/home/users/rang/scratch/yeast/hsp90/hsp90_variants_final.vcf"

template = vcf.Reader(filename=natural_vcf_path)
hsp90_variants_final_vcf = vcf.Writer(open(final_vcf_path, 'w'), template)

for source_reader in (template, vcf.Reader(filename=synthetic_vcf_path)):
    for record in source_reader:
        if record.ID in hsp90_oligos_final.index:
            hsp90_variants_final_vcf.write_record(record)

hsp90_variants_final_vcf.close()


## Summary of HSP90 variants

In [172]:
### SUMMARY OF HSP90 SET ###
# Headline counts: one row per variant, per-variant oligo counts summed,
# and the number of distinct (CHROM, POS) pairs.
n_variants = len(hsp90_oligos_final)
print("Number of variants:", n_variants)
n_oligos = int(sum(hsp90_oligos_final['num_of_oligos']))
print("Number of oligos:", n_oligos)
unique_positions = hsp90_oligos_final[['CHROM','POS']].drop_duplicates()
print("Number of positions:", len(unique_positions))
print()

# Variants per gene, restricted to the labels present in hsp90_genes.index
print("Distribution of variants among HSP90 genes")
variants_per_gene = hsp90_oligos_final['Gene_ID'].value_counts()
print(variants_per_gene.filter(hsp90_genes.index).describe())
print()

# Spread of the number of oligos designed per variant
print("Distribution of oligos among HSP90 variants")
print(hsp90_oligos_final['num_of_oligos'].describe())
print()

# Fraction of variants falling in each annotation class
print("Proportion of variant annotation classes")
region_counts = hsp90_oligos_final['region'].value_counts()
print(region_counts/n_variants)


Number of variants: 332
Number of oligos: 781
Number of positions: 173

Distribution of variants among HSP90 genes
count      2.000000
mean     156.500000
std       36.062446
min      131.000000
25%      143.750000
50%      156.500000
75%      169.250000
max      182.000000
Name: Gene_ID, dtype: float64

Distribution of oligos among HSP90 variants
count    332.000000
mean       2.352410
std        0.772973
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        6.000000
Name: num_of_oligos, dtype: float64

Proportion of variant annotation classes
synonymous_variant         0.307229
frameshift_variant         0.271084
missense_variant           0.234940
unidirectional_promoter    0.093373
bidirectional_promoter     0.066265
intergenic                 0.018072
stop_gained                0.009036
Name: region, dtype: float64


## Determine number of pools to use for HSP90 oligos

In [173]:
# count OLIGOS in each variant prefix set
# (rows are grouped by the first three characters of their index label)
oligos_by_prefix = hsp90_oligos_final.groupby(lambda var_id: var_id[:3])['num_of_oligos'].sum()
display(oligos_by_prefix.sort_values(ascending=False))

# determine number of pools to use
num_of_oligos = int(hsp90_oligos_final['num_of_oligos'].sum())
# original note: "allows 12 technical oligos"
# NOTE(review): 121 - 105 = 16, so the count in that note looks off -- confirm
min_pool_size = 105
max_pool_size = 118 # allows 3 technical oligos (121 - 118)
complete_pool_size = 121 # maximum oligos synthesized per pool

# find number of pools to use
# (left as the final bare expression so the notebook shows its return value)
find_pool_size_options(num_of_oligos, min_pool_size, max_pool_size, complete_pool_size)

HSP    419.0
HSX    362.0
Name: num_of_oligos, dtype: float64

Total number of oligos to fit: 781


(6, 7)

Use 6 pools:
Number of oligos per pool: 118
Number of oligos leftover: 73
Number of technical oligos: 18

Use 7 pools:
Number of oligos per pool: 111
Number of oligos leftover: 4
Number of technical oligos: 70



array([6, 7])

## Humanized yeast pilot
Oligos generated from two sets of variant files were combined to produce the final list for synthesis:
- Fritz Roth's curated variant list, taken directly from the hYeast ORF FASTA (priority)
- Shi-An's compiled variants from gnomAD (secondary)

Assemble the oligos so that, for each gene, at most 118 oligos are synthesized.


In [62]:
# Build the combined humanized-yeast oligo table: per gene, all priority
# oligos followed by secondary oligos, de-duplicated and capped at 118 rows,
# then relabelled with consecutive guide IDs and written to disk.

# priority set (fritz roth)
hyeast_oligos_priority_file = os.path.expanduser("~/crispey3/humanized/Output/all_SNPs_humanized_validated_GG_9bp_OLIGO.tab")
hyeast_oligos_priority = pd.read_csv(hyeast_oligos_priority_file, sep='\t')

# secondary set (Shi-An's consolidated list from gnomAD)
hyeast_oligos_secondary_file = os.path.expanduser("~/crispey3/humanized/Output/all_SNPs_humanized_GG_9bp_OLIGO.tab")
hyeast_oligos_secondary = pd.read_csv(hyeast_oligos_secondary_file, sep='\t')
# tag every secondary oligo with the number of oligos that share its var_id
oligos_per_var = hyeast_oligos_secondary.groupby('var_id').size()
hyeast_oligos_secondary['num_of_oligos'] = hyeast_oligos_secondary['var_id'].map(oligos_per_var)
# within each set_name: variants with the most oligos first, ties broken by var_id
hyeast_oligos_secondary = hyeast_oligos_secondary.sort_values(
    by=['set_name', 'num_of_oligos', 'var_id'],
    ascending=[True, False, True],
)


# for each gene in the priority list: priority rows first, then secondary rows,
# keep the first occurrence of each variant/guide combination, stop at 118 rows
dedup_columns = ['chrom', 'SNP_chr_pos', 'REF', 'ALT', 'guide0_chr_pos']

final_oligos_df = []
for gene in hyeast_oligos_priority['set_name'].unique():
    df = hyeast_oligos_priority[hyeast_oligos_priority['set_name'] == gene]
    secondary = hyeast_oligos_secondary[hyeast_oligos_secondary['set_name'] == gene]
    # num_of_oligos was only needed for ordering the secondary rows
    df = pd.concat([df, secondary.drop(columns='num_of_oligos')])
    df = df.drop_duplicates(subset=dedup_columns).iloc[:118]
    final_oligos_df.append(df)

final_oligos_df = pd.concat(final_oligos_df, ignore_index=True)

# relabel guide IDs: rows get consecutive IDs starting at guide_20499, and the
# old ID is substituted wherever it appears in the three ID columns of that row
id_columns = ['guide_id', 'donor_id', 'oligo_id']
guide_num = 20499
for i in final_oligos_df.index:
    old_guide_id = final_oligos_df.at[i, 'guide_id']
    new_guide_id = 'guide_{}'.format(guide_num)
    for column in id_columns:
        final_oligos_df.at[i, column] = str.replace(final_oligos_df.at[i, column], old_guide_id, new_guide_id)
    guide_num += 1

display(final_oligos_df)

# write to file
combined_oligos_file = os.path.expanduser("~/crispey3/humanized/Output/all_SNPs_humanized_combined_GG_9bp_OLIGO.tab")
final_oligos_df.to_csv(combined_oligos_file, sep='\t', index=False)

Unnamed: 0,var_id,chrom,SNP_chr_pos,REF,ALT,guide_id,guide_strand,guide0_chr_pos,guide_cut_chr_pos,SNP_pos_in_guide,...,donor_seq_shift,donor_mut_pos_in_guide,donor_info_str,set_name,contains_excluded_seqs,pool,barcode_seq,barcode_id,oligo_seq,oligo_id
0,DHFR_10007,DHFR,457,A,[T],guide_20499,+,460,457,-4,...,14,-4,REF2ALT:A>T,DHFR,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,DHFR#guide_20499#guide_20499:REF2ALT:offset14:...
1,DHFR_10007,DHFR,457,A,[T],guide_20500,-,449,453,-7,...,-14,-7,REF2ALT:A>T,DHFR,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,DHFR#guide_20500#guide_20500:REF2ALT:offset-14...
2,DHFR_10008,DHFR,237,C,[T],guide_20501,+,241,238,-5,...,14,-5,REF2ALT:C>T,DHFR,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,DHFR#guide_20501#guide_20501:REF2ALT:offset14:...
3,DHFR_10104,DHFR,186,G,[C],guide_20502,-,180,184,-5,...,-14,-5,REF2ALT:G>C,DHFR,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,DHFR#guide_20502#guide_20502:REF2ALT:offset-14...
4,DHFR_00019,chr5,79929796,G,[C],guide_20503,+,79929793,79929790,2,...,14,2,REF2ALT:G>C,DHFR,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,DHFR#guide_20503#guide_20503:REF2ALT:offset14:...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,UROS_00020,chr10,127477526,G,[A],guide_21154,+,127477531,127477528,-6,...,14,-6,REF2ALT:G>A,UROS,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,UROS#guide_21154#guide_21154:REF2ALT:offset14:...
656,UROS_00020,chr10,127477526,G,[A],guide_21155,+,127477534,127477531,-9,...,14,-9,REF2ALT:G>A,UROS,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,UROS#guide_21155#guide_21155:REF2ALT:offset14:...
657,UROS_00029,chr10,127477556,T,[C],guide_21156,+,127477556,127477553,-1,...,14,-1,REF2ALT:T>C,UROS,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,UROS#guide_21156#guide_21156:REF2ALT:offset14:...
658,UROS_00029,chr10,127477556,T,[C],guide_21157,+,127477557,127477554,-2,...,14,-2,REF2ALT:T>C,UROS,False,-1,,,GTTGCAGTTAGCTAACAGGCCATGCNNNNNNNNNNNNGCATGCAGC...,UROS#guide_21157#guide_21157:REF2ALT:offset14:...


In [63]:
# tally how many oligos each gene (set_name) has in the combined table
oligos_per_gene = final_oligos_df.groupby('set_name').size()
display(oligos_per_gene)


set_name
DHFR       70
DPAGT1    118
NSDHL     118
PGK1      118
PKLR      118
UROS      118
dtype: int64

# Write all variant records to new VCF file for final oligo synthesis into library
- ERG/EG*: ergosterol set variants
- GXG: GxG main set variants
- TDH: TDH3 promoter variants
- VAL: Significant hits in at least 3 strains (set 1) and large effect variants identified from at least one strain (set 2)
- GXE: GxE main set variants
- HSP/HSX: HSP90 variants

In [78]:
# # REWRITE THIS
# gxg_known_fitness_vcf_file = "/home/users/rang/crispey3/costanzo_variants/Input/gxg_known_fitness_280820.vcf"

# candVars_allStrains_AFfilter = vcf.Reader(filename=candVars_allStrains_AFfilter_file)
# gxg_known_fitness_vcf = vcf.Writer(open(gxg_known_fitness_vcf_file, 'w'), candVars_allStrains_AFfilter)
# for record in pd.concat([prom_var_list, epi_var_list]):
#     gxg_known_fitness_vcf.write_record(record)
# gxg_known_fitness_vcf.close()

# Organize other control oligos to design
These oligos should have guide and donor sequences that have already been pre-designed:
- Neutral, non-targeting oligos (scrambled donors from old libraries, see "tech_donor_scramble")
- Neutral, targeting oligos (???)
- fitness ladder


In [None]:
# # from here we can try to focus on genes with at least 1 common variant
# num_of_vars_cutoff = 1
# genes_df_filtered = genes_df.query('num_of_vars_common>=@num_of_vars_cutoff')

# # are the genes in either class comparable in terms of their number of variants, or their variant density?
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.num_of_vars.describe()))
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.region_length.describe()))
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.var_density.describe()))

# # visualize genes by their number of variants relative to their length
# fig, ax = plt.subplots(figsize=(8,6))
# genes_df_filtered.query('num_of_vars>0').plot('region_length', 'num_of_vars', kind='scatter', ax=ax)
# ax.tick_params(labelsize=14)
# ax.set_xlabel('gene region length', fontsize=16)
# ax.set_ylabel('number of variants', fontsize=16)
# ax.set_title('number of variants vs gene region length', fontsize=16)
# plt.show()

# # distribution of allele frequencies
# fig, ax = plt.subplots(figsize=(8,6))
# ax.hist(np.log10(final_multiGuides.AF), bins=20)
# ax.tick_params(labelsize=14)
# ax.set_xlabel('log10(AF)', fontsize=16)
# ax.set_ylabel('frequency', fontsize=16)
# ax.set_title('asymmetrical distribution of variant AF', fontsize=16)
# plt.show()

# # distribution of variant annotation categories
# display((final_multiGuides.region.value_counts()/len(final_multiGuides)).head())


# # common variants median AF
# display(final_vars_selected.sort_values('AF').head(1008).AF.median())
# # rare variants median AF
# display(final_vars_selected.sort_values('AF').tail(1008).AF.median())
# # fraction essential
# display(hub_genes.essential.sum()/len(hub_genes))
# # fraction paralogs
# display(hub_genes.paralog.sum()/len(hub_genes))

# # variants in essential genes
# ve = final_vars_selected[final_vars_selected.assoc_gene.apply(lambda x: any([g in genes_df.loc[genes_df.essential].index for g in x]))]
# display(ve)
# # distribution of variant annotation types
# display((ve.region.value_counts()/len(ve)))

In [None]:
# # from here we can try to focus on genes with at least 1 common variant
# num_of_vars_cutoff = 1
# genes_df_filtered = genes_df.query('num_of_vars_common>=@num_of_vars_cutoff')

# # are the genes in either class comparable in terms of their number of variants, or their variant density?
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.num_of_vars.describe()))
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.region_length.describe()))
# display(genes_df_filtered.groupby('gene_class').apply(lambda x: x.var_density.describe()))

# # visualize genes by their number of variants relative to their length
# fig, ax = plt.subplots(figsize=(8,6))
# genes_df_filtered.query('num_of_vars>0').plot('region_length', 'num_of_vars', kind='scatter', ax=ax)
# ax.tick_params(labelsize=14)
# ax.set_xlabel('gene region length', fontsize=16)
# ax.set_ylabel('number of variants', fontsize=16)
# ax.set_title('number of variants vs gene region length', fontsize=16)
# plt.show()

# # distribution of allele frequencies
# fig, ax = plt.subplots(figsize=(8,6))
# ax.hist(np.log10(final_multiGuides.AF), bins=20)
# ax.tick_params(labelsize=14)
# ax.set_xlabel('log10(AF)', fontsize=16)
# ax.set_ylabel('frequency', fontsize=16)
# ax.set_title('asymmetrical distribution of variant AF', fontsize=16)
# plt.show()

# # distribution of variant annotation categories
# display((final_multiGuides.region.value_counts()/len(final_multiGuides)).head())

In [482]:
# cutoff = 1000
# # hub genes
# hub_genes = genes_df.query('gene_class=="hub" & num_of_vars_common<=0.5*num_of_vars').sort_values(['num_of_vars_common', 'num_of_vars_rare'], ascending=False)
# common_vars_cumsum = np.cumsum(hub_genes.num_of_vars_common)
# hub_genes = hub_genes[:(common_vars_cumsum>cutoff).idxmax()]
# print('Number of hub genes selected:', len(hub_genes))

# # peripheral genes
# peripheral_genes = genes_df.query('gene_class=="peripheral"').sort_values(['num_of_vars_common', 'num_of_vars_rare'], ascending=False)
# common_vars_cumsum = np.cumsum(peripheral_genes.num_of_vars_common)
# peripheral_genes = peripheral_genes[:(common_vars_cumsum>cutoff).idxmax()]
# print('Number of peripheral genes selected:', len(peripheral_genes))


# # with selected hub genes and peripheral genes, we iterate over these genes to select variants
# genes_list = hub_genes.index.tolist()# + peripheral_genes.index.tolist()
# var_list = []
# for gene in genes_list:
#     df = final_multiGuides[final_multiGuides['assoc_gene'].apply(lambda x: gene in x)].sort_values(['AF', 'AC'], ascending=False)
#     com_var = df.query('AF>=@af_cutoff_common').index.tolist()
#     rar_var = df.tail(len(com_var)).index.tolist()
    
#     # add selected variants to var_list
#     var_list += com_var
#     var_list += rar_var

# # select final list of variants that will be targeted
# final_vars_selected = final_multiGuides.query('var_id.isin(@var_list)')
# display(final_vars_selected)

# # distribution of variant annotation types
# display((final_vars_selected.region.value_counts()/len(final_vars_selected)))

# # how many oligos are we expecting from this list of variants?
# print("Number of oligos from hub-peripheral gene set:", int(final_vars_selected.num_of_oligos.sum()))

In [None]:
# v_list = ["YOR341W", "YLR430W"]
# fig, axes = plt.subplots(figsize=(12,6), ncols=2, nrows=1)

# for i in range(len(v_list)):
#     v = v_list[i]
#     ax = axes[i]
    
#     test = final[final['gene_group'].apply(lambda x: v in x)]
#     display(test.AF.describe())

#     ax.hist(np.log10(test.AF), bins=np.arange(-3, 0, 0.2))
#     ax.axvline(np.log10(af_cutoff), color='gray')
#     ax.tick_params(labelsize=14)
#     ax.set_xlabel('log10(AF)', fontsize=14)
#     ax.set_ylabel('frequency', fontsize=14)
#     ax.set_title('AF distribution in {}'.format(v), fontsize=16)
#     ax.set_xlim(-3, 0)
#     ax.set_ylim(0, 50)
# plt.show()

In [None]:
# viz = genes_df_filtered[['gene_class', 'positive_interactions', 'negative_interactions', 'num_of_vars_common', 'num_of_vars_rare']].copy()
# viz.loc[:,'num_interactions'] = viz['positive_interactions'] + viz['negative_interactions']
# viz.loc[:,'common_rare_ratio'] = viz['num_of_vars_common'] / viz['num_of_vars_rare']
# # viz.boxplot(column='common_rare_ratio', by='gene_class')

# # visualize genes by their common-rare ratio relative to their number of interactions 
# fig, ax = plt.subplots(figsize=(8,6))
# viz.plot('num_interactions', 'common_rare_ratio', kind='scatter', ax=ax)
# ax.tick_params(labelsize=14)
# ax.set_xlabel('number of interactions', fontsize=16)
# ax.set_ylabel('common-rare ratio', fontsize=16)
# ax.set_title('common-rare ratio vs number of interactions', fontsize=16)
# plt.show()

In [None]:
# # get annotation from annotated VCF from VEP
# variants_final_annotated_file = candVars_allStrains_AFfilter_file.replace(".vcf", "_annotated.vcf")
# variants_final_annotated = pd.Series([rec for rec in vcf.Reader(filename=variants_final_annotated_file)])
# display(variants_final_annotated)


# # how to handle multiple annotations?
# # if coding annotation detected, always take coding annotation
# # if noncoding, then select annotation associated with gene of interest
# noncoding_annotation_type = ['upstream_gene_variant', 'downstream_gene_variant']
# i=0
# for rec in variants_final_annotated:
#     if len(rec.INFO['CSQ'])>1:
#         i+=1
#         print(rec)
#         print([x.split('|')[:8] for x in rec.INFO['CSQ']])
#         var_anno_type = [x.split('|')[1] for x in rec.INFO['CSQ']]
#         if all([anno in noncoding_annotation_type for anno in var_anno_type]):
#             # intergenic
#             # determine which two nearest genes upstream and downstream
#         else:
#             # take coding annotation
#             rec.INFO['CSQ'] = [x for x in rec.INFO['CSQ'] if x.split('|')[1] not in noncoding_annotation_type]
#         print()
#         if i==5:
#             break



# noncoding_annotation_type = ['upstream_gene_variant', 'downstream_gene_variant']
# for rec in variants_final_annotated:
#     if len(rec.INFO['CSQ'])>4:
#         print(rec)
#         print(rec.INFO['CSQ'])
#         print([x for x in rec.INFO['CSQ'] if x.split('|')[1] not in noncoding_annotation_type])

In [None]:
# candVars_allStrains_AFfilter = pd.Series([rec for rec in vcf.Reader(filename=candVars_allStrains_AFfilter_file)])
# assemble = {}
# assemble['CHROM'] = [rec.CHROM for rec in candVars_allStrains_AFfilter]
# assemble['POS'] = [rec.POS for rec in candVars_allStrains_AFfilter]
# assemble['REF'] = [rec.REF for rec in candVars_allStrains_AFfilter]
# assemble['ALT'] = [rec.ALT for rec in candVars_allStrains_AFfilter]
# assemble['AC'] = [rec.INFO['AC'] for rec in candVars_allStrains_AFfilter]
# assemble['AN'] = [rec.INFO['AN'] for rec in candVars_allStrains_AFfilter]
# assemble['AF'] = [rec.INFO['AF'] for rec in candVars_allStrains_AFfilter]

# candVars_allStrains_AFfilter = pd.DataFrame.from_dict(assemble)
# display(candVars_allStrains_AFfilter)

In [None]:
# # counters
# total=0 # total num of variants processed
# edgeVar_count=0 # num of variants skipped due to incomplete sequence window at chromosome edge
# noNGG_count=0 # num of variants without NGG
# nonunique_count=0 # num of variants skipped with nonunique sequence in reference genome
# count_allStrains=0 # num of variants editable in all strains
# count_refOnly=0 # num of variants editable in reference strain only

# threshold = 10 # threshold value for searching NGGs
# window = 91 # window size to check for unique sequence

# # open starting set of potential candidate variants 
# gvcf = vcf.Reader(filename=candVars_file)
# # open candVars_allStrains and candVars_refOnly files for writing
# candVars_allStrains = vcf.Writer(open(candVars_allStrains_file, 'w'), gvcf)
# candVars_refOnly = vcf.Writer(open(candVars_refOnly_file, 'w'), gvcf)

# for record in gvcf:
#     total+=1
#     if total % 10 == 0:
#         print("{} candidate variants parsed.".format(total))
#     edits_in_ref_strain = False
#     edits_in_nonref_strain = False

#     ##### FILTERING #####
#     # 1. Skip variant if near chromosome edge
#     seq_window = ref_genome[record.CHROM][record.POS-window//2-1:record.POS+window//2]
#     if len(seq_window) != window:
#         edgeVar_count+=1
#         continue

#     # 2. is there an 'NGG' sequence around the center of the seq_window?
#     if not ngg_check(sequence=seq_window.seq, threshold=threshold):
#         noNGG_count+=1
#         continue

#     # 3. check genomes if sequence flanking variant (seq_window) is unique
#     unique_seq_status=[search_genome(seq_window.seq, genome) for genome in genomes_list]
#     if unique_seq_status[0]==1: # reference genome
#         edits_in_ref_strain = True
#         if all([i==1 for i in unique_seq_status]): # reference genome + other genomes
#             edits_in_nonref_strain = True
#     else:
#         nonunique_count+=1
#         continue

#     # 4. store variant info according to whether it can be edited in all strains, or just reference strain
#     if edits_in_ref_strain:
#         if edits_in_nonref_strain:
#             count_allStrains+=1
#             candVars_allStrains.write_record(record)
#         else:
#             count_refOnly+=1
#             candVars_refOnly.write_record(record)
#     else:
#         print("Variant skipped: Cannot be edited in reference strain") # sanity check step: should not appear in output as filters in previous steps should remove all such instances
    
    
    
    
# candVars_allStrains.close()
# candVars_refOnly.close()

# print()
# print("Total number of variants parsed:", total)
# print("Variants skipped due to incomplete sequence window:", edgeVar_count)
# print("Variants without NGG site:", noNGG_count)
# print("Variants with non-unique sequence region:", nonunique_count)
# print("Number of variants editable in all strains:", count_allStrains)
# print("Number of variants editable in reference strain only:", count_refOnly)




In [None]:
# as a sanity check, we can inspect the variants in candVar_allStrains to ensure
# they do not appear in our list of known variants for non-reference strains

# def ref_allele_check(record, var_df):
#     '''
#     checks var_df to see if a variant (alt allele) exists at the record's coordinates
#     returns True if no alt alleles found (i.e. ref allele present)
#     '''
#     ref_allele = len(var_df.query('(CHROM==@record.CHROM) & (POS==@record.POS)'))==0
#     return ref_allele

# # consolidated variant files put together with consolidate_(strain_name)var.py
# rm_var_file = "/home/users/rang/yeast/consolidated_vars/RM_variants_complete.txt"
# yps_var_file = "/home/users/rang/yeast/consolidated_vars/YPS128_variants_complete.txt"
# yjm_var_file = "/home/users/rang/yeast/consolidated_vars/YJM789_variants_complete.txt"

# # list containing strain names (also the order which they are processed at each filtering step)
# strains_list = ["BY4742", "RM11-1a", "YPS128", "YJM789"]

# # store variants files in pandas dataframes
# rm_var_df = pd.read_csv(rm_var_file, sep="\t", index_col=0)
# yps_var_df = pd.read_csv(yps_var_file, sep="\t", index_col=0)
# yjm_var_df = pd.read_csv(yjm_var_file, sep="\t", index_col=0)
# variants_df_list = [rm_var_df, yps_var_df, yjm_var_df]


#   # 0. Inspect if variant already exists in RM and/or YPS and/or YJM		# REDUNDANT STEP
#   check_ref_in_variants_dfs = [(record, df) for df in variants_df_list]
#   num_of_cores = min(len(os.sched_getaffinity(0)), len(check_ref_in_variants_dfs))
#   with mp.Pool(num_of_cores) as pool:
#      ref_allele_status = pool.starmap(ref_allele_check, check_ref_in_variants_dfs)
#   if not all(ref_allele_status):
#      #print("At least one strain has non-reference allele at chrom {}, pos {}".format(record.CHROM, record.POS))
#   continue

In [11]:
# # ras genes that Jose is working on
# ras_genes = ['YOR101W',
# 'YNL098C',
# 'YLR310C',
# 'YLL016W',
# 'YBR140C',
# 'YOL081W',
# 'YJL005W',
# 'YGL248W',
# 'YOR360C',
# 'YLR178C',
# 'YIL033C',
# 'YJL164C',
# 'YPL203W',
# 'YKL166C',
# 'YER020W',
# 'YOR107W',
# 'YOR371C',
# 'YAL056W',
# 'YDL035C',
# 'YHR186C',
# 'YJR066W',
# 'YKL203C',
# 'YNL006W',
# 'YPL180W',
# 'YHR205W']