In [10]:
import pandas as pd

In [11]:
#Annotations from VEP for variants in all_variants.vcf
#this notebook trims downand processes these annotations to those relevant
#to the genes we are looking at, namely the ergosterol pathway genes
#and the genes within QTLs we chose. Without this trimming, each variant
#would have several annotations relative to multiple genes, which makes 
#downstream analysis more challenging.

raw_annotations = pd.read_csv('../data_tables/annotationsfull.txt', header=0, sep = '\t')

In [12]:
#list of ergosterol pathway genes used for library design
erg_genes = ['ERG10','ERG13','HMG1','HMG2','ERG12', 'ERG8',\
             'MVD1','IDI1','ERG20','ERG9','ERG1','ERG7','ERG11',\
             'NCP1','ERG24','ERG25','ERG26','ERG27','ERG28','ERG29',\
             'ERG6','ERG2','ERG3','ERG5','ERG4','MCR1']

In [13]:
#renaming and reshaping columns for future use merging into fitness dataframes

#Uploaded variation in this case was the variant IDs for our library
raw_annotations['var_id'] = raw_annotations['#Uploaded_variation']

#Splitting up the location column to get the chromosome number
raw_annotations['chrom'] = raw_annotations['Location'].str.split(':').str[0]

#Splitting up the location column to get the position of the variant along the chromosome
raw_annotations['SNP_chr_pos'] = raw_annotations['Location'].str.split(':').str[1].str.split('-').str[0]

#Allele in VEP parlance is the alternate allele relative to the s288C reference
raw_annotations['ALT'] = raw_annotations['Allele'] 

#SYMBOL is the column for the gene name, if the gene has a name
raw_annotations['Gene'] = raw_annotations['SYMBOL']

#If the gene does not have an official gene name, the ENSP column is the systematic name,
#e.g. YPR175W, and this is used as the gene column 
raw_annotations.loc[raw_annotations['Gene']=='-','Gene'] = raw_annotations['ENSP']

#Subsetting to relevant columns
annos = raw_annotations[['var_id','chrom','SNP_chr_pos', 'Gene', 'Consequence','ALT', 'CDS_position','Protein_position', 'Amino_acids', 'DISTANCE']]

In [14]:
#For variants within genes, the dsitance is annotated as a dash,
#but we want to use these distances as integers, so we set these to 0.
annos.loc[annos['DISTANCE']=='-', 'DISTANCE']=0

#converting distance to gene into an integer
annos['DISTANCE'] = annos['DISTANCE'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
#getting anotations for variants relative to the ergosterol pathway.
ergosterol_annotations = annos.loc[annos['Gene'].isin(erg_genes)]
ergosterol_annotations.to_csv('../data_tables/ergosterol_annotations.tsv', sep = '\t')

In [18]:
#Getting annotations for genes within the caffeine, cobalt chloride,
#and fluconazole QTL pools 

#loading in oligo information mastersheet
oli_info = pd.read_csv('/home/shian/CRISPEY3_fraserserver/tables/oligos_nonuniq_crispey3_GG_9bp_OLIGO_with_seq_primers.txt', sep = '\t')

#loading in assignment of each well to each library
libraries= pd.read_excel("/home/shian/CRISPEY3_fraserserver/tables/crispey3_libraries_by_pool_number.xlsx")

#loading in QTL information for relevant QTLs from Bloom 2019
qtl_list = pd.read_csv('../data_tables/qtl_list.tsv', sep= '\t')


#shrinking annotation file so that there is one (most relevant) annotation per variant

#getting single gene QTL genes for relevant conditions
targeted_genes = qtl_list[qtl_list['trait'].isin(['Caffeine;15mM;2','Fluconazole;100uM;2','Cobalt_Chloride;2mM;2'])]['Gene ID'].unique().tolist()

#subsetting annotations to those relative to QTL genes
annos = annos[annos['Gene'].isin(targeted_genes)]
#for noncoding variants, keeping the one with the shortest distance to a gene within the set of targeted genes
annos = annos[annos['Gene']!='-']
annos['mindist'] =annos.groupby('var_id')['DISTANCE'].transform('min')
annos = annos[(annos['DISTANCE']==annos['mindist'])]
annos.to_csv('../data_tables/QTL_pool_annotations.tsv',sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)
