<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/plot-families-expression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
! git clone https://github.com/sanjaynagi/rna-seq-meta.git

import pandas as pd
import numpy as np
import plotly.express as px


def plotly_gene_family_fcs(gene_identifier, title, plot_type='strip', sort_by='median', fc_path="rna-seq-meta/results/fc_data.tsv.gz", meta_path="rna-seq-meta/config/comparison_metadata.tsv", width=1000, height=None):
    """
    Plot log2 fold changes of all genes annotated with a Pfam domain or GO term.

    Parameters
    ----------
    gene_identifier : str or list of str
        Pfam domain name(s) or GO term(s); passed unchanged to
        ``gene_ids_from_annotation`` to select the genes to plot.
    title : str
        Figure title.
    plot_type : str
        ``'strip'`` draws a strip plot; any other value draws a box plot.
    sort_by : {'median', 'mean', 'agap'}
        Row ordering: by median fold change, by mean fold change, or by
        gene identifier.
    fc_path : str
        Tab-separated fold-change table. It must contain ``GeneID`` and
        ``GeneName`` columns; its last column is dropped before plotting.
    meta_path : str
        Tab-separated comparison metadata, left-joined onto the fold changes.
        The plot reads its ``species``, ``resistant``, ``susceptible`` and
        ``country`` columns.
    width : int
        Figure width in pixels.
    height : int, optional
        Figure height in pixels. When unset, 5 px per plotted data point,
        capped at 2500 px.

    Raises
    ------
    ValueError
        If ``sort_by`` is not one of the supported options.
    """
    valid_sorts = ('median', 'mean', 'agap')
    if sort_by not in valid_sorts:
        # Fail before any file I/O rather than with a NameError further down.
        raise ValueError(f"sort_by must be one of {valid_sorts}, got {sort_by!r}")

    # Pfam hits are whitespace-delimited, GO annotations are tab-delimited;
    # neither file has a header row.
    pfam_df = pd.read_csv("rna-seq-meta/resources/Anogam_long.pep_Pfamscan.seqs.gz", sep=r"\s+", header=None)
    go_df = pd.read_csv("rna-seq-meta/resources/Anogam_long.pep_eggnog_diamond.emapper.annotations.GO.gz", sep="\t", header=None)
    pfam_df.columns = ["transcript", "pstart", "pend", "pfamid", "domain", "domseq"]
    go_df.columns = ['transcript', 'GO_terms']

    # NOTE(review): this is an inner join on 'transcript', so transcripts
    # present in only one of the two annotation files are dropped - confirm
    # that is intended.
    gene_annot_df = pfam_df.merge(go_df)
    # Derive the gene ID from the transcript ID: strip the "Anogam_" prefix
    # literally and the "-RA"/"-RB"/... suffix as a regular expression. The
    # regex flag is explicit because the pandas default changed to False.
    gene_annot_df['gene_id'] = (
        gene_annot_df['transcript']
        .str.replace("Anogam_", "", regex=False)
        .str.replace("-R[A-Z]", "", regex=True)
    )
    gene_ids = gene_ids_from_annotation(gene_annot_df, gene_identifier)

    # load metadata
    metadata = pd.read_csv(meta_path, sep="\t")
    # load fold change data and remove gene description column (the last one)
    fc_data = pd.read_csv(fc_path, sep="\t").iloc[:, :-1]

    fam_fc_data = fc_data[fc_data['GeneID'].isin(gene_ids)].copy()

    # Build one sort key per gene, then order rows by descending key.
    if sort_by == 'agap':
        sort_keys = fam_fc_data['GeneID'].to_numpy()
    else:
        summarise = np.nanmedian if sort_by == 'median' else np.nanmean
        per_gene_fcs = fam_fc_data.set_index(['GeneID', 'GeneName'])
        # Sort on a plain ndarray so the positions index the rows directly.
        sort_keys = per_gene_fcs.apply(summarise, axis=1).to_numpy()
    fam_fc_data = fam_fc_data.iloc[np.argsort(sort_keys)[::-1], :]

    # Label rows "GeneID | GeneName", falling back to the ID alone when the
    # gene has no name.
    labels = [
        gene_id + " | " + gene_name if gene_name != "" else gene_id
        for gene_id, gene_name in zip(fam_fc_data['GeneID'].fillna(""), fam_fc_data['GeneName'].fillna(""))
    ]
    # Reshape to long format: one row per (gene, comparison).
    long_fc = (
        fam_fc_data.assign(Label=labels)
        .drop(columns=['GeneName', 'GeneID'])
        .melt(id_vars='Label', var_name='comparison', value_name='log2FC')
    )
    long_fc['comparison'] = long_fc['comparison'].str.replace("_log2FoldChange", "", regex=False)
    # Joins on whatever columns the two tables share
    # (presumably 'comparison' - verify against the metadata file).
    long_fc = long_fc.merge(metadata, how='left')
    long_fc['log2FC'] *= -1  # invert the FCs (currently > 0 log2FC = overexpression in susceptible)

    if not height:
        height = min(long_fc.shape[0] * 5, 2500)

    make_plot = px.strip if plot_type == 'strip' else px.box
    fig = make_plot(
        long_fc,
        y='Label',
        x='log2FC',
        color='species',
        title=title,
        hover_data=['resistant', 'susceptible', 'species', 'country'],
        width=width,
        height=height,
        template='ggplot2',
    )
    # `title_font` replaces the deprecated `titlefont` layout alias.
    fig.update_layout(title_font=dict(size=20), xaxis_range=[-4, 6], xaxis_title="log2 Fold Change", yaxis_title="Gene")
    # Dashed reference line at no change.
    fig.add_vline(0, line_width=1, line_dash="dash", line_color="grey")
    fig.show()

def gene_ids_from_annotation(gene_annot_df, annotation):
    """
    Return the gene IDs annotated with the given Pfam domain(s) or GO term(s).

    Parameters
    ----------
    gene_annot_df : pandas.DataFrame
        Annotation table with ``gene_id``, ``domain`` and ``GO_terms`` columns.
    annotation : str or iterable of str
        A single term, or several terms. A term starting with ``"GO"`` is
        searched for as a pattern within ``GO_terms``; any other term must
        equal ``domain`` exactly.

    Returns
    -------
    numpy.ndarray
        For a single term, the ``gene_id`` of every matching row, in table
        order (duplicates kept). For several terms, the sorted unique gene
        IDs matching any of them; empty if no terms are given.
    """
    if isinstance(annotation, str):
        return _gene_ids_for_term(gene_annot_df, annotation)

    matches = [_gene_ids_for_term(gene_annot_df, term) for term in annotation]
    if not matches:
        return np.array([], dtype=object)
    return np.unique(np.concatenate(matches))


def _gene_ids_for_term(gene_annot_df, term):
    """Return the ``gene_id`` of every row matching one GO term or domain name."""
    if term.startswith("GO"):
        # Rows without GO annotations (NaN) never match.
        mask = gene_annot_df['GO_terms'].str.contains(term, na=False)
    else:
        mask = gene_annot_df['domain'] == term
    return gene_annot_df.loc[mask, 'gene_id'].to_numpy()


fatal: destination path 'rna-seq-meta' already exists and is not an empty directory.


**Across gene families, pfam domains and GO terms**

In this notebook, we plot expression fold changes for all genes annotated with a given Pfam domain or GO term.

If you have ideas for gene sets to use or for improvements to the plots, please let me know :)

In [67]:
plotly_gene_family_fcs(gene_identifier="GO:0019825", title="oxygen binding", plot_type='strip', sort_by='median', height=400)


The default value of regex will change from True to False in a future version.



**All gene families linked to insecticide resistance**

In [68]:
# Gene families linked to insecticide resistance, keyed by display name.
# Each value is the Pfam domain name (or list of names) used to select the
# family's genes; a list selects genes carrying any of the listed domains.
gene_fams = {
    'CSP': 'OS-D',
    'Cytochrome P450s': 'p450',
    'GSTs': ['GST_N', 'GST_N_3', 'GST_C'],
    'ABC-transporters': ['ABC_membrane', 'ABC_tran'],
    'Carboxylesterases': 'COesterase',
    'UGTs': 'UDPGT',
    'Odorant binding proteins': 'PBP_GOBP',
    'Olfactory receptors': '7tm_6',
    'Ionotropic receptors': ['Lig_chan', '7tm_1'],
    'Gustatory receptors': '7tm_7',
    'Fatty acid synthases': 'ketoacyl-synt',
    'FA Elongase': 'ELO',
    'FA desaturase': 'FA_desaturase',
    'FA reductase': 'NAD_binding_4',
}

# One strip plot per family, titled with the family name and sorted by
# median fold change.
for name, domain in gene_fams.items():
    plotly_gene_family_fcs(gene_identifier=domain, title=name, plot_type='strip', sort_by='median')


The default value of regex will change from True to False in a future version.

