<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/plot-gene-expression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
! git clone https://github.com/sanjaynagi/rna-seq-meta.git

import pandas as pd
import numpy as np
import plotly.express as px

def plotly_gene_fcs(gene_ids, title, plot_type='strip', sort_by='median', fc_path="rna-seq-meta/results/fc_data.tsv.gz", meta_path="rna-seq-meta/config/comparison_metadata.tsv", width=1000, height=None):
  """
  Plot log2 fold changes of the provided AGAP gene IDs across comparisons.

  One row is drawn per gene (labelled "GeneID | GeneName", or just the gene ID
  when no name is available) and one point per comparison, coloured by
  species. The figure is displayed with `fig.show()`; nothing is returned.

  Parameters
  ----------
  gene_ids : str or list-like of str
      AGAP gene ID(s) to plot. A single string is treated as one gene.
  title : str
      Figure title.
  plot_type : str
      'strip' draws a strip plot; any other value draws a box plot.
  sort_by : {'median', 'mean', 'agap'}
      Order genes by median fold change, mean fold change, or gene ID
      (descending in the table handed to plotly).
  fc_path : str
      Tab-separated fold-change table with 'GeneID' and 'GeneName' columns,
      '<comparison>_log2FoldChange' columns, and a final gene-description
      column that is dropped.
  meta_path : str
      Tab-separated comparison metadata. It is left-merged onto the fold
      changes and must provide the 'species', 'resistant', 'susceptible' and
      'country' columns used for colouring and hover text.
  width : int
      Figure width in pixels.
  height : int or None
      Figure height in pixels. When not given, 12 px per plotted point is
      used, capped at 2500.

  Raises
  ------
  ValueError
      If `sort_by` is not a recognised option, or if none of `gene_ids` is
      present in the fold-change table.
  """
  # Fail fast on a bad option instead of hitting an unbound `sort_idxs` later.
  valid_sorts = ('median', 'mean', 'agap')
  if sort_by not in valid_sorts:
    raise ValueError(f"sort_by must be one of {valid_sorts}, got {sort_by!r}")

  # Accept a bare gene ID (e.g. from a form field) as well as a collection.
  if isinstance(gene_ids, str):
    gene_ids = [gene_ids]

  # load metadata
  metadata = pd.read_csv(meta_path, sep="\t")
  # load fold change data and remove gene description column
  fc_data = pd.read_csv(fc_path, sep="\t").iloc[:, :-1]

  fam_fc_data = fc_data[fc_data['GeneID'].isin(gene_ids)].copy()
  if fam_fc_data.empty:
    raise ValueError(f"none of the requested gene IDs were found in {fc_path}: {list(gene_ids)}")

  # Work out the row order on plain NumPy arrays: argsort on a pandas Series
  # can report NaN scores as -1, which would silently pick the wrong row.
  if sort_by == 'agap':
    sort_keys = fam_fc_data['GeneID'].to_numpy()
  else:
    fold_changes = fam_fc_data.drop(columns=['GeneID', 'GeneName'])
    # pandas skips NaN by default, matching np.nanmedian / np.nanmean
    row_scores = fold_changes.median(axis=1) if sort_by == 'median' else fold_changes.mean(axis=1)
    sort_keys = row_scores.to_numpy()
  sort_idxs = np.argsort(sort_keys)[::-1]
  fam_fc_data = fam_fc_data.iloc[sort_idxs, :]

  # Build the axis label, omitting the separator for genes without a name.
  labels = [
      f"{gene_id} | {name}" if name != "" else gene_id
      for gene_id, name in zip(fam_fc_data['GeneID'], fam_fc_data['GeneName'].fillna(""))
  ]

  # Reshape to long format: one row per (gene, comparison).
  long_fc = (
      fam_fc_data
      .assign(Label=labels)
      .drop(columns=['GeneName', 'GeneID'])
      .melt(id_vars='Label', var_name='comparison', value_name='log2FC')
  )
  long_fc['comparison'] = long_fc['comparison'].str.replace("_log2FoldChange", "", regex=False)
  # The merge keys are whatever columns both tables share
  # (presumably just 'comparison' -- verify against the metadata file).
  long_fc = long_fc.merge(metadata, how='left')
  long_fc['log2FC'] = -long_fc['log2FC'] # invert the FCs (currently > 0 log2FC = overexpression in susceptible)

  if not height:
    height = min(long_fc.shape[0] * 12, 2500)

  my_plot = px.strip if plot_type == 'strip' else px.box
  fig = my_plot(
      long_fc,
      y='Label',
      x='log2FC',
      color='species',
      title=title,
      hover_data=['resistant', 'susceptible', 'species', 'country'],
      width=width,
      height=height,
      template='ggplot2'
  )
  # `title_font` replaces the deprecated `titlefont` layout attribute.
  # The x-axis range is fixed, so points outside [-4, 6] start out of view.
  fig.update_layout(title_font=dict(size=20), xaxis_range=[-4, 6], xaxis_title="log2 Fold Change", yaxis_title="Gene")
  fig.add_vline(0, line_width=1, line_dash="dash", line_color="grey")
  fig.show()

fatal: destination path 'rna-seq-meta' already exists and is not an empty directory.


In [None]:
#@title **RNA-Seq-Meta** { run: "auto" }
# Colab form cell: GeneID and plot_type are exposed as form fields through
# their #@param annotations.
GeneID = "AGAP006227" #@param {type:"string"} 
plot_type = "strip" #@param ['strip', 'boxplot']

# Plot the selected gene; a single gene ID string is passed as gene_ids.
plotly_gene_fcs(gene_ids=GeneID, title="", plot_type=plot_type, height=300)


This notebook produces interactive strip and boxplots with plotly, to summarise gene expression across *An. gambiae* RNA-Sequencing experiments. Still in development. 
  
Currently *An. gambiae* is not split into *gambiae* and *coluzzii*. You can toggle which species are displayed by clicking the legend. Because *An. funestus* experiments are included, only genes with orthologs are present.

Requesting feedback and ideas.

In [30]:
# Strip plot for three genes, with rows ordered by gene ID.
plotly_gene_fcs(
    gene_ids=["AGAP006222", "AGAP006227", "AGAP006228"],
    title="Coeae1f",
    plot_type='strip',
    sort_by='agap',
    height=300,
)

You can also produce a boxplot, although the hover text doesn't work properly.

In [None]:
# Box-plot view of two genes (any plot_type other than 'strip' selects px.box).
plotly_gene_fcs(
    gene_ids=["AGAP006227", "AGAP006228"],
    title="Coeae1f",
    plot_type='boxplot',
    height=300,
)

In [3]:
ls

[0m[01;34mrna-seq-meta[0m/  [01;34msample_data[0m/


In [12]:
# Genes of interest used by the exploratory cells below.
gene_ids = [
    "AGAP006227",
    "AGAP006228",
    "AGAP004707",
]

In [13]:
# Read the sample metadata table.
metadata = pd.read_csv("rna-seq-meta/config/sample_metadata.tsv", sep="\t")
# Read the fold-change table and discard its final (gene description) column.
fc_data = pd.read_csv("rna-seq-meta/results/fc_data.tsv.gz", sep="\t").iloc[:, :-1]
#pval_data = pd.read_csv("rna-seq-meta/results/pval_data.tsv", sep="\t")

# Keep an independent copy of the rows belonging to the genes of interest.
fam_fc_data = fc_data[fc_data["GeneID"].isin(gene_ids)].copy()

In [24]:

# Positions that would put the selected rows in ascending gene-ID order.
np.argsort(fam_fc_data.loc[:, 'GeneID'].values)

array([1, 0])

In [19]:
# Rank the selected genes by their median fold change across comparisons
# (NaNs ignored), largest first, then show the rows in that order.
sort_idxs = np.argsort(
    fam_fc_data
    .set_index(['GeneID', 'GeneName'])
    .apply(np.nanmedian, axis=1)
).values[::-1]
fam_fc_data.iloc[sort_idxs, :]

Unnamed: 0,GeneID,Tiefora_v_Ngousso_log2FoldChange,Gou_v_Moz_log2FoldChange,Ban_v_BanS_log2FoldChange,BanRe_v_BanS_log2FoldChange,Bak_v_Kisumu_log2FoldChange,VK7_v_Kisumu_log2FoldChange,Cameroon_v_Ngousso_log2FoldChange,Chad_v_Ngousso_log2FoldChange,Niger_v_Ngousso_log2FoldChange,...,Bouake_gamb_unexp_v_Kisumu_log2FoldChange,Bouake_colu_exp_v_Ngousso_log2FoldChange,Bouake_colu_unexp_v_Ngousso_log2FoldChange,ContTia_v_Ngousso_log2FoldChange,DeltTia_v_Ngousso_log2FoldChange,MalaTia_v_Ngousso_log2FoldChange,PiriTia_v_Ngousso_log2FoldChange,BusiaParental_v_Kisumu_log2FoldChange,BusiaSurvivors_v_Kisumu_log2FoldChange,GeneName
2178,AGAP006227,0.46,0.04,1.08,0.32,-0.29,-0.05,0.33,0.05,0.04,...,-2.59,-0.26,-0.68,-1.66,-0.04,-1.58,-1.14,-0.8,-0.67,
1921,AGAP006228,0.44,0.17,0.51,0.06,0.23,0.26,0.12,-0.37,-0.31,...,0.08,0.64,0.71,-0.33,0.09,-0.55,-0.38,-0.64,-0.84,COEAE2F


In [None]:
  # Median log2 fold change per gene across all comparisons, highest first.
  # GeneID/GeneName are moved into the index so that only the numeric
  # fold-change columns enter the median; left as columns, their string values
  # make a row-wise np.nanmedian raise a TypeError.
  fc_medians = (
      fc_data
      .set_index(['GeneID', 'GeneName'])
      .median(axis=1)  # skips NaN by default, like np.nanmedian
      .to_frame('median log2 Fold Change')
  )
  fc_medians.sort_values('median log2 Fold Change', ascending=False)