In [None]:
!pip install anoexpress -q

![image](https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/docs/logo.png)


In [None]:
import anoexpress as xpress
import pandas as pd

In [None]:
#@title **AnoExpress** { run: "auto" }
#@markdown This notebook produces interactive strip plots and boxplots with plotly, to summarise gene expression across 35 *Anopheles* insecticide-resistant vs. susceptible RNA-Sequencing experiments. There are four different analyses to select from:
#@markdown 'gamb_colu', 'gamb_colu_arab', 'gamb_colu_arab_fun' and 'fun',
#@markdown each of which integrates a different combination of *Anopheles* species – ‘gamb_colu’ (*An. gambiae* and *An. coluzzii*), ‘gamb_colu_arab’ (adding *An. arabiensis*), ‘gamb_colu_arab_fun’ (adding *An. funestus*), and finally ‘fun’ (*An. funestus* alone).
#@markdown As you include more species, there are fewer genes overall in the dataset, as we can only include genes with one-to-many orthologs between species.

analysis = "gamb_colu_arab_fun" #@param ['gamb_colu', 'gamb_colu_arab', 'gamb_colu_arab_fun', 'fun']
GeneID = "AGAP002865" #@param {type:"string"}
plot_type = "strip" #@param ['strip', 'boxplot']
include_microarray_data = True #@param {type:'boolean'}

# Draw the expression plot for the analysis, gene and plot type chosen in the form fields above.
xpress.plot_gene_expression(
    analysis=analysis,
    gene_id=GeneID,
    microarray=include_microarray_data,
    title="",
    plot_type=plot_type,
    height=300)

We can provide a list of genes, and sort by either AGAP identifier, or by the median fold-change. We can also filter out values that are not significant based on a `pvalue_filter` parameter.

In [None]:
def _melt_by_species(wide, species, value_name):
  """Reshape a genes x columns table into long form with a species label per row.

  `wide` is transposed so that each of its columns becomes a row, `species`
  (a list matched to those columns by position) is attached, and the result
  is melted to the columns `species`, `GeneID` and `value_name`.
  """
  return (
    wide.T
    .assign(species=species)
    .reset_index(drop=True)
    # name the gene column explicitly instead of relying on the frame's columns.name
    .melt(id_vars='species', var_name='GeneID', value_name=value_name)
  )


def _log2_group_stats(long_df, value_col, suffix):
  """Median, mean and sd of `value_col` per (species, GeneID), returned as 2**statistic.

  The statistics are computed on the stored values and then used as exponents
  of 2, i.e. the values are treated as being on a log2 scale.
  """
  stats = long_df.groupby(['species', 'GeneID'])[value_col].agg(['median', 'mean', 'std'])
  stats.columns = [f'median_{suffix}', f'mean_{suffix}', f'sd_{suffix}']
  return 2 ** stats


def summarise_expression(analysis, gene_id, microarray):
  """Summarise expression of one or more genes per species.

  Parameters
  ----------
  analysis : passed through to `xpress.data`, `xpress.metadata` and
    `xpress.sample_metadata`.
  gene_id : gene identifier(s) passed through to `xpress.data`.
  microarray : passed through to `xpress.data` and `xpress.metadata`.

  Returns
  -------
  pandas.DataFrame indexed by (species, GeneID), rounded to 2 decimals, with
  columns:
    up_sig / down_sig : number of entries with value > 0 / < 0 in the 'fcs'
      table whose matching 'pvals' entry is < 0.05
    count : number of 'fcs' entries for that species and gene
    median_fc, mean_fc, sd_fc : 2**(median/mean/sd) of the 'fcs' values
    median_count, mean_count, sd_count : 2**(median/mean/sd) of the
      'log2counts' values

  Raises
  ------
  ValueError if the 'fcs' and 'pvals' tables do not list the same genes in
  the same order, since the two are combined row by row.
  """
  # the three tables are requested with identical arguments; only data_type differs
  request = dict(analysis=analysis, gene_id=gene_id, microarray=microarray)
  fc_data = xpress.data(data_type='fcs', **request)
  pval_data = xpress.data(data_type='pvals', **request)
  count_data = xpress.data(data_type='log2counts', **request)

  # species labels: `metadata` is used for the fcs/pvals columns and
  # `sample_metadata` for the log2counts columns, both matched by position
  # NOTE(review): assumes metadata rows are in the same order as the data columns — confirm
  metadata = xpress.metadata(analysis=analysis, microarray=microarray)
  sample_metadata = xpress.sample_metadata(analysis=analysis)
  comparison_species = metadata.species.to_list()

  fc_long = _melt_by_species(fc_data, comparison_species, 'fc')
  pval_long = _melt_by_species(pval_data, comparison_species, 'pval')
  count_long = _melt_by_species(count_data, sample_metadata.species.to_list(), 'count')

  # fold changes and p-values are paired row by row below, so the rows must line up
  if not fc_long['GeneID'].equals(pval_long['GeneID']):
    raise ValueError("'fcs' and 'pvals' tables do not list the same genes in the same order")

  # flag significant up / down entries (NaN compares as False on both sides)
  significant = pval_long['pval'] < 0.05
  flags = fc_long[['species', 'GeneID']].assign(
    up_sig=(fc_long['fc'] > 0) & significant,
    down_sig=(fc_long['fc'] < 0) & significant,
  )

  # the total stays in a column named 'count': plot_donut_summary reads it under that name
  sig_summary = flags.groupby(['species', 'GeneID']).agg(
    up_sig=('up_sig', 'sum'),
    down_sig=('down_sig', 'sum'),
    count=('up_sig', 'count'),
  )

  expr_summary = pd.concat(
    [
      sig_summary,
      _log2_group_stats(fc_long, 'fc', 'fc'),
      _log2_group_stats(count_long, 'count', 'count'),
    ],
    axis=1,
  )

  return expr_summary.round(2)

In [None]:
# Summarise two genes in the 'gamb_colu_arab' analysis with the microarray option switched on,
# then display the resulting table as the cell output.
expr_summary = summarise_expression(analysis='gamb_colu_arab', gene_id=["AGAP006227", "AGAP006228"], microarray=True)
expr_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,up_sig,down_sig,count,median_fc,mean_fc,sd_fc,median_count,mean_count,sd_count
species,GeneID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
arabiensis,AGAP006227,4,0,15,1.08,1.26,1.56,373.51,334.71,1.46
arabiensis,AGAP006228,5,0,15,1.15,1.16,1.25,760.08,646.63,1.69
coluzzii,AGAP006227,8,7,31,1.04,1.45,2.3,372.22,432.61,2.14
coluzzii,AGAP006228,16,6,31,1.32,1.32,1.65,754.83,844.19,1.93
gambiae,AGAP006227,7,0,8,1.65,2.25,1.85,319.57,410.59,2.31
gambiae,AGAP006228,3,0,8,1.19,1.15,1.23,590.18,563.87,1.26


In [None]:
# Save the summary table as a tab-separated file in the current working directory.
expr_summary.to_csv("coeae1f_2f.expr.summary.tsv", sep="\t")

In [None]:
import plotly.graph_objects as go

In [None]:
def plot_donut_summary(expr_summary, species, GeneID):
  """Show a donut chart of significant up / down / non-significant counts for one gene.

  Parameters
  ----------
  expr_summary : DataFrame with `species` and `GeneID` as index levels or
    columns, and the columns `up_sig`, `down_sig` and `count`.
  species : species label to select.
  GeneID : gene identifier to select.

  Raises
  ------
  IndexError if `expr_summary` has no row for the requested species and gene.
  """
  # select on the requested gene (previously a fixed gene ID was used and `GeneID` was ignored);
  # @-references pass the values as variables, so no string quoting is involved
  matches = expr_summary.query("species == @species and GeneID == @GeneID")
  if matches.empty:
    raise IndexError(f"no entry for species '{species}' and gene '{GeneID}' in expr_summary")

  up_sig, down_sig, total = matches[['up_sig', 'down_sig', 'count']].to_numpy()[0]
  # the third slice is whatever remains of the total after the significant entries
  values = [up_sig, down_sig, total - (up_sig + down_sig)]
  labels = ['over-expressed', 'under-expressed', 'non-significant']
  colors = ['gold', 'yellow', 'darkslategray']

  # Use `hole` to create a donut-like pie chart
  fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
  fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                    marker=dict(colors=colors, line=dict(color='#000000', width=2)))
  fig.show()

In [None]:
import numpy as np