<a href="https://colab.research.google.com/github/sanjaynagi/Ano-expressIR/blob/main/workflow/notebooks/plot-gene-expression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![image](https://raw.githubusercontent.com/sanjaynagi/ano-expressir/main/docs/logo.png) 


In [None]:
def plot_gene_expression(gene_id, title, analysis="gamb_colu_arab_fun", plot_type='strip', sort_by='agap', width=1600, height=None):
    """Plot fold changes and read counts for the given genes.

    Downloads the fold-change and log2-count tables for the chosen analysis,
    plus the comparison and sample metadata, from the ano-expressir GitHub
    repository. Builds a two-panel plotly figure: log2 fold change per
    comparison on the left, counts per sample on the right, both coloured
    by species.

    Parameters
    ----------
    gene_id : str or list
      An AGAP identifier or list of AGAP identifiers (AFUN identifiers if
      analysis == 'fun').
    title : str
      Plot title
    analysis : {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
      Which analysis to load gene expression data for. Analyses with more
      species will have fewer genes present, due to the process of finding
      orthologs.
    plot_type : {"strip", "boxplot"}, optional
      'boxplot' draws box plots; any other value draws strip plots.
    sort_by : {"median", "mean", "agap"}, optional
      Order genes by the median/mean of their fold changes, or by gene
      identifier.
    width : int
      Width in pixels of the plotly figure
    height : int, optional
      Height in pixels of the plotly figure. When omitted (or 0) it is set to
      12 pixels per plotted fold-change row, capped at 2500.

    Returns
    -------
    plotly.graph_objects.Figure
      A figure with one row and two columns of subplots.

    Raises
    ------
    ValueError
      If `sort_by` is not one of "median", "mean" or "agap".
    """
    # Check sort_by before the plotting imports and the remote downloads, so a
    # typo fails immediately with a clear message rather than surfacing later
    # as an unassigned-variable error.
    valid_sort_keys = ("median", "mean", "agap")
    if sort_by not in valid_sort_keys:
        raise ValueError(f"sort_by must be one of {valid_sort_keys}, got {sort_by!r}")

    import pandas as pd
    import numpy as np
    import plotly.express as px
    import plotly.subplots as sp

    # Treat a single identifier as a one-element list for the membership query
    if isinstance(gene_id, str):
        gene_id = [gene_id]

    repo_url = "https://raw.githubusercontent.com/sanjaynagi/ano-expressir/main"
    # NOTE(review): the last column of the fold-change table is dropped here;
    # presumably it is not a fold-change column - confirm against the file.
    fc_data = pd.read_csv(f"{repo_url}/results/fcs.{analysis}.tsv", sep="\t").iloc[:, :-1]
    count_data = pd.read_csv(f"{repo_url}/results/log2counts.{analysis}.tsv", sep="\t")
    comp_metadata = pd.read_csv(f"{repo_url}/config/comparison_metadata.tsv", sep="\t")
    sample_metadata = pd.read_csv(f"{repo_url}/config/sample_metadata.tsv", sep="\t").rename(columns={'colData': 'sampleID'})

    fam_fc_data = fc_data.query("GeneID in @gene_id").copy()
    fam_count_data = count_data.query("GeneID in @gene_id").copy()

    if sort_by == 'agap':
        # Reverse alphabetical order of the gene identifiers
        sort_idxs = np.argsort(fam_fc_data['GeneID'].values)[::-1]
    else:
        summarise = np.nanmedian if sort_by == 'median' else np.nanmean
        per_gene = fam_fc_data.set_index(['GeneID', 'GeneName']).apply(summarise, axis=1)
        # Sort the raw array: Series.argsort maps NaN entries to position -1,
        # whereas NumPy orders them last.
        sort_idxs = np.argsort(per_gene.to_numpy())

    fam_fc_data = fam_fc_data.iloc[sort_idxs, :].copy()
    # Put the count rows in the same gene order as the fold-change rows
    fam_count_data = (
        fam_count_data.set_index("GeneID")
        .loc[fam_fc_data['GeneID'].to_list(), :]
        .reset_index()
    )

    # Axis label is "GeneID | GeneName", or just the GeneID when there is no name
    fam_fc_data['Label'] = [
        id_ + " | " + name if name != "" else id_
        for id_, name in zip(fam_fc_data['GeneID'].fillna(""), fam_fc_data['GeneName'].fillna(""))
    ]
    fam_fc_data = fam_fc_data.drop(columns=['GeneName', 'GeneID']).melt(id_vars='Label', var_name='comparison', value_name='log2FC')
    fam_count_data = fam_count_data.melt(id_vars='GeneID', var_name='sampleID', value_name='log2_counts')
    fam_fc_data['comparison'] = fam_fc_data['comparison'].str.replace("_log2FoldChange", "")
    # Both merges join on whatever column names the two tables share
    fam_fc_data = fam_fc_data.merge(comp_metadata, how='left')
    # Convert log2 counts back to the linear scale, rounded to whole counts
    fam_count_data = fam_count_data.merge(sample_metadata, how='left').assign(counts=lambda x: np.round(2**x.log2_counts, 0))

    if not height:
        height = min(fam_fc_data.shape[0] * 12, 2500)

    myplot = px.box if plot_type == 'boxplot' else px.strip
    figure1 = myplot(
        fam_fc_data,
        y='Label',
        x='log2FC',
        color='species',
        title=title,
        hover_data=['resistant', 'susceptible', 'species', 'country'],
        template='ggplot2',
    )

    figure2 = myplot(
        fam_count_data,
        x='counts',
        y='GeneID',
        color='species',
        orientation='h',
        hover_data=['sampleID', 'species', 'country'],
        template='ggplot2',
    )
    # Only the fold-change traces contribute legend entries
    figure2.update_traces(showlegend=False)

    # Create a 1x2 subplot and copy the plotly express traces into its panels
    final_figure = sp.make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
    for trace in figure1.data:
        final_figure.add_trace(trace, row=1, col=1)
    for trace in figure2.data:
        final_figure.add_trace(trace, row=1, col=2)

    # reset boxmode to group so species are separate, resize plots, set axes titles, labels and vlines
    final_figure.layout['boxmode'] = 'group'
    final_figure.layout['xaxis2']['domain'] = (0.65, 1.0)
    final_figure.update_layout(title_text=title, title_x=0.5, width=width, height=height)
    final_figure.update_yaxes(title_text="Gene", row=1, col=1, title_font={"size": 18}, tickfont={"size": 14})
    final_figure.update_yaxes(showticklabels=False, row=1, col=2)
    final_figure.update_xaxes(title_text="log2 fold change", row=1, col=1, title_font={"size": 18})
    final_figure.update_xaxes(title_text="counts", row=1, col=2, title_font={"size": 18})
    for col in (1, 2):
        final_figure.add_vline(0, line_width=1, line_dash="dash", line_color="grey", row=1, col=col)

    return final_figure

In [None]:
#@title **Ano-expressIR** { run: "auto" }   
#@markdown This notebook produces interactive strip and boxplots with plotly, to summarise gene expression across 35 *Anopheles* insecticide-resistant vs. susceptible RNA-Sequencing experiments. There are four different analyses to select from:
#@markdown 'gamb_colu', 'gamb_colu_arab', 'gamb_colu_arab_fun' and 'fun',  
#@markdown each of which integrates a different combination of *Anopheles* species – ‘gamb_colu’ (An. gambiae and An. coluzzii), ‘gamb_colu_arab’ (adding An. arabiensis), ‘gamb_colu_arab_fun’ (adding An. funestus), and finally ‘fun’ (An. funestus alone).  
#@markdown As you include more species, there are fewer genes overall in the dataset, as we can only include genes with one-to-many orthologs between species.
#@markdown   
#@markdown Requesting feedback and ideas for how to explore the data.

analysis = "gamb_colu_arab_fun" #@param ['gamb_colu', 'gamb_colu_arab', 'gamb_colu_arab_fun', 'fun']
GeneID = "AGAP006227" #@param {type:"string"} 
plot_type = "strip" #@param ['strip', 'boxplot']

# Plot the gene chosen in the form above; the title is left empty.
plot_gene_expression(GeneID, "", analysis=analysis, plot_type=plot_type, height=300)

We can provide a list of genes, and sort by either AGAP identifier, or by the median fold-change. 

In [None]:
# Three genes in one strip plot, ordered by AGAP identifier.
plot_gene_expression(
    ["AGAP006222", "AGAP006227", "AGAP006228"],
    "Coeae1f",
    analysis="gamb_colu_arab_fun",
    plot_type='strip',
    sort_by='agap',
    height=300,
)

You can also produce a boxplot, although the hover text doesn't quite work as expected.

In [None]:
# The same three genes as box plots, ordered by median fold change.
plot_gene_expression(
    ["AGAP006222", "AGAP006227", "AGAP006228"],
    "Coeae1f",
    analysis='gamb_colu_arab_fun',
    plot_type='boxplot',
    sort_by='median',
    height=300,
)