In [None]:
%pip install anoexpress malariagen_data kaleido -U -qq

In [None]:
from pathlib import Path

import anoexpress as xpress
import kaleido
import malariagen_data
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Handle to the malariagen_data Ag3 resource, constructed with default arguments.
ag3 = malariagen_data.Ag3()

## Gene expression x genetic diversity

In [None]:
# Load the tab-separated per-gene diversity table. Rows whose gene_id field is
# the literal text 'gene_id' are discarded -- presumably header lines repeated
# inside the file (TODO confirm against how the TSV was produced).
gd_df = (
    pd.read_csv("pi_pn_ps_new.tsv", sep="\t")
    .loc[lambda frame: frame["gene_id"] != "gene_id"]
)

In [None]:
# Cast the columns at positions 1-4 to float, then derive the pN/pS ratio.
# A ps of 0 yields inf/NaN here (plain float division, no special handling).
gd_df = (
    gd_df
    .astype({name: float for name in gd_df.columns[1:5]})
    .assign(pn_ps_ratio=lambda frame: frame["pn"] / frame["ps"])
)

In [None]:
# Fetch the log2 count matrix and the matching sample metadata for one
# anoexpress analysis; both calls must use the same analysis name.
analysis_name = 'gamb_colu_arab_fun'
counts_df = xpress.data(data_type="log2counts", analysis=analysis_name)
metadata = xpress.sample_metadata(analysis=analysis_name)

Let's first look at pairwise gene expression correlations between the species in this analysis (gambiae, coluzzii, arabiensis and funestus).

In [None]:
species = metadata.species.unique()

# One single-column frame per species: the row-wise median of counts_df taken
# over the sample columns that the metadata assigns to that species.
sp_counts = [
    counts_df.loc[:, metadata.loc[metadata.species == sp, "sampleID"]]
    .apply(np.median, axis=1)
    .to_frame(name=sp)
    for sp in species
]

# Side-by-side table: one row per counts_df row, one column per species.
df = pd.concat(sp_counts, axis=1)

In [None]:
import seaborn as sns

In [None]:
import itertools
# Draw, save and show one scatter plot of median log2 counts for every
# unordered pair of species.
for x, y in itertools.combinations(species, 2):
    axis_titles = {x: f"<i>An. {x}</i>", y: f"<i>An. {y}</i>"}
    marker_style = dict(size=6, line=dict(width=1, color='DarkSlateGrey'))

    fig = (
        px.scatter(
            df,
            x=x,
            y=y,
            labels=axis_titles,
            opacity=0.3,
            template='simple_white',
            width=425,
            height=400,
        )
        .update_layout(font=dict(size=16), xaxis=dict(mirror=True), yaxis=dict(mirror=True))
        .update_traces(marker=marker_style, selector=dict(mode='markers'))
    )

    # Static export goes through kaleido; the file name encodes the species pair.
    fig.write_image(f"corr_{x}_{y}.png")
    fig.show()

In [None]:
# Load the log2 counts for the 'gamb_colu_arab' analysis. This rebinds
# counts_df, replacing the 'gamb_colu_arab_fun' matrix loaded in an earlier cell.
counts_df = xpress.data(data_type="log2counts", analysis='gamb_colu_arab')

In [None]:
# Row-wise median of the log2 counts, taken over every column of counts_df.
median_counts = counts_df.apply(np.median, axis=1)
# Bare expression: display the resulting Series as the cell output.
median_counts

In [None]:
# Turn the Series of medians into a one-column frame with a descriptive name.
median_counts = median_counts.to_frame(name='median_log2counts')

In [None]:
# Expose the index of median_counts as an ordinary column so it can act as a
# join key, rename the diversity table's key column, then merge. With no
# `on=` argument pandas performs an inner join on the columns both frames share.
median_by_gene = median_counts.reset_index()
gd_df = gd_df.rename(columns={'gene_id': 'GeneID'}).merge(median_by_gene)

In [None]:
# Percentile boundaries (in %) delimiting twenty 5%-wide expression bins.
bin_edges = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
# Human-readable labels of the form "0-5%", "5-10%", ...
labels = [f"{bin_edges[i]}-{bin_edges[i+1]}%" for i in range(len(bin_edges) - 1)]

# Convert each row's expression value to its percentile rank in (0, 100].
# Binning the raw values with pd.cut(..., bins=20) would give 20 equal-WIDTH
# value intervals, which are not percentiles and would not match the labels.
# Ties share an (average) rank, so equal expression values always land in the
# same bin and heavily tied values cannot produce duplicate bin edges.
pct_rank = gd_df.median_log2counts.rank(pct=True) * 100

# Integer bin index (0-19) per row; intervals are right-closed, e.g. (0, 5].
bins = pd.cut(pct_rank, bins=bin_edges, labels=False, include_lowest=True)

# Map each bin index to its label. int() tolerates a float-typed index column;
# a missing expression value (NaN) is not expected here and will raise.
bins_with_labels = [labels[int(i)] for i in bins]

In [None]:
# Attach the expression-bin label of each row as a new column. bins_with_labels
# was built from gd_df.median_log2counts, so it is positionally aligned with gd_df.
gd_df = gd_df.assign(expression_percentile=bins_with_labels)

In [None]:
import re

def natural_sort(l):
    """Return the strings in ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and everything else is
    compared case-insensitively, so ``"5-10%"`` sorts before ``"10-15%"``.

    Args:
        l: Iterable of strings.

    Returns:
        A new list containing the items of ``l`` in natural order.
    """
    def alphanum_key(key):
        # re.split with a capturing group always alternates
        # text, digits, text, digits, ... and starts with a (possibly empty)
        # text chunk, so odd positions are exactly the ASCII digit runs.
        # Deciding by position rather than str.isdigit() avoids int() failing
        # on characters such as "²", and keeps int/str types aligned between
        # keys so they can always be compared.
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if position % 2 else chunk.lower()
                for position, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)

# Distinct bin labels that actually occur, ordered numerically rather than
# lexically (so "5-10%" precedes "10-15%"); used to order the plot x-axes.
labels_order = natural_sort(np.unique(bins_with_labels))

In [None]:
# Display order for the species column 'sp'.
custom_order = ['coluzzii', 'gambiae', 'arabiensis']

# Recast 'sp' as an ordered categorical with that order (values not listed in
# custom_order become NaN), then sort the rows by it so traces follow the order.
species_dtype = pd.CategoricalDtype(categories=custom_order, ordered=True)
gd_df = gd_df.assign(sp=gd_df['sp'].astype(species_dtype)).sort_values('sp')

In [None]:
# pN/pS by expression bin, one box per species within each bin.
fig = (
    px.box(
        gd_df,
        x='expression_percentile',
        y='pn_ps_ratio',
        color='sp',
        title='Purifying Selection',
        labels={'pn_ps_ratio': 'pN/pS','expression_percentile':'Expression rate (percentile)'},
        template='simple_white',
        width=700,
        height=400,
    )
    # Bins are string labels, so force their numeric order on the x-axis.
    .update_xaxes(categoryorder='array', categoryarray=labels_order)
    # Clip the y-axis view; ratios above 10 fall outside the visible range.
    .update_yaxes(range=[-0.1, 10])
    .update_layout(showlegend=False)
)
fig.write_image("pn_ps_expression.png", scale=2)
fig

In [None]:
# Watterson's theta by expression bin; rows with any missing value are
# excluded for this plot only (gd_df itself is left untouched). The legend is
# kept visible on this figure.
fig = (
    px.box(
        gd_df.dropna(),
        x='expression_percentile',
        y='theta',
        color='sp',
        title='Diversity x Expression',
        labels={'theta': 'Wattersons Theta','expression_percentile':'Expression rate (percentile)'},
        template='simple_white',
        width=700,
        height=400,
    )
    # Bins are string labels, so force their numeric order on the x-axis.
    .update_xaxes(categoryorder='array', categoryarray=labels_order)
)
fig.write_image("theta_expression.png", scale=2)
fig

In [None]:
# Nucleotide diversity (pi) by expression bin; rows with any missing value are
# excluded for this plot only (gd_df itself is left untouched).
fig = (
    px.box(
        gd_df.dropna(),
        x='expression_percentile',
        y='pi',
        color='sp',
        title='Diversity x Expression',
        labels={'pi': 'Nucleotide diversity','expression_percentile':'Expression rate (percentile)'},
        template='simple_white',
        width=700,
        height=400,
    )
    .update_layout(showlegend=False)
    # Bins are string labels, so force their numeric order on the x-axis.
    .update_xaxes(categoryorder='array', categoryarray=labels_order)
)
fig.write_image("pi_expression.png", scale=2)
fig

Let's calculate the total CDS length for each gene.

In [None]:
# Instantiate the Ag3 resource (this rebinds the `ag3` name already assigned in
# an earlier cell) and fetch its genome feature table.
ag3 = malariagen_data.Ag3()
gff = ag3.genome_features()

In [None]:
def cds_length_by_gene(gff, gene_ids):
    """Sum the CDS segment sizes of one transcript per gene.

    For every gene the ``<gene_id>-RA`` transcript is used; when it has no CDS
    rows the ``<gene_id>-RB`` transcript is used instead. A gene with neither
    gets a length of 0.

    Args:
        gff: Genome feature table with ``type``, ``Parent``, ``start`` and
            ``end`` columns.
        gene_ids: Iterable of gene identifiers.

    Returns:
        List of total CDS lengths, in the same order as ``gene_ids``.
    """
    cds_rows = gff[gff["type"] == "CDS"]
    # NOTE(review): GFF coordinates are usually 1-based inclusive, in which
    # case end - start undercounts each segment by 1. The formula is kept as is
    # so results match previously cached arrays -- confirm before changing.
    segment_sizes = (cds_rows["end"] - cds_rows["start"]).abs()
    # One pass over the table instead of one query per gene.
    length_by_parent = segment_sizes.groupby(cds_rows["Parent"], observed=True).sum()

    lengths = []
    for gene_id in gene_ids:
        primary = f"{gene_id}-RA"
        if primary in length_by_parent.index:
            lengths.append(length_by_parent[primary])
        else:
            lengths.append(length_by_parent.get(f"{gene_id}-RB", 0))
    return lengths


# The next cell loads these two arrays from disk. Compute them only when they
# are missing, so a fresh checkout still runs top to bottom while an existing
# cache is reused untouched.
CDS_LENGTHS_FILE = Path("cds_lengths.npy")
GENE_IDS_FILE = Path("gene_ids.npy")

if not (CDS_LENGTHS_FILE.exists() and GENE_IDS_FILE.exists()):
    # A plain list of str is saved as a unicode array, which np.load can read
    # back without allow_pickle (an object array could not be).
    gene_ids = list(gd_df.dropna().GeneID.unique())
    cds_lengths = cds_length_by_gene(gff, gene_ids)
    np.save(CDS_LENGTHS_FILE, cds_lengths)
    np.save(GENE_IDS_FILE, gene_ids)

In [None]:
# Rebuild the gene -> CDS length table from the two cached arrays; they are
# expected to be the same length and in matching order.
cached_gene_ids = np.load("gene_ids.npy")
cached_cds_lengths = np.load("cds_lengths.npy")
cds_df = pd.DataFrame({'GeneID': cached_gene_ids, 'cds_length': cached_cds_lengths})

In [None]:
# Drop incomplete rows, attach CDS lengths (default inner merge on the columns
# the frames share), then scale pn and ps to a per-kilobase-of-CDS basis.
# Within a single assign() call later entries may use columns created by
# earlier ones, so cds_ratio is available to the two count columns.
gd_df = (
    gd_df.dropna()
    .merge(cds_df)
    .assign(
        cds_ratio=lambda frame: 1000 / frame.cds_length,
        non_synon_count_cds_kb=lambda frame: frame.pn * frame.cds_ratio,
        synon_count_cds_kb=lambda frame: frame.ps * frame.cds_ratio,
    )
)

In [None]:
# Nonsynonymous count per kb of CDS by expression bin, one box per species.
fig = (
    px.box(
        gd_df,
        x='expression_percentile',
        y='non_synon_count_cds_kb',
        color='sp',
        title='Nonsynonymous',
        labels={'non_synon_count_cds_kb': 'count per CDS kb',
                'expression_percentile':'Expression rate (percentile)'},
        template='simple_white',
        height=400,
        width=700,
    )
    .update_layout(showlegend=False)
    # Bins are string labels, so force their numeric order on the x-axis.
    .update_xaxes(categoryorder='array', categoryarray=labels_order)
)
fig.write_image("non_synon_expression.png", scale=2)
fig

In [None]:
# Synonymous count per kb of CDS by expression bin, one box per species.
fig = (
    px.box(
        gd_df,
        x='expression_percentile',
        y='synon_count_cds_kb',
        color='sp',
        title='Synonymous',
        labels={'synon_count_cds_kb': 'count per CDS kb',
                'expression_percentile':'Expression rate (percentile)'},
        template='simple_white',
        height=400,
        width=700,
    )
    .update_layout(showlegend=False)
    # Bins are string labels, so force their numeric order on the x-axis.
    .update_xaxes(categoryorder='array', categoryarray=labels_order)
)
fig.write_image("synon_expression.png", scale=2)
fig