# Tutorial: Transcriptome annotation

In this tutorial we will filter and visualize genomic features by read mappability 

Briefly, we will do the following:
- build a transcriptome of canonical protein coding genes (chr20) from gencode annotations
- annotate all genes with mappability values read from a bedgraph file containing umap mappability scores.
- Query some of the data and demonstrate the slice_from_parent() functionality
- Visualize some of the data with matplotlib

*Required resources:*
- Human genome FASTA (GRCh38), accessible at https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.26/
- Full gencode annotation gff3 file (sorted), available at https://www.gencodegenes.org/human/
- Full GRCh38.k24.umap mappability bedgraph file, available at https://bismap.hoffmanlab.org/

In [None]:
# Point the notebook at the rnalib source tree and make it the working directory.
import os
import pathlib
import platform

# NOTE: machine-specific location; change this to your local rnalib checkout.
rnalib_SRC = pathlib.Path('/Users/niko/projects/rnalib/')
os.chdir(rnalib_SRC)

# Install libraries. Recommended to run in a venv here!
# (uncommenting the next line additionally requires `import sys`)
#!{sys.executable} -m pip install -r requirements.txt
display(
    f"Running rnalib on python {platform.python_version()}. Using rnalib code from {rnalib_SRC}"
)
# load rnalib
import rnalib as pg
from rnalib import SEP, display_textarea
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import traceback
import math
import random

First, we download the required resources.
NOTE that this needs bedtools, samtools and htslib (bgzip, tabix) installed.
Total size of the downloaded data (for all tutorials) is ~150M. Files are only downloaded if not existing already in the `notebooks/large_test_resources/` directory.

In [None]:
import traceback
from rnalib.testdata import download_bgzip_slice

# Directory that receives all downloaded files; update to your preferred location.
outdir = rnalib_SRC / 'notebooks/large_test_resources'

# Resource table: one entry per file, keyed by the name later passed to pg.get_resource().
_resources = {
    # -------------- Full gencode39 annotation -------------------------------
    "full_gencode_gff": {
        "uri": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/gencode.v39.annotation.gff3.gz",
        "filename": "gencode_39.gff3.gz",
        "recreate": False,
    },
    # -------------- GRCh38 chr20 -------------------------------
    "grch38_chr20": {
        "uri": "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/chromosomes/chr20.fa.gz",
        "filename": "grch38_chr20.fa.gz",
        "recreate": False,
    },
    # -------------- GRCh38 mappability scores -------------------------------
    "grch38_umap": {
        "uri": "https://bismap.hoffmanlab.org/raw/hg38/k24.umap.bed.gz",
        "filename": "GRCh38.k24.umap.bedgraph.gz",
        "tabix_options": "-p bed -S 1",
        "recreate": False,
    },
}
large_test_resources = {
    "outdir": f"{outdir}",  # update to your preferred location
    "resources": _resources,
}

display(f'Downloading test data files to {outdir}')
for resname in large_test_resources['resources'].keys():
    try:
        download_bgzip_slice(large_test_resources, resname, view_tempdir=False)
    except Exception:
        # Best effort: report the failure and continue with the remaining resources.
        display(traceback.format_exc())
        display(f"Error creating resource {resname}. Some tests may not work...")
display("All done.")

In [None]:
# Build a subset of the human transcriptome:
# all protein coding genes on chr20, Ensembl canonical transcripts only.
canonical_pc_chr20 = (
    pg.TranscriptFilter()
    .include_chromosomes({'chr20'})
    .include_gene_types({'protein_coding'})
    .include_tags({'Ensembl_canonical', None})
)
t = pg.Transcriptome(
    genome_fa=pg.get_resource("grch38_chr20", conf=large_test_resources),
    annotation_gff=pg.get_resource("full_gencode_gff", conf=large_test_resources),
    annotation_flavour='gencode',
    copied_fields=['gene_type', 'tag'],
    load_sequence_data=False,
    feature_filter=canonical_pc_chr20,
)
display(t)

# Sanity checks: prove that the filtering worked.
chromosomes_seen = {g.chromosome for g in t.genes}
gene_types_seen = {g.gene_type for g in t.genes}
assert chromosomes_seen == {'chr20'}
assert gene_types_seen == {'protein_coding'}
assert all(tx.tag is None or 'Ensembl_canonical' in tx.tag for tx in t.transcripts)

Now, we annotate all genes with a numpy array containing the respective mappability values. 
This enables efficient slicing of child-intervals (e.g., transcripts, exons).


In [None]:
#%%timeit -n 1 -r 1
import numpy as np
# Example code that calculates a numpy array containing mappability values for each genomic position of the passed genomic feature (anno).
# The respective values are then saved (pickled) to the given output file
# if the output file already exists, values will be loaded from there instead of recalculating them.

def anno_scores(item, label='mappability'):
    """
    Annotation callback that stores a per-position score array for one feature.

    The array has ``len(loc)`` entries, is initialised with zeros and is then
    filled with the score of every overlapping interval, clipped to ``loc``.
    Later intervals overwrite earlier ones where they overlap each other.

    Args:
        item: a nested tuple ``(loc, (anno, mapints))`` where
            - ``loc`` is the genomic interval of the annotated feature
              (needs ``start``, ``end`` and ``len()``; ``end`` is treated as inclusive),
            - ``anno`` is the annotation dict of this feature (so any other,
              already existing annotations of the feature are accessible too),
            - ``mapints`` is an iterable of ``(sloc, mapint)`` pairs: the location
              of an overlapping interval and an object exposing its ``score``.
        label: key under which the array is stored in ``anno``.

    Returns:
        None; the result is written to ``anno[label]``.
    """
    loc, (anno, mapints) = item
    scores = np.zeros(len(loc))  # positions without any overlapping interval stay 0
    for sloc, mapint in mapints:
        # translate the overlap of sloc and loc into offsets relative to loc.start
        first = max(loc.start, sloc.start) - loc.start
        last = min(loc.end, sloc.end) + 1 - loc.start  # exclusive upper bound
        # guard: a negative `last` would otherwise be read as "count from the end" by the slice
        if first < last:
            scores[first:last] = mapint.score
    anno[label] = scores

# To speed things up, we could cache the transcriptome annotations in a pickle file and
# reload from there if this file exists:
# pkfile=pathlib.Path(large_test_resources['outdir']) / 'gencode.v39.k24.umap.genes.chr20.pk'
# if os.path.isfile(pkfile):
#    t.load_annotations(pkfile) # load from pickle file if existing; use update=True to not lose any previously annotated values
# ...
# t.save_annotations(pkfile, keys={'mappability'}) # save only the mappability values to disk.

# Region of interest passed to annotate(). `roi` was used below without ever being defined
# (NameError); the transcriptome only contains chr20, so we restrict the annotation to it.
# NOTE(review): assumes pg.gi() accepts a bare chromosome name - confirm against the rnalib API.
roi = pg.gi('chr20')

# annotate genes with mappability scores
t.annotate(iterators=pg.BedIterator(pg.get_resource("grch38_umap", conf=large_test_resources)),
           region=roi,
           fun_anno=anno_scores,
           feature_types=['gene'])

# Show an estimate of the memory consumption (for the whole genome this is ~10GB so it might make sense
# to calculate mean/median mappability per annotation instead of keeping all values).
mappability_bytes = sum(a['mappability'].nbytes for a in t.anno.values() if 'mappability' in a)
print(f"Memory size of the create numpy arrays: {mappability_bytes / (1000*1000*1000) } GB")

Now, let's query some data. First, we create a list of the 10 genes with the lowest mean mappability:

In [None]:
# mean mappability per gene name, ranked ascending; keep the 10 lowest
mean_map_by_gene = {g.gene_name: np.mean(g.mappability) for g in t.genes}
genes_by_mean_map = sorted(mean_map_by_gene.items(), key=lambda name_and_mean: name_and_mean[1])
low_map_genes = genes_by_mean_map[:10]
display(low_map_genes)

Now, let's plot the values for one of those genes:
* we select the 1st transcript of the 1st gene in that list. 
* we slice the respective mappability values from the gene object via 'slice_from_parent'
* we summarize the data by calculating 100 tiles that are assigned the mean mappability (yellow line)
* we also plot a rolling average of the values (blue line). note that there are edge effects.
* we additionally highlight the exons of this gene (red) and plot the rank.

In [None]:
tx = t.gene['GGTLC1'].transcript[0]
# Mappability values of the transcript, sliced from its parent.
# (named tx_map rather than `map` so that the builtin map() is not shadowed)
tx_map = tx.get('mappability', slice_from_parent=True)

# 100 tiles, each filled with the mean mappability of its positions (yellow line)
tiled_mean_map = np.concatenate([np.full(len(tile), np.mean(tile))
                                 for tile in np.array_split(tx_map, 100)])

# Rolling average (blue line). np.convolve(mode='same') returns max(len(a), len(v)) values,
# so the window is capped at the transcript length to keep one value per position.
window = min(1000, len(tx_map))
roll_mean_map = np.convolve(tx_map, np.ones(window), mode='same') / window
assert len(tiled_mean_map)==len(tx) and len(roll_mean_map)==len(tx)

plt.rcParams["figure.figsize"] = (20,3)
plt.plot(tiled_mean_map, 'y')
plt.plot(roll_mean_map, 'b-')
plt.title(f"{tx.feature_id} ({tx.strand})")
plt.ylim(0,1)
for ex in sorted(tx.exon): # sort by coordinates
    # highlight the exon (x axis is relative to the transcript start) and label it with its rank
    plt.axvspan(ex.start-tx.start, ex.end-tx.start, color='red', alpha=0.1)
    plt.text(ex.start+(ex.end-ex.start)/2-tx.start, 0.2, ex.rnk)
    display(f"exon {ex.rnk}, len {len(ex)}: mean mappability: {np.mean(ex.get('mappability', slice_from_parent=True))}")

So, exon 5 has the lowest mean mappability of this gene. 
Let's compare exon and intron mappability for all genes.

In [None]:
# NOTE that the bedgraph file does not contain data for chrM and chrY (see tabix -l <file>),
# so for these we use the default value [] (relevant only for whole-genome analyses)
def _pooled_mappability(feature_type):
    """Concatenate the mappability values of all features of the given type into one array."""
    per_feature = [
        feat.get('mappability', slice_from_parent=True, default_value=[])
        for feat, _ in t.iterator(feature_types=feature_type)
    ]
    return np.concatenate(per_feature)


exon_map = _pooled_mappability('exon')
intron_map = _pooled_mappability('intron')
print(f"exon mean map: {np.mean(exon_map)}, intron mean map: {np.mean(intron_map)}")

And now, let's plot the mean mappability per feature for all exons and introns.
Introns have lower mappability.

In [None]:
def _mean_mappability_per_feature(feature_type):
    """Map every feature of the given type to the mean of its mappability values."""
    return {
        feat: np.mean(feat.get('mappability', default_value=[], slice_from_parent=True))
        for feat, _ in t.iterator(feature_types=feature_type)
    }


exon_mean_map = _mean_mappability_per_feature('exon')
intron_mean_map = _mean_mappability_per_feature('intron')

# one box per feature type
boxplot_data = {'exons': exon_mean_map.values(), 'introns': intron_mean_map.values()}
_ = sns.boxplot(data=boxplot_data)

How many exons have zero mappability? What are the respective genes?

In [None]:
# all exons whose mean mappability is exactly 0
zero_map_ex = [exon for exon, exon_mean in exon_mean_map.items() if exon_mean == 0]
display(f"There are {len(zero_map_ex)}/{len(exon_mean_map)} exons with 0 mappability.")

# names of all genes that contain at least 1 exon with zero mappability
# (gene reached via exon.parent.parent - presumably exon -> transcript -> gene)
zero_map_gene_names = list({exon.parent.parent.gene_name for exon in zero_map_ex})
display(f"Here are the respective gene names: {zero_map_gene_names}")

Let's plot all transcripts of one of these genes, CSTL1 (four actually passed the 'Ensembl_canonical' filtering), and
check. Indeed, exon 3 of ENST00000472140.5 has zero mappability...

In [None]:
g = t.gene['CSTL1']
plt.rcParams["figure.figsize"] = (20,3)  # loop-invariant, so set once before any figure is created
for idx,tx in enumerate(g.transcript):
    # Mappability values of the transcript, sliced from its parent.
    # (named tx_map rather than `map` so that the builtin map() is not shadowed)
    tx_map = tx.get('mappability', slice_from_parent=True)

    # 100 tiles, each filled with the mean mappability of its positions (yellow line)
    tiled_mean_map = np.concatenate([np.full(len(tile), np.mean(tile))
                                     for tile in np.array_split(tx_map, 100)])

    # Rolling average (blue line). np.convolve(mode='same') returns max(len(a), len(v)) values,
    # so the window is capped at the transcript length to keep one value per position.
    window = min(1000, len(tx_map))
    roll_mean_map = np.convolve(tx_map, np.ones(window), mode='same') / window
    assert len(tiled_mean_map)==len(tx) and len(roll_mean_map)==len(tx)

    plt.figure(idx)  # one figure per transcript
    plt.plot(tiled_mean_map, 'y')
    plt.plot(roll_mean_map, 'b-')
    plt.title(f"{tx.feature_id} ({tx.strand})")
    plt.ylim(0,1)
    for ex in sorted(tx.exon): # sort by coordinates
        # highlight the exon (x axis is relative to the transcript start) and label it with its rank
        plt.axvspan(ex.start-tx.start, ex.end-tx.start, color='red', alpha=0.1)
        plt.text(ex.start+(ex.end-ex.start)/2-tx.start, 0.2, ex.rnk)
        display(f"{tx.feature_id} exon {ex.rnk}, len {len(ex)}: mean mappability: {np.mean(ex.get('mappability', slice_from_parent=True))}")


Finally, let's plot the distribution of mappability values per chromosome as boxplots.

In [None]:
# Mean mappability per gene, grouped by chromosome (one list per entry of t.merged_refdict;
# genes without a 'mappability' annotation fall back to [0]).
mean_map = []
for chrom in t.merged_refdict:
    gene_means = [np.mean(g.get('mappability', [0])) for g in t.genes if g.chromosome == chrom]
    mean_map.append(gene_means)

_ = plt.boxplot(mean_map)
# boxplot positions are 1-based; label each box with its chromosome name
tick_positions = list(range(1, len(t.merged_refdict) + 1))
tick_labels = list(t.merged_refdict.keys())
_ = plt.xticks(tick_positions, tick_labels, rotation=90)
_ = plt.suptitle("Gene region mappability per chromosome")