## Tutorial: Read mismatch analysis
In this tutorial, we showcase the `report_mismatches` feature of rnalib's ReadIterator, which reports mismatches between reads and the reference (optionally filtered by minimum base-calling quality). Briefly, we do the following:

* iterate over the alignments and report mismatches; the data of each returned item is a tuple (read, mismatches)
* ref/alt alleles are revcomped if read is reversed (strand specific)
* count and plot results

In [None]:
# Set the working directory to the rnalib source tree and import everything the tutorial needs.
# NOTE: the path below is machine-specific; adapt it to the location of your local rnalib checkout.
import os, pathlib, platform
rnalib_SRC=pathlib.Path('/Users/niko/projects/rnalib/') 
# all subsequent relative paths are resolved against the rnalib source directory
os.chdir(rnalib_SRC)
# Optionally install the required libraries (recommended to run in a venv).
# NOTE: the command below references `sys`, which is not imported in this cell; add `import sys` before enabling it.
#!{sys.executable} -m pip install -r requirements.txt 
display(f"Running rnalib on python {platform.python_version()}. Using rnalib code from {rnalib_SRC}")
# load rnalib (aliased as pg) together with the helpers used throughout this tutorial
import rnalib as pg
from rnalib import gi, SEP
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np

In [None]:
# Build a strand-corrected mismatch profile ("ref/alt" -> count) and draw it as a bar chart.
profile = Counter()
for loc, (read, mismatches) in pg.ReadIterator(pg.get_resource('small_example_bam'),
                                               report_mismatches=True,
                                               min_base_quality=10):
    # Paired-end handling: for read2 the strand flag is flipped,
    # i.e. the effective strand is (is_reverse XOR is_read2).
    on_reverse = read.is_reverse != read.is_read2
    # each reported mismatch is (position in read, genomic position, reference allele, alternate allele)
    for _readpos, _genpos, ref, alt in mismatches:
        if on_reverse:
            # report alleles relative to the read strand
            ref = pg.reverse_complement(ref)
            alt = pg.reverse_complement(alt)
        profile[f"{ref}/{alt}"] += 1

# plot the counts, one bar per mismatch type, ordered alphabetically by label
labels, values = zip(*sorted(profile.items()))
bar_positions = range(len(labels))
plt.bar(bar_positions, values, 0.8)
plt.xticks(bar_positions, labels)
plt.suptitle("Mismatch distribution")
plt.show()

Using this feature, we can also easily count mismatches of a certain type (e.g., T/C mismatches as found in a SLAMseq dataset).

In [None]:
# Collect, per read, the T/C mismatches (A/G for reverse-strand reads) that pass the base quality filter.
# tc_conv maps (read name, is_not_read2) -> list of (read offset, genomic position, ref, alt) tuples.
tc_conv = {}
for loc, (read, mismatches) in pg.ReadIterator(pg.get_resource('small_example_bam'),
                                               report_mismatches=True,
                                               min_base_quality=10):
    if len(mismatches) == 0:
        continue  # nothing to inspect for this read; skipping early speeds things up
    # effective strand for paired-end data: flip the flag for read2
    on_reverse = read.is_reverse != read.is_read2
    # a T/C conversion appears as A/G when the read is on the reverse strand
    refc, altc = ("A", "G") if on_reverse else ("T", "C")
    hits = [(offset, position, ref, alt)
            for offset, position, ref, alt in mismatches
            if ref == refc and alt == altc]
    if len(hits) > 0:  # keep only reads with at least one T/C (A/G) mismatch
        tc_conv[read.query_name, not read.is_read2] = hits

# show the first 10 reads
display('10 reads and their mismatches', dict(list(tc_conv.items())[:10]), SEP)

# count reads with more than one T/C conversion
display('Number of reads with more than one T/C conversion',
        sum(1 for hits in tc_conv.values() if len(hits) > 1), SEP)

# show MM of one such read
display('Example read with 2 mismatches:', tc_conv['HWI-ST466_135068617:8:2316:4251:54002', False],  SEP)

Now, we slightly extend this example by

* counting only T/C (A/G on reverse strand) mismatches per chromosome
* also counting the number of unconverted (i.e., ref==T) positions
* creating a pandas dataframe with these counts and filtering rows for minimum counts
* showing a filtered dataset of entries with 20-22 convertible positions, the number of converted positions and the number of such reads found

In [None]:
# Count T/C (A/G) mismatches per chromosome.
bam_file=pg.get_resource('small_example_bam')

# Count the mismatches.
# profile maps (chromosome, ref base, alt base, n_ref, n_converted) -> number of reads
profile=Counter()
with pg.ReadIterator(bam_file,report_mismatches=True, min_base_quality=10) as it:
    # calculate the number of reads from the BAM index (used only as the progress bar total)
    n_reads=sum(x.total for x in it.file.get_index_statistics())
    for loc,(r,mm) in tqdm(it, total=n_reads):
        # effective strand for paired-end data: flip the flag for read2
        is_rev = not r.is_reverse if r.is_read2 else r.is_reverse
        # a T/C conversion appears as A/G when the read is on the reverse strand
        refc = "A" if is_rev else "T"
        altc = "G" if is_rev else "C"
        # number of refc bases in the read sequence
        n_ref = r.query_sequence.count(refc)
        # number of refc->altc mismatches reported for this read
        n_conv = sum(1 for _off, _pos1, ref, alt in mm if ref==refc and alt==altc)
        # Count every read exactly once. Previously the increment sat inside a loop over all
        # mismatches of the read, which multiplied each read's contribution by its number of
        # mismatches (of any type) and silently dropped reads without any mismatch.
        profile[r.reference_name,refc,altc,n_ref,n_conv]+=1
    # show stats
    display(it.stats)

# Convert the counter into a table: one row per (chromosome, ref, alt, convertible, converted) key
# with its count, then keep only rows whose count exceeds 10.
column_names = ['chromosome','ref','alt', 'convertible','converted', 'count']
rows = [[*key, n] for key, n in profile.items()]
profile = pd.DataFrame(rows, columns=column_names)
profile = profile[profile['count'] > 10]

# Restrict to entries with 20-22 convertible positions and show one table per
# (conversion type, convertible positions) group.
fil = profile.query('20 <= convertible <= 22').sort_values(['convertible', 'converted'])
fil['conv'] = fil['ref'] + "/" + fil['alt']
grouped = fil.groupby(['conv', 'convertible'])
for key in grouped.groups:
    dat = grouped.get_group(key).set_index('conv')
    display(dat)

TODO: count converted reads per transcript/exon.