In [1]:
import pysam 
import pandas as pd
import mappy as mp

# Set up reference genomes (slow)

In [4]:
%%time
# Build the mappy (minimap2) index for the CAST/EiJ reference FASTA.
# This is slow (about two minutes wall time in the recorded run).
cast_path = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/references/fasta/Mus_musculus_casteij.CAST_EiJ_v1.dna.toplevel.fa"
cast_ref = mp.Aligner(cast_path)

# mp.Aligner signals a failed load by being falsy instead of raising, so
# check explicitly before reporting success.
if not cast_ref:
    raise RuntimeError(f"failed to load/build mappy index from {cast_path}")
print('done!')

done!
CPU times: user 1min 39s, sys: 15.8 s, total: 1min 55s
Wall time: 1min 58s


In [5]:
%%time
# Build the mappy (minimap2) index for the 129S1/SvImJ reference FASTA.
# This is slow (about two minutes wall time in the recorded run).
s129_path = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/references/fasta/Mus_musculus_129s1svimj.129S1_SvImJ_v1.dna.toplevel.fa"
s129_ref = mp.Aligner(s129_path)

# mp.Aligner signals a failed load by being falsy instead of raising, so
# check explicitly before reporting success.
if not s129_ref:
    raise RuntimeError(f"failed to load/build mappy index from {s129_path}")
print('done!')

done!
CPU times: user 1min 41s, sys: 14.6 s, total: 1min 56s
Wall time: 1min 59s


# Monomer Alignment

In [25]:
def align_monomer(seq: str, ref, name = None) -> pd.DataFrame:
    """
    Align one query sequence against an index and tabulate the hits.

    Parameters
    ----------
    seq : str
        Query sequence. ``None`` or an empty string produces an empty
        result without calling the aligner.
    ref
        Any object exposing ``map(seq)`` that yields hit objects
        (in this notebook, an ``mp.Aligner``).
    name : optional
        Label copied verbatim into the ``name`` column of every row,
        used to tell apart hits from different references.

    Returns
    -------
    pd.DataFrame
        One row per hit. The column set and order are fixed (see
        ``columns`` below) even when there are no hits, so frames from
        different references can always be concatenated and indexed by
        column name.
    """
    columns = [
        "ctg", "r_st", "r_en", "q_st", "q_en", "strand", "mapq",
        "score", "mlen", "blen", "NM", "is_primary", "cigar", "name",
    ]

    # Nothing to align (e.g. a BAM record stored without its sequence).
    if not seq:
        return pd.DataFrame([], columns=columns)

    rows = []
    for hit in ref.map(seq):  # iterate alignments
        rows.append({
            "ctg": hit.ctg,                 # reference contig
            "r_st": hit.r_st,               # ref start (0-based)
            "r_en": hit.r_en,               # ref end (exclusive)
            "q_st": getattr(hit, "q_st", None),
            "q_en": getattr(hit, "q_en", None),
            # negative strand value -> "-", anything else -> "+"
            "strand": "-" if getattr(hit, "strand", 1) < 0 else "+",
            "mapq": getattr(hit, "mapq", None),
            # Hits may not carry a `score` attribute (the column is None in
            # this notebook's output); kept so the schema stays stable.
            "score": getattr(hit, "score", None),
            "mlen": getattr(hit, "mlen", None),   # matched length
            "blen": getattr(hit, "blen", None),   # alignment block length
            "NM": getattr(hit, "NM", None),       # edits if available
            "is_primary": bool(getattr(hit, "is_primary", True)),
            "cigar": getattr(hit, "cigar_str", None),
            "name" : name,
        })

    # Passing `columns` keeps the schema identical for zero and many hits.
    return pd.DataFrame(rows, columns=columns)

In [26]:
%%time 

fpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/pipeline_outputs/population/expanded/annotate/batch02.GRCm39.pe.bam"

stop = 1  # number of BAM records to process before stopping

# Open the BAM in a context manager so the file handle is released even if
# alignment raises part-way through the loop.
with pysam.AlignmentFile(fpath) as bam:

    # loop through each monomer
    for count, read in enumerate(bam):
        if count >= stop:
            break

        # # print the read name
        # print(f"\n{read.qname=}")

        # `query_sequence` is the non-deprecated spelling of `read.seq`.
        seq = read.query_sequence
        if not seq:
            # record stored without a sequence: nothing to align
            continue

        # alignment of the monomer to both genomes
        h1 = align_monomer(seq, s129_ref, name='S129')
        h2 = align_monomer(seq, cast_ref, name='CAST')

        # ignore_index gives every hit a unique row label; without it both
        # frames contribute a row labelled 0.
        align = pd.concat([h1, h2], ignore_index=True)
        print(align)

        # compare hits (h1 vs. h2)
        # examples:
            # max alignment_score(H1, H2)
            # max alignment_length(H1, H2)
            # min alignment_mismatch(H1, H2)
            # max alignment_quality(H1, H2)

        # assign --> subread to: h1, h2, or unknown

        # store assignments

  ctg       r_st       r_en  q_st  q_en strand  mapq score  mlen  blen  NM  \
0  10  110227525  110227699     0   175      +    60  None   166   176  10   
0  10  109375675  109375849     0   175      +    60  None   169   176   7   

   is_primary               cigar  name  
0        True  21M1I109M1I4M1D39M  S129  
0        True  21M1I109M1I4M1D39M  CAST  
CPU times: user 3.49 ms, sys: 1.9 ms, total: 5.38 ms
Wall time: 7.56 ms
