In [1]:
PYGENLIB_SRC='/Users/niko.popitsch/git/pygenlib/'
import os; os.chdir(PYGENLIB_SRC)
from pygenlib.utils import *
from pygenlib.iterators import *
from pygenlib.genemodel import *
!pwd



/Users/niko.popitsch/git/pygenlib


# Genemodel examples

In [2]:
# Build a transcriptome from a flybase annotation, restricted to dmel chromosome 2L transcripts.
transcript_filter = {'included_chrom': ['2L']}
config = {
    'genome_fa': 'testdata/dmel_r6.36.fa.gz',
    'annotation_gff': 'testdata/flybase.dmel-all-r6.51.sorted.gtf.gz',
    'annotation_flavour': 'flybase',
    'transcript_filter': transcript_filter,
    'load_sequences': True,
}
t = Transcriptome(config)

WARN: Input list have differing order of shared elements ['211000022278279', '211000022278436', '211000022278449', '211000022278760', '211000022279165', '211000022279188', '211000022279264', '211000022279392', '211000022279681', '211000022280328', '211000022280341', '211000022280347', '211000022280481', '211000022280494', '211000022280703', '2L', '2R', '3L', '3R', '4', 'Unmapped_Scaffold_8_D1580_D1567', 'X', 'Y', 'mitochondrion_genome', 'rDNA']


Building transcriptome (Filtered (0 tags, 0 tids, 0 genetypes, 1 chroms, 0 regions).): 100%|█████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.31s/it]
Load sequence: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3515/3515 [00:00<00:00, 17209.15it/s]
Build interval tree: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3515/3515 [00:00<00:00, 38929.26it/s]


In [3]:
# Show a summary of the transcriptome and a few lookups by gene name / transcript id.
display(f"Built {t}")

tx_types = Counter(tx.transcript_type for tx in t.transcripts)
display(f"Transcript types: {tx_types}")

cold = t.gene_name['cold']
display(f"Transcripts of gene 'cold': {[tx.transcript_id for tx in cold.transcript]}.")
display(f"So, the gene of transcript 'FBtr0310022' is {t.transcript['FBtr0310022'].parent.name}")

# names of the genes of all transcripts overlapping 'cold'; see 2L:574291-575734
overlapping_genes = {tx.parent.name for tx in t.query(cold, 'transcript')}
display(f"Genes of all transcripts that overlap with gene 'cold': {overlapping_genes}")


'Built Transcriptome with 3515 genes and 6779 tx'

"Transcript types: Counter({'mRNA': 5711, 'ncRNA': 748, 'miRNA': 103, 'pre_miRNA': 56, 'pseudogene': 54, 'snoRNA': 50, 'tRNA': 41, 'snRNA': 16})"

"Transcripts of gene 'cold': ['FBtr0310022', 'FBtr0078069']."

"So, the gene of transcript 'FBtr0310022' is cold"

"Genes of all transcripts that overlap with gene 'cold': {'Ptth', 'cold'}"

In [4]:
# Names of the first 10 genes in the transcriptome.
[gene.name for gene in t.genes[:10]]

['CR11023',
 'l(2)gl',
 'Ir21a',
 'asRNA:CR43609',
 'Cda5',
 'lncRNA:CR46254',
 'lncRNA:CR45339',
 'lncRNA:CR45340',
 'dbr',
 'asRNA:CR44987']

In [5]:
# Map each transcript id of gene 'cold' to its number of exons.
cold_transcripts = t.gene_name['cold'].transcript
dict((tx.transcript_id, len(tx.exon)) for tx in cold_transcripts)

{'FBtr0310022': 4, 'FBtr0078069': 3}

In [6]:
# Names of all genes that have at least one exon overlapping the given region.
region = gi.from_str("2L:20000-30000")
{exon.parent.parent.name for exon in t.query(region, 'exon')}

{'Cda5', 'Ir21a', 'asRNA:CR43609', 'l(2)gl'}

In [7]:
# Collect the names of all genes for which the kmer occurs in at least one (spliced) transcript sequence.
kmer = "ACTGAGCTA"
genes_with_kmer = set()
for tx in t.transcripts:
    if kmer in t.get_spliced_seq(tx):
        genes_with_kmer.add(tx.parent.name)
genes_with_kmer

{'CG31869',
 'CG4891',
 'CG5367',
 'CG7806',
 'Ca-beta',
 'LanB1',
 'Or23a',
 'Sema1a',
 'SoYb',
 'aph-1',
 'emb',
 'l(2)05287'}

In [8]:
# Collect the names of all genes for which the kmer occurs in an intron of one of its transcripts,
# considering only long introns (length > 10000).
kmer = "ACTGAGCTA"
genes_with_intronic_kmer = set()
for tx in t.transcripts:
    for intron in tx.intron:
        if len(intron) > 10000 and kmer in t.get_sequence(intron):
            genes_with_intronic_kmer.add(tx.parent.name)
genes_with_intronic_kmer

{'CG31869',
 'CG42238',
 'CG42784',
 'CG46308',
 'Dyrk2',
 'Ggamma30A',
 'MRP',
 'Nlg2',
 'RapGAP1',
 'Rtnl1',
 'Trim9',
 'aop',
 'jp',
 'rdo',
 'sick',
 'tkv',
 'wwk'}

In [9]:
# list all genes and their up/downstream genes within a given max distance (~63ms). Show only the first 10 entries
def get_name(x):
    """Return ``x.name``, or None when ``x`` itself is None."""
    if x is None:
        return None
    return x.name
[
    (get_name(prev_gene), get_name(gene), get_name(next_gene))
    for prev_gene, gene, next_gene in t.gene_triples(max_dist=10000)
][:10]

[(None, 'CR11023', 'l(2)gl'),
 ('CR11023', 'l(2)gl', 'Ir21a'),
 ('l(2)gl', 'Ir21a', 'asRNA:CR43609'),
 ('Ir21a', 'asRNA:CR43609', 'Cda5'),
 ('asRNA:CR43609', 'Cda5', 'lncRNA:CR46254'),
 ('Cda5', 'lncRNA:CR46254', None),
 (None, 'lncRNA:CR45339', 'lncRNA:CR45340'),
 ('lncRNA:CR45339', 'lncRNA:CR45340', 'dbr'),
 ('lncRNA:CR45340', 'dbr', 'asRNA:CR44987'),
 ('dbr', 'asRNA:CR44987', 'galectin')]

In [10]:
# Report a coordinate-sorted list of genes in a window extending 10kb to either side of 'cold'.
g = t.gene_name['cold']
flank = 10000
window = gi(g.chromosome, g.start - flank, g.end + flank)
list(t.query(window, 'gene'))

[gene@2L:564163-566472,
 gene@2L:566366-568121,
 gene@2L:568340-572907,
 gene@2L:573033-574264,
 gene@2L:574291-575734,
 gene@2L:575711-576896,
 gene@2L:577486-579549,
 gene@2L:583540-594685]

In [15]:
# For 5 genes, map gene name -> {transcript id -> 200bp 3'UTR intervals} (multiple intervals if spliced).
# Note that for CG11374/FBtr0306541 two intervals are reported, as the 200bp length requires splicing.
{
    gene.name: {tx.transcript_id: calc_3end(tx) for tx in gene.transcript}
    for gene in t.genes[10:15]
}

{'galectin': {'FBtr0306540': [2L:76012-76211 (+)],
  'FBtr0078101': [2L:76012-76211 (+)],
  'FBtr0302164': [2L:76012-76211 (+)],
  'FBtr0331680': [2L:76012-76211 (+)],
  'FBtr0301733': [2L:76012-76211 (+)]},
 'CG11374': {'FBtr0306541': [2L:77642-77783 (+), 2L:77526-77583 (+)]},
 'net': {'FBtr0330637': [2L:82984-83183 (-)],
  'FBtr0290323': [2L:82421-82620 (-)]},
 'Zir': {'FBtr0078103': [2L:101887-102086 (+)]},
 'Creld': {'FBtr0330636': [2L:103943-104142 (+)],
  'FBtr0078104': [2L:103943-104142 (+)]}}

# Genomic Iterator examples

In [16]:
# FastaIterator supports window/step size and padding.
fasta_file = 'testdata/ACTB+SOX2.fa.gz'

# Get 5-mer sliding windows (step size 2) with padding from a GRCh38 region around ACTB;
# show the first and last 5 results.
windows = FastaIterator(fasta_file, 'chr7', None, None, width=5, step=2, padding=True).take()
kmers = [seq for _, seq in windows]
display([*kmers[:5], "...", *kmers[-5:]])

['NNTTG',
 'TTGTG',
 'GTGCC',
 'GCCAT',
 'CATTA',
 '...',
 'TTGTA',
 'GTATT',
 'ATTTT',
 'TTTTN',
 'TTNNN']

In [17]:
# Iterate a GFF3 file and count how often each feature_type occurs.
gff3_records = GFF3Iterator('testdata/gencode.v39.ACTB+SOX2.gff3.gz')
Counter(info['feature_type'] for _, info in gff3_records)

Counter({'exon': 106,
         'CDS': 60,
         'five_prime_UTR': 33,
         'transcript': 24,
         'three_prime_UTR': 20,
         'start_codon': 17,
         'stop_codon': 13,
         'gene': 2})

In [18]:
# Count reads per chromosome in a BAM file under four different filter settings.
stats = {name: Counter() for name in ('all', 'def', 'mq20', 'tag')}
with open_file_obj('testdata/small_example.bam') as bam:
    for chrom in get_reference_dict(bam):
        # filter name -> extra ReadIterator arguments; rebuilt per chromosome so that
        # every iterator receives freshly constructed filter objects
        filter_args = {
            'all': dict(flag_filter=0),                           # all: no filtering
            'def': dict(),                                        # def: default flag filter (as in IGV)
            'mq20': dict(min_mapping_quality=20),                 # mq20: default flag filter, mapping quality >= 20
            'tag': dict(tag_filters=[TagFilter('MD', ['100'])]),  # tag: default flag filter, MD tag must be '100'
        }
        for name, kwargs in filter_args.items():
            with ReadIterator(bam, chrom, **kwargs) as it:
                it.take()
                stats[name].update(it.stats)
display(stats)

{'all': Counter({('n_reads', '1'): 31678}),
 'def': Counter({('n_reads', '1'): 21932, ('n_fil_flag', '1'): 9746}),
 'mq20': Counter({('n_reads', '1'): 21626,
          ('n_fil_flag', '1'): 9746,
          ('n_fil_mq', '1'): 306}),
 'tag': Counter({('n_fil_tag', '1'): 14544,
          ('n_fil_flag', '1'): 9746,
          ('n_reads', '1'): 7388})}

In [19]:
# Collect T/C mismatches per read in a BAM file, filtering for minimum base quality.
# Keys of tc_conv are (query_name, not is_read2); values are lists of (off, pos1, ref, alt) mismatches.
tc_conv = {}
for loc, (r, mm) in ReadIterator('testdata/small_example.bam', report_mismatches=True, min_base_quality=10):
    if len(mm) == 0:
        continue
    # the strand flag is flipped for read2
    if r.is_read2:
        is_rev = not r.is_reverse
    else:
        is_rev = r.is_reverse
    # on the reverse strand a T>C conversion shows up as A>G
    refc, altc = ("A", "G") if is_rev else ("T", "C")
    mm_tc = [(off, pos1, ref, alt) for off, pos1, ref, alt in mm if ref == refc and alt == altc]
    if len(mm_tc) > 0:
        tc_conv[r.query_name, not r.is_read2] = mm_tc

# show the first 10 reads
display(dict(list(tc_conv.items())[:10]))

# count reads with more than one T/C conversion
display(sum(1 for conversions in tc_conv.values() if len(conversions) > 1))

# show the mismatches of one such read
display(tc_conv['HWI-ST466_135068617:8:2316:4251:54002', False])

{('HWI-ST466_135068617:8:1104:18372:89212', False): [(2, 22379204, 'A', 'G')],
 ('HWI-ST466_135068617:8:2114:8986:54547', False): [(98, 22404991, 'T', 'C')],
 ('HWI-ST466_135068617:8:2214:18036:44964', False): [(86, 22404981, 'A', 'G')],
 ('HWI-ST466_135068617:8:2301:15221:88708', True): [(71, 22404968, 'A', 'G')],
 ('HWI-ST466_135068617:8:1107:12866:40944', False): [(72, 22404971, 'A', 'G')],
 ('HWI-ST466_135068617:8:2101:20690:68324', False): [(78, 22404984, 'A', 'G')],
 ('HWI-ST466_135068617:8:1102:11736:16410', True): [(53, 22404966, 'T', 'C')],
 ('HWI-ST466_135068617:8:1103:20561:79535', True): [(52, 22404966, 'T', 'C')],
 ('HWI-ST466_135068617:8:2308:21226:40688', False): [(25, 22404940, 'A', 'G')],
 ('HWI-ST466_135068617:8:1303:15014:31681', False): [(2, 22379925, 'A', 'G')]}

25

[(2, 22443997, 'A', 'G'), (5, 22444000, 'A', 'G')]

In [20]:
# List the annotated SNPs (3 samples) that overlap features of a region in the flybase GTF; show 5 entries.
gff_file = 'testdata/flybase.dmel-all-r6.51.sorted.gtf.gz'
vcf_file = 'testdata/dmelanogaster_6_exported_20230523.vcf.gz'

feature_it = GFF3Iterator(gff_file, '2L', 574299, 575733)
variant_it = VcfIterator(vcf_file, samples=['DGRP-208', 'DGRP-325', 'DGRP-721'])
with AnnotationIterator(feature_it, variant_it) as it:
    for loc, (v1, v2) in it.take()[:5]:
        display(f"{loc} {v1[0][1]['gene_id']}  {v2}")

Iterating refdict: Refset (size: 25): dict_keys(['211000022278279', '211000022278436', '211000022278449', '211000022278760', '211000022279165', '211000022279188', '211000022279264', '211000022279392', '211000022279681', '211000022280328', '211000022280341', '211000022280347', '211000022280481', '211000022280494', '211000022280703', '2L', '2R', '3L', '3R', '4', 'Unmapped_Scaffold_8_D1580_D1567', 'X', 'Y', 'mitochondrion_genome', 'rDNA']), dict_values([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]) name: References from TABIX file b'testdata/flybase.dmel-all-r6.51.sorted.gtf.gz' 


'2L:574291-574355 (+) FBgn0031268  []'

'2L:574291-575734 (+) FBgn0031268  [(2L:574362-574362, 2L:574362A>G), (2L:574390-574390, 2L:574390T>C), (2L:574410-574410, 2L:574410G>T), (2L:574421-574421, 2L:574421T>A), (2L:574429-574429, 2L:574429C>T), (2L:574500-574500, 2L:574500C>A), (2L:574518-574518, 2L:574518G>A), (2L:574707-574707, 2L:574707A>G), (2L:574708-574708, 2L:574708T>A), (2L:574819-574819, 2L:574819C>T), (2L:574825-574825, 2L:574825G>A), (2L:574834-574834, 2L:574834G>C), (2L:574960-574960, 2L:574960G>A), (2L:575011-575011, 2L:575011G>T), (2L:575086-575086, 2L:575086C>G), (2L:575101-575101, 2L:575101G>A), (2L:575130-575130, 2L:575130G>C), (2L:575333-575333, 2L:575333T>A), (2L:575342-575342, 2L:575342A>C), (2L:575454-575454, 2L:575454G>T), (2L:575575-575575, 2L:575575T>G), (2L:575596-575596, 2L:575596A>G), (2L:575645-575645, 2L:575645C>A), (2L:575650-575650, 2L:575650G>A), (2L:575690-575690, 2L:575690T>C)]'

'2L:574291-575734 (+) FBgn0031268  [(2L:574362-574362, 2L:574362A>G), (2L:574390-574390, 2L:574390T>C), (2L:574410-574410, 2L:574410G>T), (2L:574421-574421, 2L:574421T>A), (2L:574429-574429, 2L:574429C>T), (2L:574500-574500, 2L:574500C>A), (2L:574518-574518, 2L:574518G>A), (2L:574707-574707, 2L:574707A>G), (2L:574708-574708, 2L:574708T>A), (2L:574819-574819, 2L:574819C>T), (2L:574825-574825, 2L:574825G>A), (2L:574834-574834, 2L:574834G>C), (2L:574960-574960, 2L:574960G>A), (2L:575011-575011, 2L:575011G>T), (2L:575086-575086, 2L:575086C>G), (2L:575101-575101, 2L:575101G>A), (2L:575130-575130, 2L:575130G>C), (2L:575333-575333, 2L:575333T>A), (2L:575342-575342, 2L:575342A>C), (2L:575454-575454, 2L:575454G>T), (2L:575575-575575, 2L:575575T>G), (2L:575596-575596, 2L:575596A>G), (2L:575645-575645, 2L:575645C>A), (2L:575650-575650, 2L:575650G>A), (2L:575690-575690, 2L:575690T>C)]'

'2L:574291-574355 (+) FBgn0031268  []'

'2L:574508-575734 (+) FBgn0031268  []'

In [21]:
# Annotate intervals from a BED file with the sum of scores from a bedgraph file.
# Bedgraph intervals >1bp are supported: each score is weighted by its overlap with the BED interval.
bed_file = 'testdata/test.bed.gz'
bedg_file = 'testdata/test.bedgraph.gz'

# NOTE bedgraph file contains interval (1:7-10, 0.3)
with AnnotationIterator(BedIterator(bed_file), BedGraphIterator(bedg_file)) as it:
    scored = []
    for l, (i1, i2) in it:
        name = i1[0][1].name
        total = sum([x[1] * l.overlap(x[0]) for x in i2])
        scored.append((name, total))
    display(scored)

Iterating refdict: Refset (size: 2): dict_keys(['1', '2']), dict_values([None, None]) name: References from TABIX file b'testdata/test.bed.gz' 


[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?
[W::tbx_parse1] Coordinate <= 0 detected. Did you forget to use the -0 option?


[('int1', 1.408), ('int2', 0.3), ('int3', 0)]

## Pileup performance comparison

Here we compare our pileup method to pysam (which adds a lot of boilerplate + additional checks) and reach a 10X performance increase.

In [22]:
# Region used by the pileup cells below; the commented-out lines are alternative regions.
#chrom,start,stop='1',22418229,22418268
#chrom,start,stop='1',22377202,22429853
#chrom,start,stop='1',22378966,22379711
reg=gi('1',22377202,22429853)

In [23]:
%%timeit -n 1 -r 1
ac1=Counter()
with open_file_obj('testdata/small_example.bam') as bam:
    for l,v in FastPileupIterator(bam, reg.chromosome, range(reg.start, reg.end)):
        ac1[l.chromosome, l.start]=v
#display(ac1)

1.75 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
ac2=Counter()
with open_file_obj('testdata/small_example.bam') as bam:
    for pu in bam.pileup(contig=reg.chromosome, start=reg.start-1, stop=reg.end-1, flag_filter=DEFAULT_FLAG_FILTER, 
                         truncate=True,mark_ends=True,add_indels=True,min_base_quality=0,min_mapping_quality=0,
                         ignore_overlaps=False, ignore_orphans=False,
                         max_depth=100000):
        pos=(pu.reference_name, pu.reference_pos+1)
        ac2[pos]=Counter()
        for r in pu.pileups:
            if r.is_refskip:
                continue
            elif r.is_del:
                ac2[pos][None]+=1
            else:
                ac2[pos][r.alignment.query_sequence[r.query_position]]+=1
#display(ac2)

Finally, let's check whether the results match (remove `%%timeit` from the two cells above first).

In [None]:
# Give every position seen in ac1 an (empty) Counter in ac2 if it has none yet
# (a Counter returns 0 for missing keys).
missing_in_ac2 = [pos for pos in ac1 if ac2[pos] == 0]
for pos in missing_in_ac2:
    ac2[pos] = Counter()

if ac1 != ac2:
    # report every position at which the two pileups disagree
    for pos in ac1:
        if ac1[pos] != ac2[pos]:
            print('err', pos, ac1[pos], ac2[pos])
else:
    print("ALL GOOD")

# Transcriptome tests

In [None]:
# Configuration for a subset of the human (GRCh38 / gencode v39) transcriptome.
transcript_filter = {
    'included_tags': ['Ensembl_canonical'],
    'included_genetypes': ['protein_coding'],
}
config = {
    'genome_fa': '/Volumes/groups/ameres/Niko/ref/genomes/GRCh38/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set.fna',
    'annotation_gff': '/Volumes/groups/ameres/Niko/ref/genomes/GRCh38/annotation/gencode.v39.annotation.sorted.gff3.gz',
    'annotation_flavour': 'gencode',
    'transcript_filter': transcript_filter,
    'copied_fields': ['gene_type'],
    'drop_empty_genes': False,
    'load_sequences': False,
}

In [None]:
# Build a subset of the human transcriptome (all protein coding genes, Ensembl canonical only); takes 5-6min.
t = Transcriptome(config)

In [None]:
# We can also save/load annotated transcriptomes using dill (pickle). Saving is slow but loading is fast!
# save/load takes ~25min, ~3GB
# NOTE(review): pickle-based files can execute arbitrary code when loaded; only load files you created yourself.
# t.save('/Users/niko.popitsch/git/pygenlib/testdata/gencode.v39.pk')
t=Transcriptome.load('/Users/niko.popitsch/git/pygenlib/testdata/gencode.v39.pk') # load is fast

In [None]:
# Names of the first 10 genes.
# NOTE(review): uses `t.genes.values()`, whereas the executed flybase cells iterate `t.genes` directly - confirm which API is current.
gene_names = [gene.name for gene in t.genes.values()]
gene_names[:10]

In [None]:
# Map each ACTB transcript id to its number of exons.
# NOTE(review): `get_gene`, `.transcripts.values()`, `.tid` and `.exons` differ from the names used in the
# executed flybase cells (`gene_name[...]`, `.transcript`, `.transcript_id`, `.exon`) - confirm which API is current.
actb_transcripts = t.get_gene('ACTB').transcripts.values()
dict((tx.tid, len(tx.exons)) for tx in actb_transcripts)

In [None]:
# Name of the gene that transcript ENST00000646664.1 belongs to.
# NOTE(review): indexes `t.transcripts[...]`, whereas the executed flybase cells use `t.transcript[...]` - confirm which API is current.
tx = t.transcripts['ENST00000646664.1']
tx.parent.name

In [None]:
# Ids of all transcripts that have an exon overlapping position chr7:5529193.
# NOTE(review): passes the `Exon` class to `t.query`, whereas the executed flybase cells pass the string 'exon' - confirm which API is current.
position = gi('chr7', 5529193, 5529193)
{exon.parent.tid for exon in t.query(position, Exon)}

In [None]:
# Collect the names of all genes for which the kmer occurs in at least one spliced transcript sequence.
# NOTE(review): calls `tx.get_spliced_seq()`, whereas the executed flybase cells call `t.get_spliced_seq(tx)`; the
# config cell of this section also sets 'load_sequences' to False - confirm this cell still runs as intended.
kmer = "ACTGACTGACTG"
genes_with_kmer = set()
for tx in t.transcripts.values():
    if kmer in tx.get_spliced_seq():
        genes_with_kmer.add(tx.parent.name)
genes_with_kmer

In [None]:
# List the parent (transcript) of every exon that overlaps the given region.
region = gi.from_str("chr2:241073384-241073423")
[exon.parent for exon in t.query(region, Exon)]

In [None]:
%%timeit
# list all genes and their up/downstream genes within a given max distance (~63ms)
def get_name(x):
    return None if x is None else x.name
[(get_name(x),get_name(y),get_name(z)) for x, y, z in t.gene_triples(max_dist=10000)]

In [None]:
%%timeit
# iterate all genes and query neighbouring genes +/- 10k bases (1-2sec)
for g in t.genes.values():
    list([(x.location(),x.name) for x in sorted(t.query(gi(g.chromosome, g.start-100000, g.end+100000), Gene))])

In [None]:
# Ids of all transcripts that have an exon overlapping the given genomic position.
position = gi('chr13', 32336891, 32336891)
{exon.parent.tid for exon in t.query(position, Exon)}

# Genomic interval implementation and reference dictionaries

In [25]:
# Genomic intervals (gi) in pygenlib are inclusive and 1-based. Points are represented by intervals with the same start+stop coordinate.
# GIs are implemented as frozen (immutable) dataclasses and can be used, e.g., as keys in a dict.
# They can be instantiated by passing chrom/start/stop coordinates or can be parsed from a string.
# Intervals can be stranded.
# Each component of the coordinates may be None to represent unbounded intervals.
locs = [
    gi('chr2', 1, 100),
    gi.from_str('chr1:5-500 (+)'),
    gi.from_str('chr3:50-120 (-)'),
    gi.from_str('chr1:10-20 (-)'),
    gi(None, None, 1000, '-'),
    gi(None, 10, 1000, '-'),
]
display(sorted(locs))
# Note that the order of intervals from different groups (chromosomes) is left undefined:
display(f"is chr2:1-1>chr1:1-1 defined?: {gi('chr2', 1, 1) > gi('chr1', 1, 1)}")

# To also sort by chromosome, you can use a reference dict, which defines the chromosome order:
refdict = ReferenceDict({'chr1':None, 'chr2':None, 'chr3':None}, 'test', None)
display(refdict)
display(sorted(locs, key=lambda loc: (refdict.index(loc.chromosome), loc)))
# Unbounded intervals (chromosome=None) will always be at the beginning of the list

[None:-inf-1000 (-),
 chr2:1-100,
 chr1:5-500 (+),
 chr3:50-120 (-),
 chr1:10-20 (-),
 None:10-1000 (-)]

'is chr2:1-1>chr1:1-1 defined?: None'

Refset (size: 3): dict_keys(['chr1', 'chr2', 'chr3']), dict_values([None, None, None]) name: test 

[None:-inf-1000 (-),
 None:10-1000 (-),
 chr1:5-500 (+),
 chr1:10-20 (-),
 chr2:1-100,
 chr3:50-120 (-)]

# Utility functions

In [26]:
# Gene symbols are updated regularly and mapping between different id schemas is cumbersome.
# pygenlib implements an interface to MyGeneInfo for easy translation between ids and symbols.
# Example: we pass a mixed list of Ensembl and Entrez ids for mouse and human actin beta:
mixed_ids = ['ENSMUSG00000029580', 60]
geneid2symbol(mixed_ids)

INFO:biothings.client:querying 1-2...
INFO:biothings.client:done.


{'60': ACTB (actin beta, tax: 9606),
 'ENSMUSG00000029580': Actb (actin, beta, tax: 10090)}

In [27]:
# Here we use genenames.org data to convert lists containing partially outdated gene symbols to their current symbols.
gene_name_alias_file = '/Users/niko.popitsch/Desktop/data/projects/Ameres/af2/colabfold_test/annotations/hgnc_complete_set.txt'
aliases, current_symbols = read_alias_file(gene_name_alias_file)
# now let's translate some previous names of AADAC
tuple(norm_gn(old_name, current_symbols, aliases) for old_name in ('DAC', 'CES5A1'))

load gene aliases: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43591/43591 [00:02<00:00, 15905.07it/s]


('AADAC', 'AADAC')

In [28]:
# We also provide a wrapper around some useful Biopython methods, e.g., for sequence alignment.
# The matched slice is taken from `refseq` itself rather than from a second copy of the literal,
# so the printed match cannot drift from the sequence that was actually aligned.
refseq = 'AAATTTCCCACTGAAATTTCCC'
score, startpos, endpos = align_sequence('ACTGactTTTC', refseq, print_alignment=True)
print(f"We found a match from {startpos}-{endpos} ('{refseq[startpos:endpos]}') with normalized alignment score {score}")

---------ACTGactTTTC--
         ||||...||||  
AAATTTCCCACTGAAATTTCCC
  Score=8

We found a match from 9-20 ('ACTGAAATTTC') with normalized alignment score 0.7272727272727273


In [29]:
# There are also utilities to work with Nanopore data (FAST5 files).
# You can, e.g., inspect the structure of such a file:
fast5_file = 'testdata/FAT61995_a1291c8f_5.fast5'
print_fast5_tree(fast5_file, show_attrs=False)

read_00640444-1c5c-495b-b35b-d96f7020b399 ├── Analyses
read_00640444-1c5c-495b-b35b-d96f7020b399 │   ├── Basecall_1D_000 {component=b'basecall_1d',model_type=b'flipflop',model_version_id=b'2020-09-07_rna_r9.4.1_minion_256_8f8fc47b',name=b'ONT Guppy basecalling software.',segmentation=b'Segmentation_000',time_stamp=b'2023-02-17T10:21:00Z',version=b'6.1.7+21b93d1'}
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │   ├── BaseCalled_template
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │   │   ├── Fastq {table_version=b'fastq_record_v0.1'}
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │   │   ├── Move {table_version=b'flipflop_move_table_v0.1'}
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │   │   └── Trace {offset=0.0,scale=0.003921568859368563,table_version=b'flipflop_trace_table_v0.1'}
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │   └── Summary {return_status=b'Workflow Successful'}
read_00640444-1c5c-495b-b35b-d96f7020b399 │   │       └── basecall_1d_template {block_stride=10,called_e

In [30]:
# Or list the basecalling groups contained in this file.
fast5_file = 'testdata/FAT61995_a1291c8f_5.fast5'
get_bcgs(fast5_file)

['Basecall_1D_000']