In [1]:
import set_working_directory

In [2]:
from cogent3.core.annotation_db import BasicAnnotationDb

# Construct an annotation database with no arguments; the repr shown as the
# cell output reports an in-memory source (':memory:') and zero records.
anno_db = BasicAnnotationDb()
anno_db

BasicAnnotationDb(source=':memory:', total_records=0)

In [3]:
from cogent3 import load_annotations

# Load annotations from a GFF file. Per the cell output, the result is a
# GffAnnotationDb holding 1169 records.
gff_db = load_annotations(path="data/mycoplasma-genitalium.gff")
gff_db

GffAnnotationDb(source=':memory:', total_records=1169)

In [4]:
from cogent3 import load_annotations

# The same loader is given a GenBank file here. Per the cell output, the
# result is a GenbankAnnotationDb (seqid 'NC_000908') holding 1127 records.
gb_db = load_annotations(path="data/mycoplasma-genitalium.gb")
gb_db

GenbankAnnotationDb(seqid='NC_000908', source=':memory:', namer=None, total_records=1127)

In [5]:
# `describe` is read as an attribute (no call). The resulting table lists a
# count for the seqid and for each biotype present in the database.
summary = gff_db.describe
summary

Unnamed: 0,count
seqid('NC_000908.2'),1169
biotype('CDS'),521
biotype('RNase_P_RNA'),1
biotype('SRP_RNA'),1
biotype('exon'),42
biotype('gene'),546
biotype('pseudogene'),17
biotype('rRNA'),3
biotype('region'),1
biotype('tRNA'),36


In [6]:
 anno_db.add_feature(
           seqid="NC_000908",
           biotype="gene",
           name="interesting_gene",
           spans=[(1, 4)],
           strand="+",
            )
anno_db.describe

Unnamed: 0,count
seqid('NC_000908'),1
biotype('gene'),1
num_rows('user'),1


In [7]:
# User-defined features can also be added to a database loaded from file.
# Note this modifies gff_db in place, so re-running the cell adds another row.
gff_db.add_feature(
    seqid="seq1", biotype="gene", name="interesting_gene",
    spans=[(1, 4)], strand="+",
)
# Only the final two rows of the summary are displayed: the per-source row
# counts ('gff' versus 'user').
gff_db.describe[-2:, :]

Unnamed: 0,count
num_rows('gff'),1169
num_rows('user'),1


In [8]:
# Query the GenBank-derived database for one gene by name, biotype and seqid.
# The query result is iterated into a list so that it can be displayed.
query_16s = gb_db.get_features_matching(
    name="MG_RS00775",
    biotype="gene",
    seqid="NC_000908",
)
mg_16s = list(query_16s)
mg_16s

[{'seqid': 'NC_000908',
  'biotype': 'gene',
  'spans': [(170011, 171529)],
  'name': 'MG_RS00775',
  'on_alignment': None,
  'reversed': False}]

In [9]:
# Select features by biotype alone; the result is materialised as a list so
# that it can be sliced for display.
pseudogenes = list(gff_db.get_features_matching(biotype="pseudogene"))
pseudogenes[:2] # showing just the first two

[{'seqid': 'NC_000908.2',
  'biotype': 'pseudogene',
  'spans': [(85561, 86589)],
  'name': 'gene-MG_RS02910',
  'on_alignment': None,
  'reversed': False},
 {'seqid': 'NC_000908.2',
  'biotype': 'pseudogene',
  'spans': [(86784, 87549)],
  'name': 'gene-MG_RS00385',
  'on_alignment': None,
  'reversed': False}]

In [10]:
# Restrict the search to CDS features within a coordinate window. The three
# features in the output all have spans lying inside 220600-229067.
cds_in_window = gff_db.get_features_matching(
    biotype="CDS",
    start=220600,
    end=229067,
)
operon_cds = list(cds_in_window)
operon_cds

[{'seqid': 'NC_000908.2',
  'biotype': 'CDS',
  'spans': [(220606, 221563)],
  'name': 'cds-WP_009886005.1',
  'on_alignment': None,
  'reversed': False},
 {'seqid': 'NC_000908.2',
  'biotype': 'CDS',
  'spans': [(221569, 225904)],
  'name': 'cds-WP_010869366.1',
  'on_alignment': None,
  'reversed': False},
 {'seqid': 'NC_000908.2',
  'biotype': 'CDS',
  'spans': [(225914, 229067)],
  'name': 'cds-WP_041593683.1',
  'on_alignment': None,
  'reversed': False}]

In [11]:
# get_records_matching yields full records (source, score, phase, attributes,
# ...) rather than the trimmed feature dicts seen in the earlier queries.
# NOTE(review): `attributes="replication"` appears to be a text match against
# the attributes field (the displayed record contains "DNA replication");
# confirm the exact matching rule against the cogent3 documentation.
replication_query = gff_db.get_records_matching(
    biotype="CDS",
    attributes="replication",
)
replication_records = list(replication_query)
# only the first matching record is displayed
replication_records[0]

{'seqid': 'NC_000908.2',
 'source': 'Protein Homology',
 'biotype': 'CDS',
 'start': 685,
 'end': 1828,
 'score': '.',
 'strand': '+',
 'phase': '0',
 'attributes': 'ID=cds-WP_009885562.1;Parent=gene-MG_RS00005;Dbxref=Genbank:WP_009885562.1;Name=WP_009885562.1;Ontology_term=GO:0006260,GO:0003887,GO:0009360;gbkey=CDS;gene=dnaN;go_component=DNA polymerase III complex|0009360||IEA;go_function=DNA-directed DNA polymerase activity|0003887||IEA;go_process=DNA replication|0006260||IEA;inference=COORDINATES: similar to AA sequence:RefSeq:WP_010874358.1;locus_tag=MG_RS00005;product=DNA polymerase III subunit beta;protein_id=WP_009885562.1;transl_table=4',
 'comments': None,
 'spans': array([[ 685, 1828]]),
 'name': 'cds-WP_009885562.1',
 'parent_id': 'gene-MG_RS00005'}

In [12]:
# Count the records whose biotype is "gene" in the GenBank-derived database.
gb_db.num_matches(biotype="gene")

563

In [13]:
# Tabulate gene records by name, then keep only the rows whose count is
# exactly one. The length of that selection is the cell's displayed value.
gene_name_counts = gb_db.count_distinct(biotype="gene", name=True)
occurs_once = gene_name_counts.columns["count"] == 1
single_copy = gene_name_counts[occurs_once, :]
len(single_copy)

561

In [14]:
# Same comparison for the GFF-derived database: the number of gene records
# versus the number of gene names that occur exactly once.
total_genes = gff_db.num_matches(biotype="gene")
genes = gff_db.count_distinct(biotype="gene", name=True)
occurs_once = genes.columns["count"] == 1
single_copy = genes[occurs_once, :]

print("total genes: ", total_genes)
print("single copy genes: ", len(single_copy))

total genes:  547
single copy genes:  547


In [15]:
# Look up the child features of a named gene; for this gene the output shows
# a single CDS with the same span as the gene.
children = list(gff_db.get_feature_children(name="gene-MG_RS00035"))
children

[{'seqid': 'NC_000908.2',
  'biotype': 'CDS',
  'spans': [(9155, 9920)],
  'name': 'cds-WP_009885556.1',
  'on_alignment': None,
  'reversed': False}]

In [16]:
# The reverse lookup: starting from the CDS name returned by the
# get_feature_children example, the output is the gene queried there.
parents = list(gff_db.get_feature_parent(name="cds-WP_009885556.1"))
parents

[{'seqid': 'NC_000908.2',
  'biotype': 'gene',
  'spans': [(9155, 9920)],
  'name': 'gene-MG_RS00035',
  'on_alignment': None,
  'reversed': False}]

In [17]:
# GFF database checked against the BasicAnnotationDb: the output is True.
# NOTE(review): presumably `compatible` reports whether the two databases can
# be merged via union/update; confirm against the cogent3 documentation.
gff_db.compatible(anno_db)

True

In [18]:
# GFF database checked against the GenBank database: the output is False.
gff_db.compatible(gb_db)

False

In [19]:
# `union` hands back a database that is bound to a new name here. Its summary
# (last two rows only) reports the 1127 'gb' rows plus the 1 'user' row.
union_db = gb_db.union(anno_db)
union_db.describe[-2:, :]

Unnamed: 0,count
num_rows('gb'),1127
num_rows('user'),1


In [20]:
# `update` is called for its effect on gff_db itself (the return value is not
# used): the 'user' row count shown below rises from 1 to 2.
gff_db.update(anno_db)
gff_db.describe[-2:, :]

Unnamed: 0,count
num_rows('gff'),1169
num_rows('user'),2


In [21]:
from cogent3.core.annotation_db import GenbankAnnotationDb

# Build a GenbankAnnotationDb from the existing anno_db, with `source` set to
# a file name instead of the default ':memory:'. The final cell of this
# notebook deletes a file of that name, so this is expected to write to disk.
new_gb_db = GenbankAnnotationDb(source="m-genitalium-database.gbdb", db=anno_db)
new_gb_db

GenbankAnnotationDb(seqid='<multiple seqids>', source='m-genitalium-database.gbdb', namer=None, total_records=1)

In [22]:
from cogent3 import make_seq

# The sequence name is the same string used as the seqid of the feature that
# was added to anno_db earlier ("NC_000908").
seq1 = make_seq(
    "AAGAAGAAGACCCCCAAAAAAAAAATTTTTTTTTTAAAAAGGGAACCCT",
    name="NC_000908",
    moltype="dna",
)

# Attach the existing database to the sequence by assigning the attribute;
# reading it back shows the BasicAnnotationDb with its single record.
seq1.annotation_db = anno_db
seq1.annotation_db

BasicAnnotationDb(source=':memory:', total_records=1)

In [23]:
from cogent3 import load_seq

# Loading a GenBank file as a sequence also gives it an annotation_db: the
# output reports a GenbankAnnotationDb with 1127 records, the same count that
# load_annotations reported for this file.
gb_seq = load_seq("data/mycoplasma-genitalium.gb")
gb_seq.annotation_db

GenbankAnnotationDb(seqid='NC_000908', source=':memory:', namer=None, total_records=1127)

In [24]:
# Load a FASTA sequence and point load_seq at a separate GFF annotation file.
# NOTE: with these arguments the output reports total_records=0, whereas the
# next cell, which additionally passes label_to_name, reports 1169.
# Presumably the full FASTA label does not match the GFF seqid; confirm
# against the data file.
gff_seq = load_seq(
    "data/mycoplasma-genitalium.fa",
    annotation_path="data/mycoplasma-genitalium.gff",
)
gff_seq.annotation_db

GffAnnotationDb(source=':memory:', total_records=0)

In [25]:
# Same load as the previous cell, plus a label_to_name callable that keeps
# only the first whitespace-delimited token of the sequence label. With it,
# the output reports all 1169 GFF records.
seq = load_seq(
    "data/mycoplasma-genitalium.fa",
    annotation_path="data/mycoplasma-genitalium.gff",
    label_to_name=lambda x: x.split()[0],
)
seq.annotation_db

GffAnnotationDb(source=':memory:', total_records=1169)

In [26]:
import pathlib

# Clean up: remove the file named as `source` in the GenbankAnnotationDb
# example above. missing_ok=True keeps this cell safe to re-run: it does not
# raise FileNotFoundError if the file was never created or is already gone.
path = pathlib.Path("m-genitalium-database.gbdb")
path.unlink(missing_ok=True)