In [1]:
%%bash

# Accession taken from the assignment.
export GENE="NW_004624772"

# Ask the NCBI `datasets` CLI for a product-level gene summary of $GENE.
# `2>&1` merges stderr into the pipe and `tail -1` keeps only the last line of
# the combined output, which is what gets saved to gene.accession.json.
# NOTE(review): presumably the JSON report is always that last line; if the CLI
# prints a diagnostic after it, the saved file will not be valid JSON - confirm.
/opt/bin/datasets summary gene accession "$GENE" 2>&1 --report product | tail -1 > gene.accession.json

In [2]:
import json
import pathlib as pl

In [3]:
# Load the gene product report saved by the `datasets` call into
# gene.accession.json. The context manager closes the file handle as soon as
# parsing finishes (the bare `.open()` left it to the garbage collector), and
# the explicit encoding keeps the read independent of the kernel's locale.
with pl.Path('gene.accession.json').open(encoding='utf-8') as report_file:
    products = [item['product'] for item in json.load(report_file)['reports']]

# gene_id values are strings, so this ordering is lexicographic rather than
# numeric; later cells pick a gene by index, so the ordering must stay stable.
products.sort(key=lambda x: x['gene_id'])

In [4]:
# Index of the gene to inspect; the assignment lets it be chosen arbitrarily.
GENE_NUMBER = 11
# `products` was sorted by gene_id, so this index depends on that ordering.
gene = products[GENE_NUMBER]
# Bare last expression: the notebook displays the selected gene's id.
gene['gene_id']

'101697269'

In [5]:
# Report how many transcripts and proteins the selected gene record declares.
transcript_total = gene["transcript_count"]
protein_total = gene["protein_count"]
print(f'Всего транскриптов: {transcript_total}; Белок-кодирующих последовательностей: {protein_total}')

Всего транскриптов: 2; Белок-кодирующих последовательностей: 2


In [6]:
# List every transcript of the selected gene that carries a protein record and
# is typed PROTEIN_CODING_MODEL, together with its product name and isoform.
# NOTE(review): the value printed under the "protein_id" label is the
# transcript's own accession_version (the recorded output shows XM_...
# accessions); confirm whether the protein record's accession was intended.
for transcript in gene['transcripts']:
    is_coding_model = 'protein' in transcript and transcript['type'] == 'PROTEIN_CODING_MODEL'
    if is_coding_model:
        protein = transcript["protein"]
        accession = transcript["accession_version"]
        print(f'protein_id: {accession}; product: {protein.get("name")} {protein.get("isoform_name")}')

protein_id: XM_013075423.2; product: adenosine receptor A3 isoform X2
protein_id: XM_004853719.2; product: adenosine receptor A3 isoform X1


In [7]:
%%bash

# Taxon name taken from the assignment.
export TAXON="Streptophyta"

# Ask the NCBI `datasets` CLI for a genome summary of every assembly under
# $TAXON. `2>&1` merges stderr into the pipe and `tail -1` keeps only the last
# line of the combined output, which is saved to taxon.summary.json.
# NOTE(review): presumably the JSON report is always that last line - confirm.
/opt/bin/datasets summary genome taxon "$TAXON" 2>&1 | tail -1 > taxon.summary.json

In [8]:
# Load the genome reports saved by the `datasets` call into taxon.summary.json.
# The context manager closes the file handle once parsing is done (the bare
# `.open()` never closed it explicitly), and the explicit encoding keeps the
# read independent of the kernel's locale.
with pl.Path('taxon.summary.json').open(encoding='utf-8') as summary_file:
    raw_taxon_reports = json.load(summary_file)['reports']

In [9]:
# Tally the reports whose source database is RefSeq and the reports that carry
# an annotation_info section.
refseq = sum(
    1
    for report in raw_taxon_reports
    if report['source_database'] == 'SOURCE_DATABASE_REFSEQ'
)
annotated = sum(1 for report in raw_taxon_reports if 'annotation_info' in report)

# Number of reports per organism tax_id; reports without a tax_id are skipped.
multiassempled_count = {}
for report in raw_taxon_reports:
    taxid = report.get('organism', {}).get('tax_id')
    if taxid is None:
        continue
    multiassempled_count[taxid] = 1 + multiassempled_count.get(taxid, 0)

print(f'Имеют записи в RefSeq: {refseq}; Имеют аннотацию: {annotated}')

Имеют записи в RefSeq: 202; Имеют аннотацию: 673


In [10]:
# Group the reports by tax_id, keeping only organisms that were counted at
# least twice in multiassempled_count (i.e. that have several assemblies).
multiassempled = {}
for report in raw_taxon_reports:
    taxid = report.get('organism', {}).get('tax_id')
    if multiassempled_count.get(taxid, 0) >= 2:
        multiassempled.setdefault(taxid, []).append(report)

In [11]:
# Position of the organism to inspect within the sorted list of tax_ids that
# have more than one assembly.
TAXID = 14
taxids = sorted(multiassempled.keys())

# Assemblies of the chosen organism, ordered by accession.
assembles = sorted(multiassempled[taxids[TAXID]], key=lambda x: x['accession'])
organism_name = assembles[0]["organism"]["organism_name"]
print(f'Всего сборок для {organism_name}: {len(assembles)}')

# Header row followed by one row per assembly; a missing statistic shows as 0.
table = [['accession','level','contig N50','contig L50','scaffold N50','scaffold L50']]
for assembly in assembles:
    row = [assembly["accession"], assembly["assembly_info"]["assembly_level"]]
    stats = assembly["assembly_stats"]
    row.extend(
        stats.get(stat_key, 0)
        for stat_key in ("contig_n50", "contig_l50", "scaffold_n50", "scaffold_l50")
    )
    table.append(row)
table

Всего сборок для Ficus carica: 5


[['accession',
  'level',
  'contig N50',
  'contig L50',
  'scaffold N50',
  'scaffold L50'],
 ['GCA_002002945.1', 'Scaffold', 4568, 13404, 166092, 374],
 ['GCA_009761775.1', 'Chromosome', 823517, 121, 19846527, 7],
 ['GCA_033242285.1', 'Scaffold', 28777, 3106, 1066773, 69],
 ['GCA_042847195.1', 'Chromosome', 2043694, 48, 24737479, 6],
 ['GCA_042854195.1', 'Chromosome', 2369984, 44, 23853041, 6]]

In [12]:
%%bash

# Taxonomy id taken from the assignment.
export TAXONOMY="274614"

# Ask the NCBI `datasets` CLI for a taxonomy summary of $TAXONOMY, passing the
# --children flag. `2>&1` merges stderr into the pipe and `tail -1` keeps only
# the last line of the combined output, which is saved to taxonomy.summary.json.
# NOTE(review): presumably the JSON report is always that last line - confirm.
/opt/bin/datasets summary taxonomy taxon "$TAXONOMY" --children 2>&1 | tail -1 > taxonomy.summary.json

In [13]:
# Load the taxonomy summary saved by the `datasets` call and keep the
# classification of the first report. The context manager closes the file
# handle once parsing is done (the bare `.open()` never closed it explicitly),
# and the explicit encoding keeps the read independent of the kernel's locale.
# NOTE(review): the summary was requested with --children, so it may hold
# several reports; confirm that the first one is the intended taxon.
with pl.Path('taxonomy.summary.json').open(encoding='utf-8') as summary_file:
    taxonomy = json.load(summary_file)['reports'][0]['taxonomy']['classification']

In [14]:
# Ranks to print, from the broadest to the most specific.
order = [
    "superkingdom", "kingdom", "phylum", "class",
    "order", "family", "genus", "species",
]

# Print the name recorded in the classification for each rank, one per line.
for rank in order:
    rank_entry = taxonomy[rank]
    print(rank_entry['name'])


Eukaryota
Metazoa
Arthropoda
Insecta
Orthoptera
Acrididae
Schistocerca
Schistocerca cancellata
