In [1]:
import pyparanoid.genomedb as gdb

In [2]:
### The pyparanoid.genomedb module contains functions for downloading and
### organizing genomic data from Ensembl, NCBI, and local sources.

### Protein FASTA files are stored in the subfolder 'pep', and metadata is
### kept in the flat file 'genome_metadata.txt'.

### Folders are also created for DNA FASTA and GenBank files, but they are
### not populated from Ensembl or NCBI sources, to save disk space.


### Initialize a folder for the genome database.
gdb.setupdirs("../test_genomedb")

Setting up test_genomedb


In [3]:
### The only required argument is the path to the folder for genomic data.
### Only amino acid (AA) FASTA files are downloaded, along with some metadata.

### By default, only 10 complete genomes that haven't already been
### downloaded are fetched at a time.

gdb.download_Ensembl_files("../test_genomedb")

Current release of EnsemblBacteria: release-37
10 new genomes to download found...exiting JSON parser...
	30 total JSON records parsed...
0 found in test_genomedb.
10 remaining to download.
Downloading 10 genome files...


In [4]:
### You can also download every genome whose name matches a given term.
### Anything with a matching genus or species is downloaded - spelling counts!

### Set maxgen=None to download all genomes that fit your criteria.
### Note that this parses the whole Ensembl database and takes a while.

### Set complete=False to download draft genomes as well.

gdb.download_Ensembl_files(
    "../test_genomedb",
    maxgen=None,
    names="syringae,stutzeri,fluorescens",
    complete=False)

Current release of EnsemblBacteria: release-37
	10000 JSON records parsed.
	20000 JSON records parsed.
	30000 JSON records parsed.
	40000 JSON records parsed.
	44048 total JSON records parsed...
10 found in test_genomedb.
184 remaining to download.
Downloading 184 genome files...
	100 files downloaded.


In [6]:
### Alternatively, you can select genomes by taxonomy ID.
### Consult http://bacteria.ensembl.org/species.html

### Setting maxgen to an integer limits the number of genomes downloaded.
### This can be useful for avoiding time-out errors.

gdb.download_Ensembl_files(
    "../test_genomedb",
    maxgen=5,
    taxids="178900,178901",
    complete=False)

Current release of EnsemblBacteria: release-37
5 new genomes to download found...exiting JSON parser...
	17 total JSON records parsed...
194 found in test_genomedb.
5 remaining to download.
Downloading 5 genome files...


In [7]:
### You can also download files from the NCBI RefSeq database.

### Specify species names and taxids as comma-separated strings.

### Use 'cpus' to download genomes with multiple threads (default = 1).

gdb.download_Refseq_files(
    "../test_genomedb",
    cpus=4,
    names="herbaspirillum,azospirillum",
    taxids="294,178900,178901")

Downloading files for herbaspirillum...
	working on fasta files...
	working on protein-fasta files...
	working on assembly-stats files...
Downloading files for azospirillum...
	working on fasta files...
	working on protein-fasta files...
	working on assembly-stats files...
Downloading files for 294...
	working on fasta files...
	working on protein-fasta files...
	working on assembly-stats files...
Downloading files for 178900...
	working on fasta files...
	working on protein-fasta files...
	working on assembly-stats files...
Downloading files for 178901...
	working on fasta files...
	working on protein-fasta files...
	working on assembly-stats files...
133 genomes to process.
100 processed...
133 genomes processed and 97 added to test_genomedb.


In [8]:
### Once a genomic database folder has been initialized with Ensembl or Genbank data,
### in-house genomes annotated with Prokka can be added to it.

### Provide the path to the genomedb, the path to the Prokka folder, and a species id.
### The species id must not already be in use in the genomedb.

gdb.add_Prokka_genome(
    "../test_genomedb",
    "../../assemblies/WCS365_prokka",
    "pseudomonas_sp_wcs365")

Species ID is unique! Moving on...
Copying files for pseudomonas_sp_wcs365


In [9]:
### Use the 'taxid' argument to attach an NCBI taxonomy code. If none is specified,
### it defaults to "2" for the Bacteria kingdom.

gdb.add_Prokka_genome(
    "../test_genomedb",
    "../../assemblies/CH267_prokka",
    "pseudomonas_sp_ch267",
    taxid="294")

Species ID is unique! Moving on...
Copying files for pseudomonas_sp_ch267


In [10]:
### You can also retrieve taxonomic information for each genome in the database.
### This is stored in the flat file 'tax_info.txt'.

gdb.get_taxonomy("../test_genomedb")

Extracting 298 taxonomy records from Entrez-NCBI...
200 records remaining...
100 records remaining...
0 records remaining...
Done!
