# GFF2Parquet Testing Notebook
This notebook demonstrates the various features of the `gff2parquet` CLI tool.




## Setup and Data Paths

In [None]:
from pathlib import Path
import subprocess
import sys
import os
import polars as pl
import pyarrow.parquet as pq


# Define data paths
DATA_DIR = Path("data")
GFF_DIR = DATA_DIR / "downloaded_gff"
FASTA_DIR = DATA_DIR / "downloaded_fasta"
OUTPUT_DIR = DATA_DIR / "test_outputs"

# Create output directory
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# print(f"current directory: {os.getcwd()}")
# Helper function to run CLI commands
def run_gff2parquet(args):
    """Run gff2parquet command and print output."""
    cmd = ["gff2parquet"] + args
    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print(result.stderr, file=sys.stderr)
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}", file=sys.stderr)
    return result

current directory: /clusterfs/jgi/scratch/science/metagen/neri/code/blits/gff2parquet/data


## 1. Print Command - Inspect GFF Files
Let's start by examining the first GFF file to understand its structure.


In [2]:
# Print first 10 rows of a GFF file
run_gff2parquet([
    "print",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--head", "10",
    "--format", "table"
])

Running: gff2parquet print data/downloaded_gff/groupI_GCA_000859985.2.gff --head 10 --format table
shape: (10, 10)
┌────────────┬─────────┬───────────────┬───────┬───┬────────┬───────┬───────────────┬──────────────┐
│ seqid      ┆ source  ┆ type          ┆ start ┆ … ┆ strand ┆ phase ┆ attributes    ┆ source_file  │
│ ---        ┆ ---     ┆ ---           ┆ ---   ┆   ┆ ---    ┆ ---   ┆ ---           ┆ ---          │
│ str        ┆ str     ┆ str           ┆ u32   ┆   ┆ str    ┆ u32   ┆ list[struct[2 ┆ str          │
│            ┆         ┆               ┆       ┆   ┆        ┆       ┆ ]]            ┆              │
╞════════════╪═════════╪═══════════════╪═══════╪═══╪════════╪═══════╪═══════════════╪══════════════╡
│ JN555585.1 ┆ Genbank ┆ region        ┆ 1     ┆ … ┆ +      ┆ null  ┆ [{"ID","JN555 ┆ data/downloa │
│            ┆         ┆               ┆       ┆   ┆        ┆       ┆ 585.1:1..1522 ┆ ded_gff/grou │
│            ┆         ┆               ┆       ┆   ┆        ┆       ┆ 22"}…  

Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff



CompletedProcess(args=['gff2parquet', 'print', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--head', '10', '--format', 'table'], returncode=0, stdout='shape: (10, 10)\n┌────────────┬─────────┬───────────────┬───────┬───┬────────┬───────┬───────────────┬──────────────┐\n│ seqid      ┆ source  ┆ type          ┆ start ┆ … ┆ strand ┆ phase ┆ attributes    ┆ source_file  │\n│ ---        ┆ ---     ┆ ---           ┆ ---   ┆   ┆ ---    ┆ ---   ┆ ---           ┆ ---          │\n│ str        ┆ str     ┆ str           ┆ u32   ┆   ┆ str    ┆ u32   ┆ list[struct[2 ┆ str          │\n│            ┆         ┆               ┆       ┆   ┆        ┆       ┆ ]]            ┆              │\n╞════════════╪═════════╪═══════════════╪═══════╪═══╪════════╪═══════╪═══════════════╪══════════════╡\n│ JN555585.1 ┆ Genbank ┆ region        ┆ 1     ┆ … ┆ +      ┆ null  ┆ [{"ID","JN555 ┆ data/downloa │\n│            ┆         ┆               ┆       ┆   ┆        ┆       ┆ 585.1:1..1522 ┆ ded_gff/grou │\n│         

### Show statistics about feature types


In [3]:
run_gff2parquet([
    "print",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--stats"
])

Running: gff2parquet print data/downloaded_gff/groupI_GCA_000859985.2.gff --stats
shape: (316, 10)
┌────────────┬─────────┬───────────────┬────────┬───┬────────┬───────┬──────────────┬──────────────┐
│ seqid      ┆ source  ┆ type          ┆ start  ┆ … ┆ strand ┆ phase ┆ attributes   ┆ source_file  │
│ ---        ┆ ---     ┆ ---           ┆ ---    ┆   ┆ ---    ┆ ---   ┆ ---          ┆ ---          │
│ str        ┆ str     ┆ str           ┆ u32    ┆   ┆ str    ┆ u32   ┆ list[struct[ ┆ str          │
│            ┆         ┆               ┆        ┆   ┆        ┆       ┆ 2]]          ┆              │
╞════════════╪═════════╪═══════════════╪════════╪═══╪════════╪═══════╪══════════════╪══════════════╡
│ JN555585.1 ┆ Genbank ┆ region        ┆ 1      ┆ … ┆ +      ┆ null  ┆ [{"ID","JN55 ┆ data/downloa │
│            ┆         ┆               ┆        ┆   ┆        ┆       ┆ 5585.1:1..15 ┆ ded_gff/grou │
│            ┆         ┆               ┆        ┆   ┆        ┆       ┆ 2222"}…      ┆ pI_GCA…

Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff

--- Statistics ---
Total rows: 316
Total columns: 10

Feature types:
shape: (12, 2)
┌───────────────────────┬───────┐
│ type                  ┆ count │
│ ---                   ┆ ---   │
│ str                   ┆ u32   │
╞═══════════════════════╪═══════╡
│ CDS                   ┆ 82    │
│ gene                  ┆ 79    │
│ polyA_signal_sequence ┆ 53    │
│ exon                  ┆ 27    │
│ repeat_region         ┆ 26    │
│ …                     ┆ …     │
│ inverted_repeat       ┆ 5     │
│ stem_loop             ┆ 3     │
│ sequence_feature      ┆ 2     │
│ ncRNA                 ┆ 2     │
│ region                ┆ 1     │
└───────────────────────┴───────┘



CompletedProcess(args=['gff2parquet', 'print', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--stats'], returncode=0, stdout='shape: (316, 10)\n┌────────────┬─────────┬───────────────┬────────┬───┬────────┬───────┬──────────────┬──────────────┐\n│ seqid      ┆ source  ┆ type          ┆ start  ┆ … ┆ strand ┆ phase ┆ attributes   ┆ source_file  │\n│ ---        ┆ ---     ┆ ---           ┆ ---    ┆   ┆ ---    ┆ ---   ┆ ---          ┆ ---          │\n│ str        ┆ str     ┆ str           ┆ u32    ┆   ┆ str    ┆ u32   ┆ list[struct[ ┆ str          │\n│            ┆         ┆               ┆        ┆   ┆        ┆       ┆ 2]]          ┆              │\n╞════════════╪═════════╪═══════════════╪════════╪═══╪════════╪═══════╪══════════════╪══════════════╡\n│ JN555585.1 ┆ Genbank ┆ region        ┆ 1      ┆ … ┆ +      ┆ null  ┆ [{"ID","JN55 ┆ data/downloa │\n│            ┆         ┆               ┆        ┆   ┆        ┆       ┆ 5585.1:1..15 ┆ ded_gff/grou │\n│            ┆         ┆           

### Filter and display only CDS features


In [4]:
run_gff2parquet([
    "print",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--type", "CDS",
    "--head", "5",
    "--columns", "seqid,type,start,end,strand",
    "--format", "csv"
])

Running: gff2parquet print data/downloaded_gff/groupI_GCA_000859985.2.gff --type CDS --head 5 --columns seqid,type,start,end,strand --format csv
seqid	type	start	end	strand
JN555585.1	CDS	513	1259	+
JN555585.1	CDS	2262	2318	+
JN555585.1	CDS	3084	3750	+
JN555585.1	CDS	3887	5490	+
JN555585.1	CDS	9338	10012	+



Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff



CompletedProcess(args=['gff2parquet', 'print', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--type', 'CDS', '--head', '5', '--columns', 'seqid,type,start,end,strand', '--format', 'csv'], returncode=0, stdout='seqid\ttype\tstart\tend\tstrand\nJN555585.1\tCDS\t513\t1259\t+\nJN555585.1\tCDS\t2262\t2318\t+\nJN555585.1\tCDS\t3084\t3750\t+\nJN555585.1\tCDS\t3887\t5490\t+\nJN555585.1\tCDS\t9338\t10012\t+\n', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\n")


## 2. Convert Command - GFF to Parquet
Convert individual GFF files to Parquet format for efficient processing.


In [5]:
run_gff2parquet([
    "convert",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "-o", str(OUTPUT_DIR / "groupI.parquet")
])

Running: gff2parquet convert data/downloaded_gff/groupI_GCA_000859985.2.gff -o data/test_outputs/groupI.parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Writing Parquet to data/test_outputs/groupI.parquet...
Done!



CompletedProcess(args=['gff2parquet', 'convert', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '-o', 'data/test_outputs/groupI.parquet'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nWriting Parquet to data/test_outputs/groupI.parquet...\nDone!\n")

### Convert with column normalization


In [27]:
run_gff2parquet([
    "convert",
    str(GFF_DIR / "groupII_GCA_031099375.1.gff"),
    "--normalize",
    "-o", str(OUTPUT_DIR / "groupII_normalized.parquet")
])
print(f"headers from reading the output file: {pq.read_metadata(OUTPUT_DIR / 'groupII_normalized.parquet')}")

Running: gff2parquet convert data/downloaded_gff/groupII_GCA_031099375.1.gff --normalize -o data/test_outputs/groupII_normalized.parquet
headers from reading the output file: <pyarrow._parquet.FileMetaData object at 0x7fa3f842d6d0>
  created_by: Polars
  num_columns: 11
  num_rows: 7
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2060


Found 1 file(s) matching pattern 'data/downloaded_gff/groupII_GCA_031099375.1.gff'
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Normalizing column names...
Writing Parquet to data/test_outputs/groupII_normalized.parquet...
Done!




### Convert with coordinate shifting (0-based to 1-based)


In [7]:
run_gff2parquet([
    "convert",
    str(GFF_DIR / "groupIII_GCA_000880735.1.gff"),
    "--shift-start", "1",
    "-o", str(OUTPUT_DIR / "groupIII_shifted.parquet")
])

Running: gff2parquet convert data/downloaded_gff/groupIII_GCA_000880735.1.gff --shift-start 1 -o data/test_outputs/groupIII_shifted.parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupIII_GCA_000880735.1.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Shifting coordinates (start: 1, end: 0)...
Writing Parquet to data/test_outputs/groupIII_shifted.parquet...
Done!



CompletedProcess(args=['gff2parquet', 'convert', 'data/downloaded_gff/groupIII_GCA_000880735.1.gff', '--shift-start', '1', '-o', 'data/test_outputs/groupIII_shifted.parquet'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupIII_GCA_000880735.1.gff'\nScanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff\nShifting coordinates (start: 1, end: 0)...\nWriting Parquet to data/test_outputs/groupIII_shifted.parquet...\nDone!\n")


## 3. Merge Command - Combine Multiple GFF Files

Merge all GFF files from a directory into a single file.


In [28]:
run_gff2parquet([
    "merge",
    str(GFF_DIR / "*.gff"),
    "-o", str(OUTPUT_DIR / "all_merged.parquet")
])
print(f"total number of records from reading the output file: {pq.read_metadata(OUTPUT_DIR / 'all_merged.parquet').num_rows}")

Running: gff2parquet merge data/downloaded_gff/*.gff -o data/test_outputs/all_merged.parquet
total number of records from reading the output file: 444


Merging 1 input pattern(s)...
Found 8 file(s) matching pattern 'data/downloaded_gff/*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Scanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff
Scanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff
Scanning: data/downloaded_gff/groupV_GCA_053294245.1.gff
Scanning: data/downloaded_gff/groupcirular_rna_GCA_050924405.1.gff
Writing Parquet to data/test_outputs/all_merged.parquet...
Done!



### Merge with normalization and output as csv

In [None]:
run_gff2parquet([
    "merge",
    str(GFF_DIR / "group*.gff"),
    "--normalize",
    "-f", "csv",  # Changed from "pyarrow" to "csv"
    "-o", str(OUTPUT_DIR / "merged_normalized.csv")
])

Running: gff2parquet merge data/downloaded_gff/group*.gff --normalize -f csv -o data/test_outputs/merged_normalized.csv


Merging 1 input pattern(s)...
Found 8 file(s) matching pattern 'data/downloaded_gff/group*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Scanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff
Scanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff
Scanning: data/downloaded_gff/groupV_GCA_053294245.1.gff
Scanning: data/downloaded_gff/groupcirular_rna_GCA_050924405.1.gff
Normalizing column names...
Writing CSV...
Done!



CompletedProcess(args=['gff2parquet', 'merge', 'data/downloaded_gff/group*.gff', '--normalize', '-f', 'csv', '-o', 'data/test_outputs/merged_normalized.csv'], returncode=0, stdout='', stderr="Merging 1 input pattern(s)...\nFound 8 file(s) matching pattern 'data/downloaded_gff/group*.gff'\nScanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff\nScanning: data/downloaded_gff/groupII_GCA_031099375.1.gff\nScanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nScanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff\nScanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff\nScanning: data/downloaded_gff/groupV_GCA_053294245.1.gff\nScanning: data/downloaded_gff/groupcirular_rna_GCA_050924405.1.gff\nNormalizing column names...\nWriting CSV...\nDone!\n")


## 4. Filter Command - Extract Specific Features




### Filter CDS features only

In [11]:
run_gff2parquet([
    "filter",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--type", "CDS",
    "-o", str(OUTPUT_DIR / "cds_only.parquet")
])

Running: gff2parquet filter data/downloaded_gff/groupI_GCA_000859985.2.gff --type CDS -o data/test_outputs/cds_only.parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Writing Parquet to data/test_outputs/cds_only.parquet...
Done!



CompletedProcess(args=['gff2parquet', 'filter', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--type', 'CDS', '-o', 'data/test_outputs/cds_only.parquet'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nWriting Parquet to data/test_outputs/cds_only.parquet...\nDone!\n")

### Filter by minimum length

In [12]:
run_gff2parquet([
    "filter",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--type", "CDS",
    "--min-length", "500",
    "-o", str(OUTPUT_DIR / "long_cds.csv"),
    "-f", "csv"
])

Running: gff2parquet filter data/downloaded_gff/groupI_GCA_000859985.2.gff --type CDS --min-length 500 -o data/test_outputs/long_cds.csv -f csv


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Writing CSV...
Done!



CompletedProcess(args=['gff2parquet', 'filter', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--type', 'CDS', '--min-length', '500', '-o', 'data/test_outputs/long_cds.csv', '-f', 'csv'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nWriting CSV...\nDone!\n")

### Filter by strand and length range

In [13]:
run_gff2parquet([
    "filter",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--type", "gene",
    "--strand", "+",
    "--min-length", "300",
    "--max-length", "3000",
    "-o", str(OUTPUT_DIR / "filtered_genes.parquet")
])


Running: gff2parquet filter data/downloaded_gff/groupI_GCA_000859985.2.gff --type gene --strand + --min-length 300 --max-length 3000 -o data/test_outputs/filtered_genes.parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Writing Parquet to data/test_outputs/filtered_genes.parquet...
Done!



CompletedProcess(args=['gff2parquet', 'filter', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--type', 'gene', '--strand', '+', '--min-length', '300', '--max-length', '3000', '-o', 'data/test_outputs/filtered_genes.parquet'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nWriting Parquet to data/test_outputs/filtered_genes.parquet...\nDone!\n")


## 5. Split Command - Separate by Column Values

Split GFF data into separate files based on column values.


### Split by feature type

In [14]:
run_gff2parquet([
    "split",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--column", "type",
    "--output-dir", str(OUTPUT_DIR / "split_by_type"),
    "-f", "parquet"
])



Running: gff2parquet split data/downloaded_gff/groupI_GCA_000859985.2.gff --column type --output-dir data/test_outputs/split_by_type -f parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Splitting into 12 files by 'type'
Wrote 26 rows to data/test_outputs/split_by_type/type_repeat_region.parquet
Wrote 17 rows to data/test_outputs/split_by_type/type_mRNA.parquet
Wrote 5 rows to data/test_outputs/split_by_type/type_inverted_repeat.parquet
Wrote 3 rows to data/test_outputs/split_by_type/type_stem_loop.parquet
Wrote 53 rows to data/test_outputs/split_by_type/type_polyA_signal_sequence.parquet
Wrote 2 rows to data/test_outputs/split_by_type/type_sequence_feature.parquet
Wrote 1 rows to data/test_outputs/split_by_type/type_region.parquet
Wrote 27 rows to data/test_outputs/split_by_type/type_exon.parquet
Wrote 19 rows to data/test_outputs/split_by_type/type_TATA_box.parquet
Wrote 82 rows to data/test_outputs/split_by_type/type_CDS.parquet
Wrote 2 rows to data/test_outputs/split_by_type/type_ncRNA.parquet
Wrote 79 rows to data/test_outputs/

CompletedProcess(args=['gff2parquet', 'split', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--column', 'type', '--output-dir', 'data/test_outputs/split_by_type', '-f', 'parquet'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nSplitting into 12 files by 'type'\nWrote 26 rows to data/test_outputs/split_by_type/type_repeat_region.parquet\nWrote 17 rows to data/test_outputs/split_by_type/type_mRNA.parquet\nWrote 5 rows to data/test_outputs/split_by_type/type_inverted_repeat.parquet\nWrote 3 rows to data/test_outputs/split_by_type/type_stem_loop.parquet\nWrote 53 rows to data/test_outputs/split_by_type/type_polyA_signal_sequence.parquet\nWrote 2 rows to data/test_outputs/split_by_type/type_sequence_feature.parquet\nWrote 1 rows to data/test_outputs/split_by_type/type_region.parquet\nWrote 27 rows to data/test_outputs/split_by_type/type_exon.parquet\nWrote 19 

### Split by sequence ID (chromosome/contig)

In [15]:
run_gff2parquet([
    "split",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--column", "seqid",
    "--output-dir", str(OUTPUT_DIR / "split_by_seqid"),
    "-f", "gff"
])


Running: gff2parquet split data/downloaded_gff/groupI_GCA_000859985.2.gff --column seqid --output-dir data/test_outputs/split_by_seqid -f gff


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Splitting into 1 files by 'seqid'
Wrote 316 rows to data/test_outputs/split_by_seqid/seqid_JN555585.1.gff3
Done!



CompletedProcess(args=['gff2parquet', 'split', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', '--column', 'seqid', '--output-dir', 'data/test_outputs/split_by_seqid', '-f', 'gff'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nSplitting into 1 files by 'seqid'\nWrote 316 rows to data/test_outputs/split_by_seqid/seqid_JN555585.1.gff3\nDone!\n")


## 6. Extract Command - Get Sequences from FASTA

### Extract CDS sequences as nucleotides


In [16]:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    str(FASTA_DIR / "groupI_GCA_000859985.2.fna"),
    "--type", "CDS",
    "-o", str(OUTPUT_DIR / "cds_sequences.fasta")
])

Running: gff2parquet extract data/downloaded_gff/groupI_GCA_000859985.2.gff data/downloaded_fasta/groupI_GCA_000859985.2.fna --type CDS -o data/test_outputs/cds_sequences.fasta


Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff
Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Found 1 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 304.49rows/s]
Extracting 82 features...
Extracted 82 sequences
Done!



CompletedProcess(args=['gff2parquet', 'extract', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', 'data/downloaded_fasta/groupI_GCA_000859985.2.fna', '--type', 'CDS', '-o', 'data/test_outputs/cds_sequences.fasta'], returncode=0, stdout='', stderr="Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff\nFound 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nFound 1 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 304.49rows/s]\nExtracting 82 features...\nExtracted 82 sequences\nDone!\n")

### Extract and translate CDS to proteins

In [None]:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    str(FASTA_DIR / "groupI_GCA_000859985.2.fna"),
    "--type", "CDS",
    "--outaa", "amino",
    "--genetic-code", "11",
    "-o", str(OUTPUT_DIR / "cds_proteins.fasta")
])

Running: gff2parquet extract data/downloaded_gff/groupI_GCA_000859985.2.gff data/downloaded_fasta/groupI_GCA_000859985.2.fna --type CDS --outaa amino --genetic-code 11 -o data/test_outputs/cds_proteins.fasta


Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff
Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Found 1 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 359.32rows/s]
Extracting 82 features...
Extracted 82 sequences
Done!



CompletedProcess(args=['gff2parquet', 'extract', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', 'data/downloaded_fasta/groupI_GCA_000859985.2.fna', '--type', 'CDS', '--outaa', 'amino', '--genetic-code', '11', '-o', 'data/test_outputs/cds_proteins.fasta'], returncode=0, stdout='', stderr="Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff\nFound 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nFound 1 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 359.32rows/s]\nExtracting 82 features...\nExtracted 82 sequences\nDone!\n")

### Extract long CDS and translate

In [None]:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    str(FASTA_DIR / "groupI_GCA_000859985.2.fna"),
    "--type", "CDS",
    "--min-length", "500",
    "--outaa", "amino",
    "-o", str(OUTPUT_DIR / "long_proteins.fasta")
])

Running: gff2parquet extract data/downloaded_gff/groupI_GCA_000859985.2.gff data/downloaded_fasta/groupI_GCA_000859985.2.fna --type CDS --min-length 500 --outaa amino -o data/test_outputs/long_proteins.fasta


Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff
Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Found 1 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 353.26rows/s]
Extracting 71 features...
Extracted 71 sequences
Done!



CompletedProcess(args=['gff2parquet', 'extract', 'data/downloaded_gff/groupI_GCA_000859985.2.gff', 'data/downloaded_fasta/groupI_GCA_000859985.2.fna', '--type', 'CDS', '--min-length', '500', '--outaa', 'amino', '-o', 'data/test_outputs/long_proteins.fasta'], returncode=0, stdout='', stderr="Loading GFF from: data/downloaded_gff/groupI_GCA_000859985.2.gff\nFound 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nFound 1 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 353.26rows/s]\nExtracting 71 features...\nExtracted 71 sequences\nDone!\n")

### Extract from multiple genomes


In [19]:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "groupI*.gff"),
    str(FASTA_DIR / "groupI_GCA_000859985.2.fna"),
    str(FASTA_DIR / "groupII_GCA_031099375.1.fna"),
    "--type", "CDS",
    "--outaa", "amino",
    "-f", "parquet",
    "-o", str(OUTPUT_DIR / "multi_genome_proteins.parquet")
])


Running: gff2parquet extract data/downloaded_gff/groupI*.gff data/downloaded_fasta/groupI_GCA_000859985.2.fna data/downloaded_fasta/groupII_GCA_031099375.1.fna --type CDS --outaa amino -f parquet -o data/test_outputs/multi_genome_proteins.parquet


Loading GFF from: data/downloaded_gff/groupI*.gff
Found 4 file(s) matching pattern 'data/downloaded_gff/groupI*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Found 2 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 332.04rows/s]
  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 759.29rows/s]
Extracting 101 features...
Extracted 88 sequences
Done!



CompletedProcess(args=['gff2parquet', 'extract', 'data/downloaded_gff/groupI*.gff', 'data/downloaded_fasta/groupI_GCA_000859985.2.fna', 'data/downloaded_fasta/groupII_GCA_031099375.1.fna', '--type', 'CDS', '--outaa', 'amino', '-f', 'parquet', '-o', 'data/test_outputs/multi_genome_proteins.parquet'], returncode=0, stdout='', stderr="Loading GFF from: data/downloaded_gff/groupI*.gff\nFound 4 file(s) matching pattern 'data/downloaded_gff/groupI*.gff'\nScanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff\nScanning: data/downloaded_gff/groupII_GCA_031099375.1.gff\nScanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nApplying filters...\nFound 2 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 332.04rows/s]\n  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 759.29rows/s]\nExtrac

### extract from multiple genomes using glob pattern

In [20]:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "groupI*.gff"),
    str(FASTA_DIR / "group*.fna"),  # Glob pattern
    "--type", "CDS",
    "-o", str(OUTPUT_DIR / "all_cds.fasta")
])

# Or multiple patterns:
run_gff2parquet([
    "extract",
    str(GFF_DIR / "*.gff"),
    str(FASTA_DIR / "groupI*.fna"),
    str(FASTA_DIR / "groupII*.fna"),
    "--type", "CDS",
    "--outaa", "amino",
    "-o", str(OUTPUT_DIR / "selected_proteins.fasta")
])

Running: gff2parquet extract data/downloaded_gff/groupI*.gff data/downloaded_fasta/group*.fna --type CDS -o data/test_outputs/all_cds.fasta


Loading GFF from: data/downloaded_gff/groupI*.gff
Found 4 file(s) matching pattern 'data/downloaded_gff/groupI*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Found 8 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupIII_GCA_000880735.1.fna

0rows [00:00, ?rows/s]
11rows [00:00, 3300.48rows/s]
  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 1108.72rows/s]
  Reading: data/downloaded_fasta/groupIV_GCA_031102545.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 1061.31rows/s]
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 753.83rows/s]
  Reading: data/downloaded_fasta/groupVII_GCA_031171435.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 695.80rows/s]
  Reading: d

Running: gff2parquet extract data/downloaded_gff/*.gff data/downloaded_fasta/groupI*.fna data/downloaded_fasta/groupII*.fna --type CDS --outaa amino -o data/test_outputs/selected_proteins.fasta


Loading GFF from: data/downloaded_gff/*.gff
Found 8 file(s) matching pattern 'data/downloaded_gff/*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Scanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff
Scanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff
Scanning: data/downloaded_gff/groupV_GCA_053294245.1.gff
Scanning: data/downloaded_gff/groupcirular_rna_GCA_050924405.1.gff
Applying filters...
Found 6 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupIII_GCA_000880735.1.fna

0rows [00:00, ?rows/s]
11rows [00:00, 3766.31rows/s]
  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 1053.05rows/s]
  Reading: data/downloaded_fasta/groupIV_GCA_031102545.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 1071.62rows/s]
  Reading

CompletedProcess(args=['gff2parquet', 'extract', 'data/downloaded_gff/*.gff', 'data/downloaded_fasta/groupI*.fna', 'data/downloaded_fasta/groupII*.fna', '--type', 'CDS', '--outaa', 'amino', '-o', 'data/test_outputs/selected_proteins.fasta'], returncode=0, stdout='', stderr="Loading GFF from: data/downloaded_gff/*.gff\nFound 8 file(s) matching pattern 'data/downloaded_gff/*.gff'\nScanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff\nScanning: data/downloaded_gff/groupII_GCA_031099375.1.gff\nScanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff\nScanning: data/downloaded_gff/groupI_GCA_000859985.2.gff\nScanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff\nScanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff\nScanning: data/downloaded_gff/groupV_GCA_053294245.1.gff\nScanning: data/downloaded_gff/groupcirular_rna_GCA_050924405.1.gff\nApplying filters...\nFound 6 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupIII_GCA_000880735.1.fna\n\n


## 7. Complex Workflows - Combining Commands

Demonstrate multi-step analysis workflows.


### Workflow 1: Merge → Filter → Extract

In [21]:
# Step 1: Merge all annotations
run_gff2parquet([
    "merge",
    str(GFF_DIR / "group[I-IV]*.gff"),
    "-o", str(OUTPUT_DIR / "workflow1_merged.parquet")
])

# Step 2: Filter for long CDS
run_gff2parquet([
    "filter",
    str(OUTPUT_DIR / "workflow1_merged.parquet"),
    "--type", "CDS",
    "--min-length", "600",
    "-o", str(OUTPUT_DIR / "workflow1_filtered.gff"),
    "-f", "gff"
])

# Step 3: Extract and translate
run_gff2parquet([
    "extract",
    str(OUTPUT_DIR / "workflow1_filtered.gff"),
    str(FASTA_DIR / "groupI_GCA_000859985.2.fna"),
    str(FASTA_DIR / "groupII_GCA_031099375.1.fna"),
    str(FASTA_DIR / "groupIII_GCA_000880735.1.fna"),
    str(FASTA_DIR / "groupIV_GCA_031102545.1.fna"),
    "--outaa", "amino",
    "-o", str(OUTPUT_DIR / "workflow1_proteins.fasta")
])


Running: gff2parquet merge data/downloaded_gff/group[I-IV]*.gff -o data/test_outputs/workflow1_merged.parquet


Merging 1 input pattern(s)...
Found 7 file(s) matching pattern 'data/downloaded_gff/group[I-IV]*.gff'
Scanning: data/downloaded_gff/groupIII_GCA_000880735.1.gff
Scanning: data/downloaded_gff/groupII_GCA_031099375.1.gff
Scanning: data/downloaded_gff/groupIV_GCA_031102545.1.gff
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Scanning: data/downloaded_gff/groupVII_GCA_031171435.1.gff
Scanning: data/downloaded_gff/groupVI_GCA_000864765.1.gff
Scanning: data/downloaded_gff/groupV_GCA_053294245.1.gff
Writing Parquet to data/test_outputs/workflow1_merged.parquet...
Done!



Running: gff2parquet filter data/test_outputs/workflow1_merged.parquet --type CDS --min-length 600 -o data/test_outputs/workflow1_filtered.gff -f gff


Found 1 file(s) matching pattern 'data/test_outputs/workflow1_merged.parquet'
Scanning: data/test_outputs/workflow1_merged.parquet
Applying filters...
Writing GFF3 to data/test_outputs/workflow1_filtered.gff...
Done!



Running: gff2parquet extract data/test_outputs/workflow1_filtered.gff data/downloaded_fasta/groupI_GCA_000859985.2.fna data/downloaded_fasta/groupII_GCA_031099375.1.fna data/downloaded_fasta/groupIII_GCA_000880735.1.fna data/downloaded_fasta/groupIV_GCA_031102545.1.fna --outaa amino -o data/test_outputs/workflow1_proteins.fasta


Loading GFF from: data/test_outputs/workflow1_filtered.gff
Found 1 file(s) matching pattern 'data/test_outputs/workflow1_filtered.gff'
Scanning: data/test_outputs/workflow1_filtered.gff
Found 4 FASTA file(s)
Loading FASTA sequences...
  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna

0rows [00:00, ?rows/s]
1rows [00:00, 300.28rows/s]
  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 991.56rows/s]
  Reading: data/downloaded_fasta/groupIII_GCA_000880735.1.fna

0rows [00:00, ?rows/s]
11rows [00:00, 10514.44rows/s]
  Reading: data/downloaded_fasta/groupIV_GCA_031102545.1.fna

0rows [00:00, ?rows/s]
1rows [00:00, 836.02rows/s]
Extracting 96 features...
Extracted 82 sequences
Done!



CompletedProcess(args=['gff2parquet', 'extract', 'data/test_outputs/workflow1_filtered.gff', 'data/downloaded_fasta/groupI_GCA_000859985.2.fna', 'data/downloaded_fasta/groupII_GCA_031099375.1.fna', 'data/downloaded_fasta/groupIII_GCA_000880735.1.fna', 'data/downloaded_fasta/groupIV_GCA_031102545.1.fna', '--outaa', 'amino', '-o', 'data/test_outputs/workflow1_proteins.fasta'], returncode=0, stdout='', stderr="Loading GFF from: data/test_outputs/workflow1_filtered.gff\nFound 1 file(s) matching pattern 'data/test_outputs/workflow1_filtered.gff'\nScanning: data/test_outputs/workflow1_filtered.gff\nFound 4 FASTA file(s)\nLoading FASTA sequences...\n  Reading: data/downloaded_fasta/groupI_GCA_000859985.2.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 300.28rows/s]\n  Reading: data/downloaded_fasta/groupII_GCA_031099375.1.fna\n\n0rows [00:00, ?rows/s]\n1rows [00:00, 991.56rows/s]\n  Reading: data/downloaded_fasta/groupIII_GCA_000880735.1.fna\n\n0rows [00:00, ?rows/s]\n11rows [00:00, 10514.44rows

### Workflow 2: Filter by type → Split by seqid

In [22]:
# Filter for genes
run_gff2parquet([
    "filter",
    str(GFF_DIR / "groupI_GCA_000859985.2.gff"),
    "--type", "gene",
    "-o", str(OUTPUT_DIR / "workflow2_genes.parquet")
])

# Split by seqid 
run_gff2parquet([
    "split",
    str(OUTPUT_DIR / "workflow2_genes.parquet"),
    "--column", "seqid",
    "--output-dir", str(OUTPUT_DIR / "workflow2_by_chromosome"),
    "-f", "gff"
])


Running: gff2parquet filter data/downloaded_gff/groupI_GCA_000859985.2.gff --type gene -o data/test_outputs/workflow2_genes.parquet


Found 1 file(s) matching pattern 'data/downloaded_gff/groupI_GCA_000859985.2.gff'
Scanning: data/downloaded_gff/groupI_GCA_000859985.2.gff
Applying filters...
Writing Parquet to data/test_outputs/workflow2_genes.parquet...
Done!



Running: gff2parquet split data/test_outputs/workflow2_genes.parquet --column seqid --output-dir data/test_outputs/workflow2_by_chromosome -f gff


Found 1 file(s) matching pattern 'data/test_outputs/workflow2_genes.parquet'
Scanning: data/test_outputs/workflow2_genes.parquet
Splitting into 1 files by 'seqid'
Wrote 79 rows to data/test_outputs/workflow2_by_chromosome/seqid_JN555585.1.gff3
Done!



CompletedProcess(args=['gff2parquet', 'split', 'data/test_outputs/workflow2_genes.parquet', '--column', 'seqid', '--output-dir', 'data/test_outputs/workflow2_by_chromosome', '-f', 'gff'], returncode=0, stdout='', stderr="Found 1 file(s) matching pattern 'data/test_outputs/workflow2_genes.parquet'\nScanning: data/test_outputs/workflow2_genes.parquet\nSplitting into 1 files by 'seqid'\nWrote 79 rows to data/test_outputs/workflow2_by_chromosome/seqid_JN555585.1.gff3\nDone!\n")

## 8. Verification - Check Outputs

Verify that output files were created successfully.


In [23]:
print("Output files created:")
for root, dirs, files in os.walk(OUTPUT_DIR):
    level = root.replace(str(OUTPUT_DIR), '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        size = os.path.getsize(os.path.join(root, file))
        print(f'{subindent}{file} ({size:,} bytes)')


Output files created:
test_outputs/
  all_cds.fasta (157,604 bytes)
  all_merged.parquet (17,572 bytes)
  cds_only.parquet (7,164 bytes)
  cds_proteins.fasta (44,014 bytes)
  cds_sequences.fasta (126,208 bytes)
  filtered_genes.parquet (5,206 bytes)
  groupI.parquet (12,870 bytes)
  groupIII_shifted.parquet (5,513 bytes)
  groupII_normalized.parquet (4,554 bytes)
  long_cds.csv (17,697 bytes)
  long_proteins.fasta (42,551 bytes)
  merged_normalized.csv (91,311 bytes)
  multi_genome_proteins.parquet (711,328 bytes)
  selected_proteins.fasta (63,492 bytes)
  workflow1_filtered.gff (18,341 bytes)
  workflow1_merged.parquet (17,320 bytes)
  workflow1_proteins.fasta (52,149 bytes)
  workflow2_genes.parquet (5,589 bytes)
  split_by_seqid/
    seqid_JN555585.1.gff3 (48,783 bytes)
  split_by_type/
    type_CDS.parquet (7,164 bytes)
    type_TATA_box.parquet (4,638 bytes)
    type_exon.parquet (4,907 bytes)
    type_gene.parquet (5,589 bytes)
    type_inverted_repeat.parquet (4,426 bytes)
    t

## Summary

This notebook demonstrated:
- **print**: Inspecting GFF data and statistics
- **convert**: Converting GFF to Parquet/CSV with normalization and coordinate shifting
- **merge**: Combining multiple GFF files
- **filter**: Extracting features by type, length, strand, etc.
- **split**: Separating data into multiple files by column values
- **extract**: Extracting and optionally translating sequences from FASTA
- Complex multi-step workflows combining multiple commands
