In [None]:
import pyranges as pr
import pkg_resources


In [2]:
# Load the gene annotation (GTF) into a PyRanges object.
gtf_path = 'Drosophila_melanogaster.BDGP6.46.113.gtf.gz'
dm6 = pr.read_gtf(gtf_path)

In [3]:
# Restrict the annotation to the canonical chromosomes
chromosomes = ['2L', '2R', '3L', '3R', '4', 'X', 'Y']
dm6 = dm6[dm6.Chromosome.isin(chromosomes)]

# Biotype labels for the two classes used for classification:
# positive (`y=True`) and negative (`y=False`)
pos_biotype = 'ncRNA'
neg_biotype = 'protein_coding'

# Boolean row masks (pandas-style selectors) splitting the records by gene biotype
ncrna_genes = dm6.gene_biotype == pos_biotype
pc_genes = dm6.gene_biotype == neg_biotype

# One PyRanges per class
ncr = dm6[ncrna_genes]
pcg = dm6[pc_genes]

# Everything below builds boolean masks over `ncr` / `pcg`; the masks are applied later.

# lncRNA: transcript records of the positive biotype whose span (End - Start)
# is strictly greater than 200 bp.
# NOTE(review): End - Start is the genomic span of the transcript record; it presumably
# includes introns, so it is not the spliced transcript length - confirm this is intended.
lncrna_transcripts = (
    (ncr.Feature == 'transcript')
    & (ncr.transcript_biotype == pos_biotype)
    & (ncr.End - ncr.Start > 200)
)
# Exon records whose transcript_id belongs to one of the selected lncRNA transcripts
lncrna_transcript_ids = ncr[lncrna_transcripts].transcript_id
lncrna_exons = (ncr.Feature == 'exon') & ncr.transcript_id.isin(lncrna_transcript_ids)

# Protein coding: transcript records of the negative biotype whose span is
# strictly greater than 10 bp.
pcg_transcripts = (
    (pcg.Feature == 'transcript')
    & (pcg.transcript_biotype == neg_biotype)
    & (pcg.End - pcg.Start > 10)
)
# Exon records whose transcript_id belongs to one of the selected protein coding transcripts
pcg_transcript_ids = pcg[pcg_transcripts].transcript_id
pcg_exons = (pcg.Feature == 'exon') & pcg.transcript_id.isin(pcg_transcript_ids)

In [4]:
def _drop_repeated_rows(ranges, key_columns):
    """Return `ranges` without the rows that repeat an earlier row on `key_columns`.

    The first occurrence of each key combination is kept (pandas `duplicated` default).
    """
    is_repeat = ranges.df.duplicated(subset=key_columns)
    return ranges[~is_repeat]


# Transcripts are compared on their coordinates only; exons additionally on the
# transcript they belong to, so identical exons of different transcripts are all kept.
transcript_key = ['Chromosome', 'Start', 'End']
exon_key = ['Chromosome', 'Start', 'End', 'transcript_id']

# NOTE(review): `pcg_transcripts` and `pcg_exons` enter this cell as boolean masks and
# leave it as filtered ranges, so this cell cannot be re-run on its own - the cell that
# builds the masks has to be re-run first.

# --- transcripts: apply the masks, then deduplicate, counting rows before and after ---
ncr_transcripts = ncr[lncrna_transcripts]
init_ncr_count = len(ncr_transcripts)
ncr_transcripts = _drop_repeated_rows(ncr_transcripts, transcript_key)
dedup_ncr_count = len(ncr_transcripts)

pcg_transcripts = pcg[pcg_transcripts]
init_pcg_count = len(pcg_transcripts)
pcg_transcripts = _drop_repeated_rows(pcg_transcripts, transcript_key)
dedup_pcg_count = len(pcg_transcripts)

# --- exons: same procedure; their counts are added on top of the transcript counts ---
# NOTE(review): the exon masks were built from the transcript selection before
# deduplication, so exons of transcripts dropped above are still included here.
ncr_exons = ncr[lncrna_exons]
init_ncr_count += len(ncr_exons)
ncr_exons = _drop_repeated_rows(ncr_exons, exon_key)
dedup_ncr_count += len(ncr_exons)

pcg_exons = pcg[pcg_exons]
init_pcg_count += len(pcg_exons)
pcg_exons = _drop_repeated_rows(pcg_exons, exon_key)
dedup_pcg_count += len(pcg_exons)

# Combine transcripts and exons of each class into a single PyRanges
ncr_base = pr.concat([ncr_transcripts, ncr_exons])
pcg_base = pr.concat([pcg_transcripts, pcg_exons])

# Disabled alternative: deduplicate the combined data on chromosome, start, and end positions.
# When duplicated (transcript = exon) keep exons `keep='last'` as exons appear after transcripts in the GTF
# ncr_base = ncr_base[~(ncr_base.df.duplicated(subset=['Chromosome', 'Start', 'End'], keep="last"))]
# pcg_base = pcg_base[~(pcg_base.df.duplicated(subset=['Chromosome', 'Start', 'End'], keep="last"))]

In [5]:
# Output the difference in counts before and after deduplication.
# The counters hold transcripts AND exons summed together, so the "transcripts"
# labels below report the combined totals.
ncr_removed = init_ncr_count - dedup_ncr_count
pcg_removed = init_pcg_count - dedup_pcg_count

# Report 0% instead of raising ZeroDivisionError when a selection is empty
# (e.g. no record matched the biotype filters).
ncr_removed_pct = (ncr_removed / init_ncr_count) * 100 if init_ncr_count else 0.0
pcg_removed_pct = (pcg_removed / init_pcg_count) * 100 if init_pcg_count else 0.0

print(f"lncRNA transcripts: initial count = {init_ncr_count}, deduplicated count = {dedup_ncr_count}")
print(f"protein coding transcripts: initial count = {init_pcg_count}, deduplicated count = {dedup_pcg_count}")
print(f"Total removed lncRNA entries: {ncr_removed} | Percentage: {ncr_removed_pct:.2f}%")
print(f"Total removed protein coding entries: {pcg_removed} | Percentage: {pcg_removed_pct:.2f}%")

lncRNA transcripts: initial count = 7901, deduplicated count = 7708
protein coding transcripts: initial count = 214306, deduplicated count = 206955
Total removed lncRNA entries: 193 | Percentage: 2.44%
Total removed protein coding entries: 7351 | Percentage: 3.43%


In [6]:
# (rows, columns) of the combined lncRNA and protein coding tables, in that order
tuple(base.df.shape for base in (ncr_base, pcg_base))

((7708, 20), (206955, 20))

In [7]:
# (rows, columns) of the transcript records only, lncRNA first and protein coding second
tuple(base.df[base.df.Feature == 'transcript'].shape for base in (ncr_base, pcg_base))

((2802, 20), (23419, 20))

In [None]:
# Write each filtered class to its own GTF file (lncRNA first, then protein coding)
for base_ranges, gtf_out in ((ncr_base, 'ncr_base.gtf'), (pcg_base, 'pcg_base.gtf')):
    base_ranges.to_gtf(gtf_out)

In [9]:
# Save BED files
# `ncr_bed` / `pcg_bed` are the very same objects as `ncr_base` / `pcg_base`, not copies.
# (They used to be obtained from `to_bed(..., chain=True)`, which writes the file and hands
# back the object it was called on; that extra write was overwritten below, so it is dropped.)
ncr_bed = ncr_base
pcg_bed = pcg_base

# Use the transcript_id as the BED Name field.
# Because of the aliasing above, this also adds a `Name` column to `ncr_base` / `pcg_base`,
# which therefore shows up in the parquet files written at the end of this cell.
# NOTE(review): if the base ranges should stay untouched, work on a copy of them here instead.
ncr_bed.Name = ncr_base.transcript_id
pcg_bed.Name = pcg_base.transcript_id

# Each BED file is written exactly once, after Name has been set
ncr_bed.to_bed('ncr_base.bed', keep=False)
pcg_bed.to_bed('pcg_base.bed', keep=False)

# Save parquet files
ncr_base.df.to_parquet('ncr_base.parquet', index=False)
pcg_base.df.to_parquet('pcg_base.parquet', index=False)