- Purpose:
    - this work uses the comprehensive primary assembly GENCODEv48 gtf
    - the GENCODEv48 transcripts fasta file contains transcripts not in the primary assembly
    - this can create agreement issues particularly with salmon read counts
    - this script writes a new fasta file which is filtered to match the gtf

In [None]:
#####################
# import statements #
#####################
from tqdm import tqdm
from Bio import SeqIO
import gzip
import pysam
import pandas as pd
import os

In [None]:
##########################
# User-Defined Variables #
##########################
# - define all variables below with paths to the required files
# - this should be the only cell that requires modification
# - the fastagz file should be the transcripts fastagz
#     - it is read with gzip in later cells, and its directory is reused as the output directory
#     - the output name is derived by splitting the file name on '.fa.gz'
# - the gtfgz file is opened with pysam.TabixFile in a later cell
# - both files should be compressed with bgzip and indexed

fastagz_path = ''
gtfgz_path = ''

In [None]:
# define output directory and transcript fasta name based on user-defined variables
# - out_dir is the directory that contains the input transcript fasta
# - fastagz_name is the fasta file name with everything from '.fa.gz' onward removed
out_dir = os.path.split(fastagz_path)[0]
fastagz_name = os.path.split(fastagz_path)[1].split('.fa.gz')[0]

# build a dataframe of all transcript ids in the reference gtf
# - the tabix handle is opened in a with-block so it is closed as soon as parsing finishes
# - only rows whose feature type (third column) is 'transcript' are used
# - the id is the quoted value following 'transcript_id "' in the attribute column (ninth column)
gtfgz_transcript_list = []
with pysam.TabixFile(gtfgz_path) as gtfz:
    for gtfgz_line in gtfz.fetch():
        gtfgz_rec = gtfgz_line.strip().split('\t')
        feat_type = gtfgz_rec[2].strip()
        if feat_type != 'transcript':
            continue
        gtfgz_attributes = gtfgz_rec[8].strip()
        gtfgz_transcript_id = gtfgz_attributes.split('transcript_id "')[1].strip().split('"')[0].strip()
        gtfgz_transcript_list.append(gtfgz_transcript_id)

gtfgz_df = pd.DataFrame({
    'transcript_id': gtfgz_transcript_list,
})

# verify that transcript id df seems correct
display(gtfgz_df)

In [None]:
# collect the transcript id of every record in the transcript fasta into a dataframe
# - the id is the text before the first '|' in the record id, with surrounding whitespace removed
with gzip.open(fastagz_path, 'rt') as fastagz:
    fatagz_transcript_list = [
        fa_rec.id.strip().split('|')[0].strip()
        for fa_rec in SeqIO.parse(fastagz, 'fasta')
    ]

fastagz_df = pd.DataFrame({'transcript_id': fatagz_transcript_list})

# inspect the fasta transcript ids
# - a row count that differs from the gtf dataframe means the two files hold different transcript sets
display(fastagz_df)

In [None]:
# inner-join the gtf and fasta transcript ids so that only ids present in both remain
shared_df = gtfgz_df.merge(fastagz_df, how='inner', on='transcript_id')

# inspect the shared transcript ids
# - when ids are unique and one input's ids are a subset of the other's,
#   this dataframe has as many rows as the shorter of the two input dfs
display(shared_df)

In [None]:
# iterate through the fasta file and create a list of all fasta records that match the shared df
# - the shared ids are placed in a set once so each record needs only a single membership test
# - the record id is reduced to the text before the first '|' exactly as when the fasta df was built
fa_rec_list = []
shared_transcript_set = set(shared_df['transcript_id'].values)
with gzip.open(fastagz_path, 'rt') as fastagz:
    for fa_rec in tqdm(SeqIO.parse(fastagz, 'fasta')):
        transcript_id = fa_rec.id.strip().split('|')[0].strip()
        if transcript_id in shared_transcript_set:
            fa_rec_list.append(fa_rec)

# the length of the record list should match the shared df
# (assuming transcript ids are not repeated within either input file)
print(f'fasta records for filtered file: {len(fa_rec_list)}')

In [None]:
# write the list of shared records to a new fasta file
# all records in the new fasta file will have corresponding annotation in the gtf
# - the uncompressed fasta is a temporary file; it is compressed, indexed, and then removed
fasta_path = os.path.join(out_dir, f'{fastagz_name}.gtf-matched.fa')

# swap the trailing '.fa' extension for '.gz'
# - splitext only touches the final extension, so a '.fa' occurring earlier in the
#   directory or file name can no longer truncate the output path
# - this also avoids reusing the same quote character inside the f-string,
#   which is a syntax error before python 3.12
# NOTE(review): the compressed output is named '<name>.gtf-matched.gz' (no '.fa');
#               confirm whether '<name>.gtf-matched.fa.gz' was intended
out_fastagz_path = f'{os.path.splitext(fasta_path)[0]}.gz'

SeqIO.write(fa_rec_list, fasta_path, 'fasta')
pysam.tabix_compress(filename_in=fasta_path, filename_out=out_fastagz_path)
pysam.faidx(out_fastagz_path)
os.remove(path=fasta_path)