- Purpose:
    - create a transcript id to gene id two-column map with no headers that can be used for salmon
    - create a table with transcript id, gene id, and gene name, useful for DEG and other downstream analyses

In [None]:
#####################
# import statements #
#####################
import pysam
import pandas as pd
from tqdm import tqdm
import os
import gzip
from Bio import SeqIO

In [None]:
##########################
# User-Defined Variables #
##########################
# - define all variables below with paths to the required files
# - this should be the only cell that requires modification
# - both files should be compressed with bgzip and indexed
#
# gtfgz_path:         path to the gtf annotation; it is opened with pysam.TabixFile, and the
#                     directory containing it is also where the output tsv files are written
# transcript_fa_path: path to the transcripts fasta; it is read with gzip.open and Bio.SeqIO

gtfgz_path = ''
transcript_fa_path = ''

In [None]:
# outputs are written next to the gtf, so take the output directory from its path
out_dir = os.path.dirname(gtfgz_path)

# open the gtf through its tabix index
gtfgz = pysam.TabixFile(gtfgz_path)

# preview: print the first 20 entries, stopping early if the file is shorter
for count, gtf_line in enumerate(gtfgz.fetch(), start=1):
    print(gtf_line)
    if count >= 20:
        break

# count every entry once so the progress bars in later cells know their total
gtf_feature_count = sum(1 for _ in gtfgz.fetch())

In [None]:
# from the gtf, compile matched lists of gene ids, gene names, and gene types for all gene features

def _gtf_attribute(attributes, key, default=None):
    """Return the quoted value of `key` from a gtf attribute string.

    Args:
        attributes: the attribute column (9th tab-separated field) of a gtf line.
        key: attribute name to look up, e.g. 'gene_id'.
        default: value returned when the attribute is absent; when left as None a
            missing attribute is treated as an error.

    Returns:
        The text between the quotes that follow `key`, stripped of surrounding whitespace.

    Raises:
        ValueError: if the attribute is absent and no default was given.
    """
    # search for `key "` so that the presence test and the extraction use the same token
    _, marker, remainder = attributes.partition(f'{key} "')
    if not marker:
        if default is None:
            raise ValueError(f'gtf attribute {key!r} not found in: {attributes}')
        return default
    return remainder.partition('"')[0].strip()


gene_list = []
gene_name_list = []
gene_type_list = []

for gtf_line in tqdm(gtfgz.fetch(),total=gtf_feature_count):
    gtf_rec = gtf_line.strip().split('\t')
    if gtf_rec[2] != 'gene':
        continue

    attributes = gtf_rec[8]
    # a gene without an id cannot be mapped, so a missing gene_id fails loudly
    gene_list.append(_gtf_attribute(attributes, 'gene_id'))
    # gene_name and gene_type are optional: fall back to '' so the three lists stay aligned
    gene_name_list.append(_gtf_attribute(attributes, 'gene_name', default=''))
    gene_type_list.append(_gtf_attribute(attributes, 'gene_type', default=''))

# compile lists into a dataframe
gene_map_df = pd.DataFrame({
    'gene_id':gene_list,
    'gene_name':gene_name_list,
    'gene_type':gene_type_list,
})
display(gene_map_df)

In [None]:
# from the gtf, compile matched lists of transcript ids, gene ids, and gene names for all transcript features

def _gtf_attribute(attributes, key, default=None):
    """Return the quoted value of `key` from a gtf attribute string.

    Defined in this cell so the cell can be run on its own.

    Args:
        attributes: the attribute column (9th tab-separated field) of a gtf line.
        key: attribute name to look up, e.g. 'transcript_id'.
        default: value returned when the attribute is absent; when left as None a
            missing attribute is treated as an error.

    Returns:
        The text between the quotes that follow `key`, stripped of surrounding whitespace.

    Raises:
        ValueError: if the attribute is absent and no default was given.
    """
    # search for `key "` so that the presence test and the extraction use the same token
    _, marker, remainder = attributes.partition(f'{key} "')
    if not marker:
        if default is None:
            raise ValueError(f'gtf attribute {key!r} not found in: {attributes}')
        return default
    return remainder.partition('"')[0].strip()


transcript_list = []
gene_list = []
gene_name_list = []

for gtf_line in tqdm(gtfgz.fetch(),total=gtf_feature_count):
    gtf_rec = gtf_line.strip().split('\t')
    if gtf_rec[2] != 'transcript':
        continue

    attributes = gtf_rec[8]
    # both ids are required for the salmon map, so a missing one fails loudly
    transcript_list.append(_gtf_attribute(attributes, 'transcript_id'))
    gene_list.append(_gtf_attribute(attributes, 'gene_id'))
    # gene_name only feeds the annotated table; fall back to '' so a transcript without
    # one does not abort the cell before the salmon map is written
    gene_name_list.append(_gtf_attribute(attributes, 'gene_name', default=''))

# compile the transcript ids and gene ids into map_df and write an output tsv of two columns with no headers that maps transcript_id to gene_id
# used as a salmon input
map_df = pd.DataFrame({
    'transcript_id':transcript_list,
    'gene_id':gene_list,
})
map_tsv_path = os.path.join(out_dir, 'transcript-gene-map.tsv')
map_df.to_csv(map_tsv_path, index=False, header=False, sep='\t')

# add gene names as a third column to the df (after writing, so the salmon map stays two columns)
map_df['gene_name'] = gene_name_list
display(map_df)

In [None]:
# from the transcripts fasta, collect one (transcript id, gene id, gene name) row per record
# each record id is split on '|' and fields 0, 1, and 5 are kept
transcript_contigs_rows = []

with gzip.open(transcript_fa_path, 'rt') as fasta:
    for fa_rec in SeqIO.parse(fasta, 'fasta'):
        header_fields = fa_rec.id.strip().split('|')
        transcript_contigs_rows.append((
            header_fields[0].strip(),
            header_fields[1].strip(),
            header_fields[5].strip(),
        ))

# compile the rows into a dataframe
transcript_contigs_df = pd.DataFrame(
    transcript_contigs_rows,
    columns=['transcript_id', 'gene_id', 'gene_name'],
)
display(transcript_contigs_df)

In [None]:
# verify that gtf and fasta transcripts match, ideally in the output all three counts will be the same and the unique entries dataframes will be empty

# count the transcripts present in both sources; this is computed outside the f-string because
# reusing the enclosing quote character inside an f-string expression is a syntax error before
# python 3.12
shared_transcript_count = len(
    pd.merge(left=map_df, right=transcript_contigs_df, on=['transcript_id'], how='inner')
)

print(f'Transcripts from gtf:          \t{len(map_df)}')
print(f'Transcripts from transcript fa:\t{len(transcript_contigs_df)}')
print(f'Transcripts in both:           \t{shared_transcript_count}')

# fasta records whose transcript id never appears in the gtf
print('Entries unique to transcript fa:\n')
unique_to_fa_df = transcript_contigs_df[~(transcript_contigs_df['transcript_id'].isin(map_df['transcript_id']))]
display(unique_to_fa_df)

# gtf transcripts whose transcript id never appears in the fasta
print('Entries unique to gtf:\n')
unique_to_gtf_df = map_df[~(map_df['transcript_id'].isin(transcript_contigs_df['transcript_id']))]
display(unique_to_gtf_df)

In [None]:
# write the current contents of map_df, with a header row, as a tab-separated table next to the gtf
map_tsv_path = os.path.join(out_dir, 'transcript-gene-map-w-names.tsv')
map_df.to_csv(path_or_buf=map_tsv_path, sep='\t', index=False)