In [6]:
import pandas as pd
from Bio import SeqIO

In [7]:
# Load the PacBio protein-cluster table. `pb_accs` is a '|'-joined string of
# accessions, so it is split into a list (`accessions`) and the first element
# of that list is kept as the cluster's `base_acc`.
pacbio = pd.read_csv('../../../data/pacbio_protein_clusters.tsv', sep='\t').assign(
    accessions=lambda clusters: clusters['pb_accs'].str.split('|'),
    base_acc=lambda clusters: clusters['accessions'].map(lambda accs: accs[0]),
)

# Per-accession ORF table; later cells read its pb_acc, FL, CPM and gene columns.
orf = pd.read_csv('../../../data/orf-testset-fraction16.csv')

In [8]:
def get_total(accessions, orf_dict):
    """Add up the `orf_dict` values of every accession in `accessions`.

    Args:
        accessions: iterable of keys to look up in `orf_dict`.
        orf_dict: mapping from accession to a numeric value.

    Returns:
        The running total, starting from 0 (so an empty iterable gives 0).

    Raises:
        KeyError: if an accession is not a key of `orf_dict`.
    """
    running = 0
    looked_up = (orf_dict[key] for key in accessions)
    # Plain left-to-right accumulation, one lookup at a time.
    for amount in looked_up:
        running += amount
    return running


In [9]:
# Accession -> value lookups built from the ORF table, keyed by `pb_acc`.
orf_by_acc = orf.set_index('pb_acc')
fl_dict = orf_by_acc['FL'].to_dict()
cpm_dict = orf_by_acc['CPM'].to_dict()

In [10]:
# For each cluster, total the FL and CPM values of all of its member accessions.
cluster_members = pacbio['accessions']
pacbio['FL'] = cluster_members.apply(get_total, args=(fl_dict,))
pacbio['CPM'] = cluster_members.apply(get_total, args=(cpm_dict,))

In [11]:
# Read every FASTA record up front.
# - The `with` block closes the input handle (the bare `open(...)` was never closed).
# - A list can be iterated again, so re-running a cell that loops over `seqs`
#   no longer sees an already-exhausted parser and silently does nothing.
with open('../../../data/pacbio_protein_clusters.fasta') as fasta_handle:
    seqs = list(SeqIO.parse(fasta_handle, 'fasta'))

In [73]:
# Write one FASTA record per cluster with the header `>pb|<base_acc>|fullname GN=<gene>`.
#
# The lookups are built once instead of re-filtering both DataFrames for every
# record. `drop_duplicates` keeps the first row per key, which matches the
# first-match (`.iloc[0]`) choice this cell made previously.
base_acc_by_cluster = (
    pacbio.drop_duplicates('pb_accs').set_index('pb_accs')['base_acc'].to_dict()
)
gene_by_acc = orf.drop_duplicates('pb_acc').set_index('pb_acc')['gene'].to_dict()

# NOTE: if `seqs` is a one-shot iterator it is consumed by this loop; re-create
# it before re-running this cell, otherwise the output file is written empty.
with open("../../../data/pacbio_clusters.fasta", "w") as ofile:
    for entry in seqs:
        seq = str(entry.seq)
        pb_acc = entry.id  # FASTA id is the full '|'-joined cluster string
        if pb_acc not in base_acc_by_cluster:
            raise KeyError(f"FASTA record {pb_acc!r} has no row in pacbio['pb_accs']")
        base_acc = base_acc_by_cluster[pb_acc]
        if base_acc not in gene_by_acc:
            raise KeyError(f"base accession {base_acc!r} has no row in orf['pb_acc']")
        gene = gene_by_acc[base_acc]
        ofile.write(f">pb|{base_acc}|fullname GN={gene}\n{seq}\n")
        

In [76]:
# Keep only the columns that are exported, then write them as a tab-separated
# file without the index. Note that `pacbio` itself is rebound to the trimmed frame.
export_columns = ['pb_accs', 'base_acc', 'FL', 'CPM']
pacbio = pacbio[export_columns]
pacbio.to_csv(
    "../../../data/pacbio_clusters.tsv",
    sep='\t',
    index=False,
)

In [12]:
# Show the `pacbio` DataFrame as this cell's output.
# NOTE(review): the saved output still lists the protein_sequence and accessions
# columns, which the export cell drops - this cell appears to have been run
# before that one; confirm the order with Restart & Run All.
pacbio

Unnamed: 0,protein_sequence,pb_accs,accessions,base_acc,FL,CPM
0,MKELGIWEPLAVKLQTYKTAVETAVLLLRIDDIVSGHKKKGDDQSR...,PB.1091.1,[PB.1091.1],PB.1091.1,2,0.669420
1,MMLNIINSSITTKAISRWSSLACNIALDAVKMVQFEENGRKEIDIK...,PB.1091.10|PB.1091.19,"[PB.1091.10, PB.1091.19]",PB.1091.10,34,11.380147
2,MMKMLLDPMGGIVMTNDGNAILREIQVQHPAAKSMIEISRTQDEEV...,PB.1091.2|PB.1091.9|PB.1091.11|PB.1091.12|PB.1...,"[PB.1091.2, PB.1091.9, PB.1091.11, PB.1091.12,...",PB.1091.2,51,17.070221
3,MMGHRPVLVLSQNTKRESGRKVQSGNINAAKTIADIIRTCLGPKSM...,PB.1091.13,[PB.1091.13],PB.1091.13,24,8.033045
4,MMGHRPVLVLSQNTKRESGRKVQSGNINAAKTIADIIRTCLGPKSM...,PB.1091.14,[PB.1091.14],PB.1091.14,4,1.338841
...,...,...,...,...,...,...
349,MVYMFQYDSTHGKFHGTVKAENGKLVINGNPITIFQERDPSKIKWG...,PB.9726.27|PB.9726.28,"[PB.9726.27, PB.9726.28]",PB.9726.27,117,39.161096
350,MEKAGAHLQGGAKRVIISAPSADAPMFVMGVNHEKYDNSLKIISNA...,PB.9726.29|PB.9726.31,"[PB.9726.29, PB.9726.31]",PB.9726.29,81,27.111528
351,MEEMCFGEATRMVWLPWVYGNLVSLNMVLSPSPPHPHRRDPSKIKW...,PB.9726.30,[PB.9726.30],PB.9726.30,2,0.669420
352,MFVMGVNHEKYDNSLKIISNASCTTNCLAPLAKVIHDNFGIVEGLM...,PB.9726.32,[PB.9726.32],PB.9726.32,35,11.714858


In [3]:
from gtfparse import read_gtf

In [4]:
# Parse the GTF file into a table with gtfparse.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# '../../../data' paths used elsewhere in this notebook - consider making it configurable.
gtf_path = '/mnt/shared/ubuntu/session_data/data/input/jurkat_corrected.gtf'
sg = read_gtf(gtf_path)

INFO:root:Extracted GTF attributes: ['transcript_id', 'gene_id']


In [5]:
# Show the parsed GTF table as this cell's output.
sg

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,transcript_id,gene_id
0,chr1,hg38_canon,transcript,14361,29347,,-,0,PB.1.1,PB.1.1
1,chr1,hg38_canon,exon,14361,14829,,-,0,PB.1.1,PB.1.1
2,chr1,hg38_canon,exon,14970,15038,,-,0,PB.1.1,PB.1.1
3,chr1,hg38_canon,exon,15796,16765,,-,0,PB.1.1,PB.1.1
4,chr1,hg38_canon,exon,16858,17055,,-,0,PB.1.1,PB.1.1
...,...,...,...,...,...,...,...,...,...,...
1606530,chr4,hg38_canon,exon,49305756,49308982,,+,0,PB.16704.1,PB.16704.1
1606531,chr4,hg38_canon,transcript,49510164,49512886,,-,0,PB.16701.1,PB.16701.1
1606532,chr4,hg38_canon,exon,49510164,49512886,,-,0,PB.16701.1,PB.16701.1
1606533,chr4,hg38_canon,transcript,49510188,49511286,,+,0,PB.16703.1_dup2,PB.16703.1_dup2
