In [1]:
import os
# Set before any other library is imported in this kernel: a value of "-1"
# for CUDA_VISIBLE_DEVICES leaves no GPU visible to CUDA-based libraries.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide seaborn theme: 'talk' context, 'ticks' style, short colour
# codes enabled, and legends drawn without a frame.
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [3]:
# Root of the CAFA3 data. Defaults to the shared project location; set the
# CAFA3_DIR environment variable to run the notebook against another copy.
cafa3_dir = os.environ.get('CAFA3_DIR', '/gpfs/alpine/bie108/proj-shared/cafa3/')

In [4]:
from itertools import groupby

def fasta_iter(fasta_dir=None):
    """
    Yield (header, sequence) tuples for every record of every file in a FASTA directory.

    Parameters
    ----------
    fasta_dir : str, optional
        Directory whose entries are each opened and parsed as a FASTA file.
        When omitted, the 'Target files' folder under the notebook-level
        `cafa3_dir` is used, which is what the zero-argument call reads.

    Yields
    ------
    tuple of (str, str)
        The first header line of a record without its leading '>' and
        surrounding whitespace, and the record's sequence lines stripped and
        concatenated. A header that ends the file with no sequence lines
        yields an empty sequence.
    """
    if fasta_dir is None:
        fasta_dir = os.path.join(cafa3_dir, 'Target files')
    # os.listdir returns entries in arbitrary order; sort so the record order
    # (and any table built from it) is the same on every run.
    for fasta in sorted(os.listdir(fasta_dir)):
        with open(os.path.join(fasta_dir, fasta), 'r') as fh:
            # Consecutive lines are grouped into alternating runs of header
            # lines (starting with '>') and non-header lines.
            runs = groupby(fh, lambda line: line[0] == ">")
            for is_header, header_lines in runs:
                if not is_header:
                    # Lines before the first header belong to no record.
                    continue
                headerStr = next(header_lines)[1:].strip()
                # The run after a header run is its sequence. A default is
                # supplied because a StopIteration escaping a generator is
                # turned into RuntimeError when the file ends on a header.
                _, seq_lines = next(runs, (False, ()))
                seq = "".join(s.strip() for s in seq_lines)
                yield headerStr, seq
            
# Collect every parsed (header, sequence) record into a two-column table and
# show its dimensions.
fasta_records = fasta_iter()
seq_df = pd.DataFrame(fasta_records, columns=['accession', 'sequence'])
seq_df.shape

(130827, 2)

In [5]:
# Split each header at its last space (the first group is greedy) into the
# CAFA3ID and GeneID columns, then discard the raw header column.
# NOTE: this rebinds seq_df without its 'accession' column, so the cell can
# only run once after the cell that builds seq_df.
header_parts = seq_df['accession'].str.extract(r'(?P<CAFA3ID>.+)\ (?P<GeneID>.*)')
# `columns=` replaces the positional axis argument, which pandas deprecated
# in 1.3 and no longer accepts from 2.0 on.
seq_df = seq_df.join(header_parts).drop(columns='accession')

In [7]:
# Read the three leaf-only ground-truth files (tab-separated, no header row)
# and stack them into one table with named ID / term columns.
groundtruth_dir = os.path.join(cafa3_dir, 'target_groundtruth')
target_go_bp, target_go_cc, target_go_mf = (
    pd.read_csv(os.path.join(groundtruth_dir, fname), sep='\t', header=None)
    for fname in ('leafonly_BPO.txt', 'leafonly_CCO.txt', 'leafonly_MFO.txt')
)

stacked_labels = pd.concat([target_go_bp, target_go_cc, target_go_mf])
target_go_labels = stacked_labels.rename(columns={0: 'CAFA3ID', 1: 'GO TERM'})

In [8]:
# Number of distinct CAFA3 IDs in the label table (missing values, if any,
# count as one distinct value, as they do with len(unique())).
target_go_labels['CAFA3ID'].nunique(dropna=False)

3328

In [17]:
# Preview only the first rows instead of rendering the whole label table.
target_go_labels.head(10)

Unnamed: 0,CAFA3ID,GO TERM
0,T100900005305,GO:0033234
1,T100900010085,GO:0006468
2,T100900010085,GO:0046777
3,T100900003996,GO:0001782
4,T100900003996,GO:0002315
5,T100900003996,GO:0043547
6,T100900003996,GO:0051491
7,T100900008741,GO:1990830
8,T100900008741,GO:0045944
9,T100900002736,GO:0070293


In [12]:
# Keep only the sequences whose CAFA3ID appears in the label table.
has_label = seq_df['CAFA3ID'].isin(target_go_labels['CAFA3ID'])
seq_df_subset = seq_df.loc[has_label]

In [13]:
# Write the labelled subset to a gzip-compressed CSV in the working
# directory, leaving out the index column.
seq_df_subset.to_csv(
    'cafa3_annotated_targets.csv.gz',
    compression='gzip',
    index=False,
)

In [14]:
# Root of the SwissProt data. Defaults to the shared project location; set
# the SWISSPROT_DIR environment variable to point at another copy.
swissprot_dir = os.environ.get('SWISSPROT_DIR', '/gpfs/alpine/bie108/proj-shared/swissprot/')
# Load the parsed SwissProt parquet table and preview its first rows.
swissprot_path = os.path.join(swissprot_dir, 'parsed_swissprot_uniref_clusters.parquet')
swissprot = pd.read_parquet(swissprot_path)
swissprot.head()

Unnamed: 0,UniRef100 ID,UniRef90 ID,UniRef50 ID,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,UniRef100_Q9Q8J2,UniRef90_Q9Q8J2,UniRef50_P16712,Q9Q8J2,AF170726,NP_051822.1,vg:932054,IPR027417,PF04851,31530,478,MSVCSEIDYALYTELKKFLNSQPLFLFNADKNFVEVVPSSSFKFYI...,Virion
1,UniRef100_P14197,UniRef90_P14197,UniRef50_P14197,P14197,X16524,XP_643326.1,ddi:DDB_G0276031,IPR036322,PF00400,44689,478,MGSRLNPSSNMYIPMNGPRGGYYGMPSMGQLQHPLFNYQFPPGGFQ...,
2,UniRef100_A6VUT8,UniRef90_A6VUT8,UniRef50_Q65UI5,A6VUT8,CP000749,WP_012069002.1,mmw:Mmwyl1_1288,IPR011763,PF03255,400668,315,MNLDYLPFEQPIAELEQKIEELRLVGNDNELNISDEISRLEDKKIA...,Cytoplasm
3,UniRef100_A4QKB4,UniRef90_P56765,UniRef50_P56765,A4QKB4,AP009370,YP_001123295.1,,IPR011762,PF01039,50458,487,MEKSWFNLMFSKGELEYRGELSKAMDSFAPSEKTTISQDRFIYDMD...,Plastid
4,UniRef100_Q9SQR4,UniRef90_Q9SQR4,UniRef50_Q9SQR4,Q9SQR4,CP002686,NP_187048.1,ath:AT3G03980,IPR002347,,3702,270,MSTHSSISQPPLPLAGRVAIVTGSSRGIGRAIAIHLAELGARIVIN...,Plastid


In [15]:
# Fraction of the labelled target sequences that occur verbatim in the
# SwissProt table.
in_swissprot = seq_df_subset['sequence'].isin(swissprot['sequence'])
in_swissprot.sum() / len(seq_df_subset)

0.9444110576923077

In [16]:
# Show the first labelled target sequence that has no exact match in the
# SwissProt table.
not_in_swissprot = ~seq_df_subset['sequence'].isin(swissprot['sequence'])
seq_df_subset.loc[not_in_swissprot, 'sequence'].iloc[0]

'MESNKDEAERCISIALKAIQSNQPDRALRFLEKAQRLYPTPRVRALIESLNQKPQTAGDQPPPTDTTHATHRKAGGTDAPSANGEAGGESTKGYTAEQVAAVKRVKQCKDYYEILGVSRGASDEDLKKAYRRLALKFHPDKNHAPGATEAFKAIGTAYAVLSNPEKRKQYDQFGDDKSQAARHGHGHGDFHRGFEADISPEDLFNMFFGGGFPSSNVHVYSNGRMRYTYQQRQDRRDNQGDGGLGVFVQLMPILILILVSALSQLMVSSPPYSLSPRPSVGHIHRRVTDHLGVVYYVGDTFSEEYTGSSLKTVERNVEDDYIANLRNNCWKEKQQSEGLLYRARYFGDTDMYHRAQKMGTPSCSRLSEVQASLHG'