In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: seaborn 'talk' context with the 'ticks' style, and
# legends drawn without a surrounding frame.
sns.set(
    context='talk',
    style='ticks',
    color_codes=True,
    rc={'legend.frameon': False},
)

%matplotlib inline

In [2]:
# Load the parsed Swiss-Prot table (one row per protein accession).
data = pd.read_parquet(
    path='/projects/bpms/pstjohn/swissprot/parsed_swissprot.parquet'
)

In [3]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,Q6GZX4,AY548484,YP_031579.1,vg:2947773,IPR007031,PF04947,654924,256,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,
1,Q6GZX3,AY548484,YP_031580.1,vg:2947774,IPR004251,PF03003,654924,320,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,Host membrane
2,Q197F8,DQ643392,YP_654574.1,vg:4156251,,,345201,458,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,
3,Q197F7,DQ643392,YP_654575.1,vg:4156252,,,345201,156,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,
4,Q6GZX2,AY548484,YP_031581.1,vg:2947775,,,654924,438,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,


In [4]:
# Keep only proteins that carry a subcellular-location annotation, and only
# the two columns needed downstream.
# Missing values (NaN/None) are rejected explicitly as well as empty strings:
# `NaN != ''` evaluates to True, so a bare inequality test would let missing
# entries through and the later newline splitting would break on them.
subcell = data.loc[
    data['subcellularLocalization'].notna() & (data['subcellularLocalization'] != ''),
    ['accession', 'subcellularLocalization'],
]

In [5]:
# Number of newline-separated localization entries per protein: a string
# containing k newlines breaks into k + 1 pieces.
lens = [entry.count('\n') + 1 for entry in subcell['subcellularLocalization']]

In [6]:
# Long-format table with one row per (accession, localization) pair.
# Each accession is repeated once per entry in `lens` so that it lines up with
# the flattened list of newline-split localization labels.
expanded_subcell = pd.DataFrame(dict(
    accession=np.repeat(subcell['accession'].values, lens),
    subcellularLocalization=np.hstack(
        subcell['subcellularLocalization'].str.split('\n')
    ),
))

In [7]:
# Frequency of each localization label, most common first (top 50 shown).
expanded_subcell['subcellularLocalization'].value_counts().head(n=50)

Cytoplasm                              166716
Nucleus                                 40576
Cell membrane                           34422
Secreted                                31905
Cell inner membrane                     26878
Plastid                                 15903
Membrane                                15042
Endoplasmic reticulum membrane           6651
Mitochondrion inner membrane             6472
Virion                                   5749
Mitochondrion                            5646
Cell projection                          5496
Host cytoplasm                           4446
Host nucleus                             4190
Chromosome                               4031
Cell junction                            4014
Periplasm                                3099
Golgi apparatus membrane                 3084
Golgi apparatus                          2945
Cytoplasmic vesicle                      2930
Virion membrane                          2743
Cell outer membrane               

In [8]:
# Indicator columns, one per distinct localization label, for every
# (accession, localization) row of the long-format table.
one_hot = pd.get_dummies(expanded_subcell.subcellularLocalization)
one_hot['accession'] = expanded_subcell['accession']
# Collapse to one row per accession. Summing counts repeated labels, so an
# accession that lists the same location more than once would end up with a
# value greater than 1; clip to 1 so every entry is a 0/1 membership flag.
grouped_one_hot = one_hot.groupby('accession').sum().clip(upper=1)

In [9]:
# Retain only the localization columns whose total over all accessions
# exceeds 4000; rarer labels are dropped.
subset = grouped_one_hot.loc[:, grouped_one_hot.sum(axis=0).gt(4000)]

In [10]:
# Sanity check: (number of accessions, number of retained localization columns).
subset.shape

(344004, 16)

In [19]:
# Fixed-seed split: 10,000 accessions for test, another 10,000 (drawn from
# what is left) for validation, and every remaining accession for training.
test = subset.sample(n=10000, random_state=1)
valid = subset.loc[~subset.index.isin(test.index)].sample(n=10000, random_state=1)
# A row belongs to train when its accession is in neither held-out set.
train = subset.loc[~subset.index.isin(test.index.append(valid.index))]

In [21]:
# Write each split to its own gzip-compressed CSV (index = accession).
for split_frame, split_path in (
    (train, '/projects/bpms/pstjohn/swissprot/subcellular/train.csv.gz'),
    (valid, '/projects/bpms/pstjohn/swissprot/subcellular/valid.csv.gz'),
    (test, '/projects/bpms/pstjohn/swissprot/subcellular/test.csv.gz'),
):
    split_frame.to_csv(split_path, compression='gzip')