In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn/matplotlib styling for every figure drawn in this notebook.
sns.set(
    context='talk',
    style='ticks',
    color_codes=True,
    rc={'legend.frameon': False},
)

%matplotlib inline

In [2]:
# Load the table stored in parsed_swissprot.parquet into a pandas DataFrame.
data = pd.read_parquet('/projects/bpms/pstjohn/swissprot/parsed_swissprot.parquet')

In [19]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,Q6GZX4,AY548484,YP_031579.1,vg:2947773,IPR007031,PF04947,654924,256,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,
1,Q6GZX3,AY548484,YP_031580.1,vg:2947774,IPR004251,PF03003,654924,320,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,Host membrane
2,Q197F8,DQ643392,YP_654574.1,vg:4156251,,,345201,458,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,
3,Q197F7,DQ643392,YP_654575.1,vg:4156252,,,345201,156,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,
4,Q6GZX2,AY548484,YP_031581.1,vg:2947775,,,654924,438,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,


In [10]:
# (number of rows, number of columns) of the loaded table.
data.shape

(561568, 10)

In [41]:
from pathlib import Path
from itertools import islice
# Create the directory that receives the per-batch GO CSV files, including any
# missing parents; do nothing if it already exists.
Path('/scratch/pstjohn/uniparc/swissprot_gos').mkdir(parents=True, exist_ok=True)

def grouper(iterable, n):
    """Yield successive tuples of up to ``n`` items drawn from ``iterable``.

    The last tuple may hold fewer than ``n`` items. Nothing is yielded once
    the input is exhausted, so an empty input produces no tuples at all.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling chunks until an empty tuple comes
    # back, which is what islice returns after ``source`` runs dry.
    for chunk in iter(lambda: tuple(islice(source, n)), ()):
        yield chunk

In [42]:
# Take the first ten rows and pull out their accession column.
data_short = data.iloc[:10]
accessions = data_short['accession']

In [43]:
import requests
from io import StringIO

def get_GO_api_call(accessions, timeout=60):
    """Download QuickGO annotations for a batch of gene product IDs.

    Parameters
    ----------
    accessions : str or iterable of str
        Gene product IDs to query. They are sent comma-separated in the
        ``geneProductId`` query parameter. A single string is treated as one
        ID rather than being split into characters.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of blocking forever. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        The tab-separated response body parsed with ``pandas.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.

    Notes
    -----
    NOTE(review): the QuickGO download endpoint is believed to cap the number
    of rows per request -- confirm that a batch cannot exceed that cap.
    """
    # ','.join on a bare string would interleave commas between its characters.
    if isinstance(accessions, str):
        accessions = [accessions]

    # Let requests build and encode the query string instead of formatting it
    # by hand.
    r = requests.get(
        "https://www.ebi.ac.uk/QuickGO/services/annotation/downloadSearch",
        params={"geneProductId": ",".join(accessions)},
        headers={"Accept": "text/tsv"},
        timeout=timeout,
    )

    # No-op on success; raises requests.HTTPError on 4xx/5xx responses.
    r.raise_for_status()

    return pd.read_csv(StringIO(r.text), sep='\t')

In [45]:
# Display the current value bound to `accessions`.
# NOTE(review): the recorded output is a single string, not the Series column
# assigned from data_short -- the name appears to have been rebound by another
# cell run in between; confirm the intended execution order.
accessions

'Q6GZX3'

In [51]:
from tqdm import tqdm

In [None]:
# Download GO annotations for every accession in fixed-size batches and store
# one CSV per batch in the swissprot_gos directory.
BATCH_SIZE = 100
go_dir = Path('/scratch/pstjohn/uniparc/swissprot_gos')
# Ceiling division: number of batches grouper() will produce, so the progress
# bar can show a total and an ETA.
n_batches = -(-len(data) // BATCH_SIZE)

# tqdm wraps the batch iterator itself (not enumerate) so `total` applies.
# The loop variable is `batch`, which leaves the notebook-level `accessions`
# untouched.
for i, batch in enumerate(tqdm(grouper(data.accession, BATCH_SIZE), total=n_batches)):
    out_path = go_dir / f'{i:04d}.csv'
    if out_path.exists():
        # Batch already saved by an earlier run: skip it so that an interrupted
        # download can resume. Delete the file to force a refresh.
        continue

    df = get_GO_api_call(batch)

    # Write under a temporary name first and rename afterwards, so a partially
    # written file is never picked up by a later '*.csv' glob.
    tmp_path = go_dir / f'{i:04d}.csv.tmp'
    # The index is still written (pandas default), so readers of these files
    # see the same leading unnamed column as before.
    df.to_csv(tmp_path)
    tmp_path.replace(out_path)

1119it [35:46,  2.08s/it]

In [1]:
import dask.dataframe as dd

In [2]:
# Read every CSV in the swissprot_gos directory as a single dask DataFrame.
godata = dd.read_csv('/scratch/pstjohn/uniparc/swissprot_gos/*.csv')

In [4]:
# Preview the combined table without the index column that to_csv wrote.
# `columns=` is used because passing the axis positionally is no longer
# accepted by pandas 2.x.
godata.drop(columns=['Unnamed: 0']).head()

Unnamed: 0,GENE PRODUCT DB,GENE PRODUCT ID,SYMBOL,QUALIFIER,GO TERM,GO ASPECT,ECO ID,GO EVIDENCE CODE,REFERENCE,WITH/FROM,TAXON ID,ASSIGNED BY,ANNOTATION EXTENSION,DATE
0,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571|InterPro:IPR018354,8613,InterPro,,20200613
1,UniProtKB,A2CKF6,A2CKF6,involved_in,GO:0009405,P,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571,8613,InterPro,,20200613
2,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0964,8613,UniProt,,20200613
3,UniProtKB,A2CKF6,A2CKF6,enables,GO:0090729,F,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0800,8613,UniProt,,20200613
4,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0035792,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0629,8613,UniProt,,20200613


In [5]:
# Drop the index column that to_csv wrote and materialise the dask frame as an
# in-memory pandas DataFrame. `columns=` is used because passing the axis
# positionally is no longer accepted by pandas 2.x.
godata_df = godata.drop(columns=['Unnamed: 0']).compute()

In [8]:
# Persist the combined annotation table as a parquet file.
godata_df.to_parquet('/projects/bpms/pstjohn/swissprot/swissprot_quickgo.parquet')

In [11]:
import pandas as pd

In [16]:
# Reload the saved annotation table from parquet using the pyarrow engine.
godata_df = pd.read_parquet('/projects/bpms/pstjohn/swissprot/swissprot_quickgo.parquet', engine='pyarrow')

In [18]:
# Preview the first rows of the reloaded annotation table.
godata_df.head()

Unnamed: 0,GENE PRODUCT DB,GENE PRODUCT ID,SYMBOL,QUALIFIER,GO TERM,GO ASPECT,ECO ID,GO EVIDENCE CODE,REFERENCE,WITH/FROM,TAXON ID,ASSIGNED BY,ANNOTATION EXTENSION,DATE
0,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571|InterPro:IPR018354,8613,InterPro,,20200613
1,UniProtKB,A2CKF6,A2CKF6,involved_in,GO:0009405,P,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571,8613,InterPro,,20200613
2,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0964,8613,UniProt,,20200613
3,UniProtKB,A2CKF6,A2CKF6,enables,GO:0090729,F,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0800,8613,UniProt,,20200613
4,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0035792,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0629,8613,UniProt,,20200613


In [20]:
# Parse the YYYYMMDD-formatted DATE values into a datetime column named Date.
godata_df['Date'] = pd.to_datetime(godata_df['DATE'], format='%Y%m%d')

In [23]:
# Display the Date column cast to nanosecond-resolution datetimes (the result
# is only shown, not stored). The unit is spelled out because pandas 2.x
# rejects the unit-less "datetime64" dtype.
godata_df['Date'].astype('datetime64[ns]')

0      2020-06-13
1      2020-06-13
2      2020-06-13
3      2020-06-13
4      2020-06-13
          ...    
1246   2005-03-22
1247   2005-03-22
1248   2005-03-22
1249   2005-03-22
1250   2005-03-22
Name: Date, Length: 8038735, dtype: datetime64[ns]

In [25]:
# Count the non-null entries of every column per calendar (year, month) of Date.
# The two group keys are renamed so the resulting MultiIndex levels are
# ('year', 'month') instead of two levels both called 'Date', which would also
# clash with the 'Date' column in the result.
count = godata_df.groupby([
    godata_df["Date"].dt.year.rename("year"),
    godata_df["Date"].dt.month.rename("month"),
]).count()

In [32]:
# Preview the first rows again; the frame now includes the parsed Date column.
godata_df.head()

Unnamed: 0,GENE PRODUCT DB,GENE PRODUCT ID,SYMBOL,QUALIFIER,GO TERM,GO ASPECT,ECO ID,GO EVIDENCE CODE,REFERENCE,WITH/FROM,TAXON ID,ASSIGNED BY,ANNOTATION EXTENSION,DATE,Date
0,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571|InterPro:IPR018354,8613,InterPro,,20200613,2020-06-13
1,UniProtKB,A2CKF6,A2CKF6,involved_in,GO:0009405,P,ECO:0000256,IEA,GO_REF:0000002,InterPro:IPR003571,8613,InterPro,,20200613,2020-06-13
2,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0005576,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0964,8613,UniProt,,20200613,2020-06-13
3,UniProtKB,A2CKF6,A2CKF6,enables,GO:0090729,F,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0800,8613,UniProt,,20200613,2020-06-13
4,UniProtKB,A2CKF6,A2CKF6,part_of,GO:0035792,C,ECO:0000322,IEA,GO_REF:0000043,UniProtKB-KW:KW-0629,8613,UniProt,,20200613,2020-06-13
