In [1]:
import requests
import pandas as pd
import ast
import json
from my_utils import get_url, get_source_pmid
import my_config

In [23]:
# Load the saved subcellular-location results table that the loop below annotates.
uniprot_results_csv = './Output/UniprotSubcellResults.csv'
df = pd.read_csv(uniprot_results_csv)

In [3]:
# Location names counted as nuclear envelope (NE)
NE_list = [
    'Nucleus outer membrane',
    'Nucleus membrane',
    'Nucleus inner membrane',
    'Nucleus, nuclear pore complex',
    'Nucleus envelope',
    'Nucleus lamina',
]

# Location names counted as endoplasmic reticulum (ER)
ER_list = [
    'Endoplasmic reticulum membrane',
    'Endoplasmic reticulum',
    'Sarcoplasmic reticulum membrane',
    'Endoplasmic reticulum-Golgi intermediate compartment membrane',
    'Endoplasmic reticulum lumen',
]

# Combined NE + ER whitelist (NE names first, then ER names)
NE_ER_list = [*NE_list, *ER_list]

# Evidence codes accepted as reliable support for a location
good_evidence = [
    'ECO:0000269',  # experimental
    'ECO:0000305',  # inference from paper
    'ECO:0000250',  # sequence similarity
    'ECO:0000255',  # sequence model
    'ECO:0000312',  # imported from another database
    'ECO:0007744',  # combination of experimental and computational evidence
]

# PubMed IDs of the five papers whose sole support does not count as evidence
five_papers = [
    '12958361',  # Schirmer 2003
    '20693407',  # Korfali 2010
    '20876400',  # Wilkie 2010
    '22990521',  # Korfali 2012
    '31142202',  # Cheng 2019
]

In [24]:
# For every row of df, decide whether the entry has at least one NE/ER
# subcellular location backed by an accepted evidence code that does not rest
# solely on one of the five listed papers.
#
# Reads (defined in earlier cells): df, NE_ER_list, good_evidence, five_papers,
# get_source_pmid.
# Writes, row by row:
#   df["Uniprot_NEorER"]  - 1 if any location passed the check, else 0
#   df["Uniprot_subcell"] - every location name found, joined with ", "
for i in range(len(df)):
    # log
    if i % 100 == 0:
        print("Starting ", i)

    # Column 1 stores a Python-literal string (presumably the UniProt entry);
    # parse it back into Python objects.
    result = ast.literal_eval(df.iloc[i, 1])

    # Location names for this row and, in parallel, the 1/0 judgement of
    # whether each one is a reliably supported NE/ER location.
    locations_list = []
    judge_list = []

    # Each comment contains information for each isoform, if any
    comments = result['comments'] if 'comments' in result else []
    for comment in comments:
        if comment['commentType'] != 'SUBCELLULAR LOCATION':
            continue

        for loc in comment.get('subcellularLocations', []):
            # An entry without 'location' has nothing to judge. Skip it here so
            # that nothing is recorded for it; appending after this check would
            # re-record the previous entry's name and judgement (or raise
            # NameError when it is the very first entry seen).
            if 'location' not in loc:
                continue

            location = loc['location']
            value = location.get('value', '')
            evidences = location.get('evidences', [])

            # Judgements for this specific location
            location_judge = 0
            evidence_judge = 0

            # PMIDs/ids collected from this location's evidences; only used
            # for the five-papers check below.
            pmid_list = []

            for evidence in evidences:
                code = evidence.get('evidenceCode', '')
                pmid = evidence.get('id', '')

                # Any accepted evidence code makes the location provisionally valid
                if code in good_evidence:
                    evidence_judge = 1

                # "Similarity" evidence points at another entry, so look up the
                # PMID(s) behind it. `and` short-circuits, so a missing/empty id
                # is never passed on.
                if code == "ECO:0000250" and pmid:
                    pmid = get_source_pmid(pmid, value)

                # By default the id is a single string; after the similarity
                # lookup it may be a list of ids instead.
                if isinstance(pmid, str):
                    pmid_list.append(pmid)
                elif isinstance(pmid, list):
                    pmid_list.extend(pmid)

            # If the location is supported only by the five papers, the
            # evidence is considered invalid. The explicit non-empty check is
            # needed because all() over an empty list is True.
            # NOTE(review): this override only fires when the location has
            # exactly one evidence entry; a location whose several evidences
            # all cite the five papers keeps evidence_judge == 1 - confirm
            # that this is intended.
            if (
                len(evidences) == 1
                and pmid_list
                and all(item in five_papers for item in pmid_list)
            ):
                evidence_judge = 0
                print(f"For {i}, {value}, {pmid_list}, evidence was only from 5 papers")

            # Final judge for this location: an NE/ER name with valid evidence
            if value in NE_ER_list and evidence_judge == 1:
                location_judge = 1

            locations_list.append(value)
            judge_list.append(location_judge)

    # Final judge for the row: at least one location judged 1 means NE/ER is
    # supported by UniProt with evidence beyond the five papers.
    final_judge = 1 if 1 in judge_list else 0

    df.loc[i, "Uniprot_NEorER"] = final_judge
    df.loc[i, "Uniprot_subcell"] = ", ".join(locations_list)

Starting  0
For 82, Nucleus inner membrane, ['31142202'], evidence was only from 5 papers
For 83, Nucleus inner membrane, ['31142202'], evidence was only from 5 papers
Starting  100
Starting  200
Starting  300
Starting  400
For 407, Nucleus envelope, ['31142202'], evidence was only from 5 papers
For 408, Nucleus inner membrane, ['31142202'], evidence was only from 5 papers


In [26]:
# Write the annotated table to disk, leaving the DataFrame index out of the file.
judgements_csv = './Uniprot_judgements.csv'
df.to_csv(judgements_csv, index=False)