# Finding invalid entries in the SIGNOR database

In [1]:
from pathlib import Path

import bioregistry
import pandas
import tqdm

## 1. Load SIGNOR export
This file can be downloaded from the SIGNOR website with:

```
curl -X POST -d "organism=human&format=csv&submit=Download" -o signor.tsv https://signor.uniroma2.it/download_entity.php
```

In [2]:
# Location of the SIGNOR export, resolved relative to the current user's home
# directory rather than one machine's absolute path. Edit SIGNOR_PATH if the
# file was downloaded somewhere else.
SIGNOR_PATH = Path.home() / '.data' / 'indra' / 'signor.tsv'

# The export is parsed as tab-separated text
df = pandas.read_csv(SIGNOR_PATH, sep='\t')

## 2. Mapping database names to Bioregistry prefixes

In [3]:
# Collect every database name used for either entity column, then map each
# one to whatever bioregistry.normalize_prefix returns for it
database_names = set(df.DATABASEA) | set(df.DATABASEB)
db_mappings = {
    name: bioregistry.normalize_prefix(name)
    for name in database_names
}
db_mappings

{'UNIPROT': 'uniprot',
 'ChEBI': 'chebi',
 'PUBCHEM': 'pubchem.compound',
 'RNAcentral': 'rnacentral',
 'SIGNOR': 'signor',
 'DRUGBANK': 'drugbank'}

## 3. Checking identifiers for invalid entries

We handle a few cases that are technically not canonical but are used 
as conventions in the SIGNOR data, and don't flag these as invalid.

In [4]:
# Collect (database, identifier, signor_id) tuples for every entry that
# fails validation
invalidities = []
# Iterate over each row
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    # The SIGNOR ID labels every report for this row: both entities
    # and the publication reference checked after the entity loop
    signor_id = row['SIGNOR_ID']

    # Look at both entities in the relationship
    for ent in ('A', 'B'):
        # Get basic data from the row
        db = row[f'DATABASE{ent}']
        prefix = db_mappings[db]
        ident = row[f'ID{ent}']

        # If we have a CHEBI ID, we expect the convention
        # of having a CHEBI: prefix
        if prefix == 'chebi':
            if ident.startswith('CHEBI:'):
                ident = ident[6:]
            else:
                invalidities.append((db, ident, signor_id))
                continue

        # If we have a PUBCHEM ID, we expect the convention
        # of having a CID: prefix
        elif prefix == 'pubchem.compound':
            if ident.startswith('CID:'):
                ident = ident[4:]
            else:
                invalidities.append((db, ident, signor_id))
                continue

        # If we have a UniProt ID, there are a few conventions
        # to handle:
        # 1. Simple UniProt ID
        # 2. UniProt isoform ID
        # 3. UniProt ID coupled to a chain ID
        # other patterns are considered invalid
        elif prefix == 'uniprot':
            parts = ident.split('-')
            # This is a simple ID
            if len(parts) == 1:
                pass
            # This is either an isoform or a chain
            elif len(parts) == 2:
                # If we're dealing with a chain
                if parts[1].startswith('PRO'):
                    # The base accession is validated by the generic
                    # check below; the chain part is validated here
                    ident = parts[0]
                    pro_valid = bioregistry.is_valid_identifier('uniprot.chain', parts[1])
                    if not pro_valid:
                        invalidities.append((db, parts[1], signor_id))
                        continue
                # If we're dealing with an isoform
                else:
                    iso_valid = bioregistry.is_valid_identifier('uniprot.isoform', ident)
                    if not iso_valid:
                        invalidities.append((db, ident, signor_id))
                    # Always skip the generic check: the full isoform ID
                    # has already been validated against uniprot.isoform
                    continue
            else:
                invalidities.append((db, ident, signor_id))
                continue

        # Finally, we do a generic validity check
        valid = bioregistry.is_valid_identifier(prefix, ident)
        if not valid:
            invalidities.append((db, ident, signor_id))

    # Now check PMIDs, accepting "Other" as a placeholder
    # We also triage the ID to validate PMC IDs, and NCBI books
    pmid = row['PMID']
    if pmid == 'Other':
        continue
    elif pmid.startswith('PMC'):
        if not bioregistry.is_valid_identifier('pmc', pmid):
            invalidities.append(('PMID', pmid, signor_id))
    elif pmid.startswith('NBK'):
        if not bioregistry.is_valid_identifier('ncbibook', pmid):
            invalidities.append(('PMID', pmid, signor_id))
    elif not bioregistry.is_valid_identifier('pubmed', pmid):
        invalidities.append(('PMID', pmid, signor_id))

# Sort by SIGNOR ID
invalidities = sorted(invalidities, key=lambda x: x[2])

100%|████████████████████████████████████████████████████| 35206/35206 [00:00<00:00, 41354.37it/s]


## 4. Display table of invalid entries

In [5]:
# Lift the row limit so the whole table of invalid entries is rendered
pandas.options.display.max_rows = None

# One row per flagged entry; the cell's last expression is displayed
pandas.DataFrame(
    data=invalidities,
    columns=["database", "identifier", "signor_id"],
)

Unnamed: 0,database,identifier,signor_id
0,PMID,11179217,SIGNOR-255742
1,ChEBI,SID:134445687,SIGNOR-259385
2,ChEBI,SID:134445687,SIGNOR-259386
3,ChEBI,SID:134445687,SIGNOR-259387
4,ChEBI,SID:46508054,SIGNOR-259388
5,ChEBI,SID:46508054,SIGNOR-259389
6,ChEBI,SID:46508054,SIGNOR-259390
7,ChEBI,SID:135317436,SIGNOR-259391
8,ChEBI,SID:125240988,SIGNOR-259392
9,ChEBI,SID:125240988,SIGNOR-259393
