## Schirmer et al. 2003
### Mouse liver

In [46]:
import PyPDF2, re, os, requests, json
import pandas as pd
from time import sleep

### 1. Import the original data file (PDF)

In [2]:
# Folder and file name of the source PDF for this paper
directory = './SourceData/Papers/Schirmer2003/'
filename = 'schirmer.som.table_S8.pdf'

# Open the PDF in binary mode; the handle is handed to the PyPDF2 reader in
# the next step, so it is intentionally left open here.
pdf = open(os.path.join(directory, filename), 'rb')

### 2. Read the pdf and extract the whole text

In [3]:
# Wrap the open file handle in a PyPDF2 reader object.
# NOTE(review): PdfFileReader / numPages look like the legacy PyPDF2 names —
# confirm they are still available in the installed PyPDF2 version.
pdf_reader = PyPDF2.PdfFileReader(pdf)
# Show the number of pages as the cell output
pdf_reader.numPages

15

In [4]:
# Extract the text of every page, in page order, and glue it into one string
page_texts = [
    pdf_reader.getPage(page_index).extractText()
    for page_index in range(pdf_reader.numPages)
]
whole_txt = ''.join(page_texts)

### 3. Extract NCBI IDs by regex

In [13]:
# In the extracted text the RefSeq protein accession follows a literal "ref|"
# marker (e.g. "ref|NP_055688.1|"), so "ref|" is used to locate each ID.
# Accessions start with either NP_ or XP_.

# Initial pattern, which also captured the optional version suffix (e.g. ".1").
# It was abandoned because including the version number could hamper the
# conversion of the IDs to UniProt IDs.
# regex = re.compile(r'(ref\|)(NP_\d+\.?\d?|XP_\d+\.?\d?)')

# Current pattern: capture only the accession body, without the version number.
# Group 1 is the "ref|" marker, group 2 is the accession itself.
regex = re.compile(r'(ref\|)(NP_\d+|XP_\d+)')

In [14]:
# findall() yields one (marker, accession) tuple per hit; keep the accession
ncbi_id_list = [hit[1] for hit in regex.findall(whole_txt)]

In [15]:
# Number of accessions the regex extracted from the PDF text
len(ncbi_id_list)

109

In [8]:
# Sanity check on the number of IDs:
# remove the line breaks, then cut the text at every ">gi" marker, which
# introduces each protein description, so that the pieces can be counted
flattened_txt = whole_txt.replace('\n', '')
whole_list = flattened_txt.split('>gi')
len(whole_list)

110

In [9]:
# Peek at the first two pieces of the split text
whole_list[:2]

['Table S8. Amino acid sequences of the putative nuclear transmembrane proteins in fasta format',
 '|7661996|ref|NP_055688.1| KIAA0205 gene product [Homo sapiens]MAITLEEAPWLGWLLVKALMRFAFMVVNNLVAIPSYICYVIILQPLRVLDSKRFWYIEGIMYKWLLGMVASWGWYAGYTVMEWGEDIKAVSKDEAVMLVNHQATGDVCTLMMCLQDKGLVVAQMMWLMDHIFKYTNFGIVSLVHGDFFIRQGRSYRDQQLLLLKKHLENNYRSRDRKWIVLFPEGGFLRKRRETSQAFAKKNNLPFLTNVTLPRSGATKIILNALVAQQKNGSPAGGDAKELDSKSKGLQWIIDTTIAYPKAEPIDIQTWILGYRKPTVTHVHYRIFPIKDVPLETDDLTTWLYQRFVEKEDLLSHFYETGAFPPSKGHKEAVSREMTLSNLWIFLIQSFAFLSGYMWYNIIQYFYHCLF']

### 4. Translate the NCBI list to Uniprot ID

In [17]:
# Display all extracted IDs as one space-separated string (handy for copy/paste)
' '.join(ncbi_id_list)

'NP_055688 NP_056473 NP_057086 NP_060531 NP_060557 NP_659471 NP_060221 XP_087089 NP_061900 XP_298567 NP_057688 NP_061891 XP_166338 NP_116231 XP_291222 NP_116117 NP_775961 NP_056348 NP_775857 NP_060634 NP_009107 XP_027330 NP_056158 NP_060640 XP_292696 NP_067038 XP_129726 XP_129627 XP_129651 NP_766431 XP_130243 XP_130605 NP_694766 XP_131120 XP_131247 XP_283952 XP_149561 NP_064382 NP_082631 NP_081113 NP_083003 NP_067278 XP_110572 NP_705798 NP_084313 XP_194313 XP_285910 NP_620096 XP_163618 NP_080922 XP_150109 NP_705820 NP_080950 XP_125636 XP_137321 XP_125972 NP_080293 NP_081625 NP_067421 NP_766088 XP_127498 NP_079663 NP_083179 NP_666167 XP_127968 XP_128377 NP_084221 NP_666084 XP_128762 NP_080446 NP_666187 XP_128954 NP_659110 XP_129087 NP_663471 NP_766428 XP_219484 XP_226713 XP_226886 XP_227591 XP_230334 XP_230798 XP_231194 XP_231568 XP_232987 XP_233684 XP_233702 XP_233726 XP_234304 XP_216756 XP_234559 XP_234768 XP_234929 XP_235233 XP_235527 XP_243957 XP_213272 XP_213339 XP_221193 XP_221266

#### Copied the string above and converted the IDs to UniProt IDs with UniProt's Retrieve/ID mapping tool (3/20/22)
#### Downloaded the mapped and the unmapped IDs and saved them as an XLSX file and a CSV file, respectively

In [22]:
# Load the mapped (XLSX) and unmapped (CSV) ID tables saved for this paper
mapped_path = './SourceData/Papers/Schirmer2003/Mapped.xlsx'
unmapped_path = './SourceData/Papers/Schirmer2003/Unmapped.csv'
df_mapped = pd.read_excel(mapped_path)
df_unmapped = pd.read_csv(unmapped_path)

In [23]:
# Column names, non-null counts and dtypes of the mapped table
df_mapped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 7 columns):
 #   Column                                                     Non-Null Count  Dtype 
---  ------                                                     --------------  ----- 
 0   yourlist:M202203204ABAA9BC7178C81CEBC9459510EDDEA346CE44X  62 non-null     object
 1   isomap:M202203204ABAA9BC7178C81CEBC9459510EDDEA346CE44X    19 non-null     object
 2   Entry                                                      62 non-null     object
 3   Entry name                                                 62 non-null     object
 4   Protein names                                              62 non-null     object
 5   Gene names                                                 58 non-null     object
 6   Organism                                                   62 non-null     object
dtypes: object(7)
memory usage: 3.5+ KB


In [24]:
# Column names, non-null counts and dtypes of the unmapped table
df_unmapped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   not mapped  61 non-null     object
dtypes: object(1)
memory usage: 616.0+ bytes


In [28]:
# Unmapped IDs as a plain Python list, followed by a small sample of them
unmapped_ncbi_ids = df_unmapped['not mapped'].tolist()
unmapped_ncbi_ids[2:10]

['XP_166338',
 'XP_291222',
 'XP_027330',
 'XP_292696',
 'XP_129726',
 'XP_129627',
 'XP_129651',
 'XP_130243']

#### A quick spot check of some unmapped IDs in the NCBI database showed that every ID examined was 'obsolete' (3/20/22)
#### So, for now, it does not seem productive to look into this list any further
#### This could mean that the number of NE proteins recovered here ends up far lower than the 67 proposed by the paper
#### After all, it has been 19 years since then...

#### Let us shift gears and focus on mapped IDs

In [43]:
# The 'isomap' column is not needed from here on, so leave it out
isomap_column = 'isomap:M202203204ABAA9BC7178C81CEBC9459510EDDEA346CE44X'
df_mapped = df_mapped.drop(columns=[isomap_column])

In [44]:
# Give the 'yourlist' column (the submitted IDs) a readable name: 'NCBI ID'
yourlist_column = 'yourlist:M202203204ABAA9BC7178C81CEBC9459510EDDEA346CE44X'
df_mapped = df_mapped.rename({yourlist_column: 'NCBI ID'}, axis='columns')

In [45]:
# De-duplicate on the protein name, keeping the first row for each name.
# 'Entry' (the UniProt ID) turned out to have no duplicates, so
# de-duplicating on it would not remove anything:
# df_mapped_unique = df_mapped.drop_duplicates(subset=['Entry'])
df_mapped_unique = df_mapped.drop_duplicates(subset='Protein names', keep='first')

In [46]:
# Check how many rows remain after de-duplication
df_mapped_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 61
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   NCBI ID        56 non-null     object
 1   Entry          56 non-null     object
 2   Entry name     56 non-null     object
 3   Protein names  56 non-null     object
 4   Gene names     53 non-null     object
 5   Organism       56 non-null     object
dtypes: object(6)
memory usage: 3.1+ KB


In [47]:
# Preview the first rows of the de-duplicated table
df_mapped_unique.head()

Unnamed: 0,NCBI ID,Entry,Entry name,Protein names,Gene names,Organism
0,NP_055688,Q92604,LGAT1_HUMAN,Acyl-CoA:lysophosphatidylglycerol acyltransfer...,LPGAT1 FAM34A KIAA0205,Homo sapiens (Human)
1,NP_056473,Q9Y3T9,NOC2L_HUMAN,Nucleolar complex protein 2 homolog (Protein N...,NOC2L NIR,Homo sapiens (Human)
2,NP_057086,Q8NBX0,SCPDL_HUMAN,Saccharopine dehydrogenase-like oxidoreductase...,SCCPDH CGI-49,Homo sapiens (Human)
3,NP_057086,A0A384NPM7,A0A384NPM7_HUMAN,Epididymis secretory sperm binding protein (Sa...,SCCPDH hCG_1782151,Homo sapiens (Human)
4,NP_060531,Q5VTL8,PR38B_HUMAN,Pre-mRNA-splicing factor 38B (Sarcoma antigen ...,PRPF38B,Homo sapiens (Human)


In [2]:
# Continue with the de-duplicated table that is already in memory.
# It is deliberately NOT re-read from './Output/Schirmer2003/Output.csv':
# the export cell at the end of this notebook overwrites that file with a
# two-column table (Entry, Gene names), so after a full re-run the reloaded
# frame would lack the 'Organism' column that the next section filters on.
# Resetting the index gives the same 0..n-1 row labels a CSV round trip would.
df_mapped_unique = df_mapped_unique.reset_index(drop=True)

In [3]:
# Preview the last rows of the table
df_mapped_unique.tail()

Unnamed: 0,NCBI ID,Entry,Entry name,Protein names,Gene names,Organism
51,NP_080446,Q4FK22,Q4FK22_MOUSE,1200007D18Rik protein (MAA-136 protein),Ergic1 maa-136,Mus musculus (Mouse)
52,NP_666187,Q6DVA0,LEMD2_MOUSE,LEM domain-containing protein 2 (Nuclear envel...,Lemd2 Lem2,Mus musculus (Mouse)
53,NP_766428,Q70UZ7,VWA2_MOUSE,von Willebrand factor A domain-containing prot...,Vwa2 Amaco,Mus musculus (Mouse)
54,NP_766428,Q8CE01,Q8CE01_MOUSE,Uncharacterized protein,Vwa2 Amaco,Mus musculus (Mouse)
55,XP_233702,G3V7B0,G3V7B0_RAT,Nucleolar protein 9,Nol9 rCG_31415,Rattus norvegicus (Rat)


### 5. Convert mouse and rat entries to their human counterparts

In [4]:
# Keep only the rows whose organism is not human
not_human = df_mapped_unique['Organism'] != 'Homo sapiens (Human)'
df_mapped_unique_MR = df_mapped_unique[not_human]

In [8]:
def get_url(url, **kwargs):
    '''
    Send a GET request to the given url and return the response.

    Parameters
    ----------
    url : str
        Address to request.
    **kwargs
        Passed through unchanged to ``requests.get``.

    Returns
    -------
    The response object, returned only when ``response.ok`` is true.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises when the response is not
    ok; the response body is printed first to help diagnose the failure.
    '''
    response = requests.get(url, **kwargs)

    if not response.ok:
        print(response.text)
        # raise_for_status() raises for every non-ok response, so nothing
        # placed after it would ever run. The former sys.exit() call here was
        # unreachable and referenced `sys`, which is not imported.
        response.raise_for_status()

    return response

In [49]:
# UniProt REST API endpoint
WEBSITE_API = "https://rest.uniprot.org/uniprotkb"

# Organism ids used in the queries; the human id is the lookup target
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# Errors that mean "this lookup produced nothing usable": a failed request or
# a response without the expected fields. Anything else (e.g. a keyboard
# interrupt or a genuine programming error) is allowed to propagate instead of
# being silently recorded as 'Not_found'.
LOOKUP_ERRORS = (requests.RequestException, KeyError, IndexError, ValueError)


def _primary_gene_symbol(result):
    '''Return the first gene symbol of a UniProt search result, or None if it has none.'''
    genes = result.get('genes') or [{}]
    return genes[0].get('geneName', {}).get('value')


def _pick_human_accession(results, gene_name):
    '''
    Choose the accession that best corresponds to gene_name.

    A `gene:` search is not limited to the primary gene symbol, so the first
    hit can belong to a different gene that merely carries the symbol as an
    alternative name. Prefer the first hit whose primary symbol equals
    gene_name (case-insensitively); if there is none, fall back to the first
    hit, which is what was always used before.

    Raises IndexError when results is empty.
    '''
    wanted = gene_name.casefold()
    for result in results:
        symbol = _primary_gene_symbol(result)
        if symbol is not None and symbol.casefold() == wanted:
            return result['primaryAccession']
    return results[0]['primaryAccession']


# One row per mouse/rat entry: its accession, its gene symbol and the
# accession of the human entry found for that symbol
my_dict = {'entry_mr': [], 'gene_name': [], 'entry_h': []}

for entry in df_mapped_unique_MR.Entry:
    try:
        # get response with the query, then the gene name in it
        r = get_url(f'{WEBSITE_API}/search?query=accession:{entry}&fields=gene_names')
        gene_name = r.json()['results'][0]['genes'][0]['geneName']['value']
        sleep(1)  # pause between requests

        # find the entry of the human version of the gene
        r2 = get_url(f'{WEBSITE_API}/search?query=gene:{gene_name}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        entry_hs = _pick_human_accession(r2.json()['results'], gene_name)
        sleep(1)

    except LOOKUP_ERRORS as err:
        # Record the miss, but say why, so failures are not invisible
        print(f'{entry}: lookup failed ({type(err).__name__}: {err})')
        gene_name = 'Not_found'
        entry_hs = 'Not_found'

    my_dict['entry_mr'].append(entry)
    my_dict['gene_name'].append(gene_name)
    my_dict['entry_h'].append(entry_hs)
    

In [59]:
# Turn the lookup results into a table of human entries:
# one row per distinct human accession, failed lookups removed, the
# mouse/rat accession dropped, and columns named like the mapped table
_df = (
    pd.DataFrame(my_dict)
    .drop_duplicates(subset='entry_h')
    .loc[lambda frame: frame['entry_h'] != 'Not_found']
    .drop(columns=['entry_mr'])
    .rename(columns={'gene_name': 'Gene names', 'entry_h': 'Entry'})
)
_df.head()

Unnamed: 0,Gene names,Entry
0,Tor1aip2,Q8NFQ8
1,Nat10,Q9H0A0
2,Alg2,O75340
3,Ndc1,Q9BTX1
4,Tmem53,Q6P2H8


In [61]:
# Human rows of the mapped table, reduced to the two columns of interest,
# stacked on top of the human entries found for the mouse/rat proteins
is_human = df_mapped_unique['Organism'] == 'Homo sapiens (Human)'
df_mapped_unique_Hs = df_mapped_unique.loc[is_human, ['Entry', 'Gene names']]
df = pd.concat([df_mapped_unique_Hs, _df])

In [66]:
# Discard every row that has a missing value in any column
df = df.dropna(axis='index', how='any')

In [67]:
# Final row count and columns of the combined table
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 0 to 27
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Entry       44 non-null     object
 1   Gene names  44 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


#### Export

In [68]:
# Write the final table without the index column. The output folder is
# created first so the export does not fail when it does not exist yet.
output_path = './Output/Schirmer2003/Output.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)