## Schirmer et al. 2003
### Mouse liver

In [1]:
import PyPDF2, re, os, requests, json
import pandas as pd
from time import sleep
from my_utils import get_url, find_duplicate
import my_config

### 1. Import the original data file (PDF)

In [14]:
# Locate the supplementary table PDF for this paper.
directory = './SourceData/Papers/Schirmer2003/'
filename = 'schirmer.som.table_S8.pdf'
path = os.path.join(directory, filename)

# Fail fast when the file is missing: merely printing a message would leave `pdf`
# undefined (or stale from an earlier run) for the cells that follow.
if not os.path.exists(path):
    raise FileNotFoundError(f'Path is wrong: {path}')

# Open the same path that was just checked. The handle is deliberately left open
# because it is handed to PyPDF2.PdfReader in a later cell.
pdf = open(path, 'rb')

### 2. Read the pdf and extract the whole text

In [15]:
# Wrap the open file handle in a PyPDF2 reader and report how many pages it has
pdf_reader = PyPDF2.PdfReader(pdf)
page_count = len(pdf_reader.pages)
print(page_count)

15


In [16]:
# Concatenate the extracted text of every page, in page order, into one string
whole_txt = ''.join(page.extract_text() for page in pdf_reader.pages)

### 3. Extract NCBI IDs by regex

In [17]:
# Each protein header carries its NCBI ID right after a "ref|" marker,
# e.g. "ref|NP_055688.1|". The ID starts with either NP_ or XP_.
#
# The initial pattern also captured the optional version number (e.g. ".1"):
#     regex = re.compile(r'(ref\|)(NP_\d+\.?\d?|XP_\d+\.?\d?)')
# but including the version number could hamper ID conversion to Uniprot IDs,
# so only the main body of the ID (without the version) is captured.
regex = re.compile(r'(ref\|)(NP_\d+|XP_\d+)')

# Group 2 of every match is the ID itself; collect them in order of appearance
ncbi_id_list = [match.group(2) for match in regex.finditer(whole_txt)]

In [18]:
# Report how many IDs the regex captured
ncbi_id_count = len(ncbi_id_list)
print('Number of the ncbi ID: ', ncbi_id_count)

Number of the ncbi ID:  109


#### Check below that the number of IDs is correct
#### Split the text into individual gene descriptions, using ">gi" as the delimiter

In [16]:
# Remove the line breaks, then cut the text at every ">gi" marker
flattened_txt = whole_txt.replace('\n', '')
whole_list = flattened_txt.split('>gi')
print('Number of the ncbi ID: ', len(whole_list))

Number of the ncbi ID:  110


#### What's the extra 1? As seen below it is the table title. So 109 should be the correct number

In [17]:
# Peek at the first two chunks: the piece before the first ">gi" and the first record
whole_list[:2]

['Table S8.  Amino acid sequences of the putative nuclear transmembrane proteins in fasta format',
 '|7661996|ref|NP_055688.1| KIAA0205 gene product [Homo sapiens]MAITLEEAPWLGWLLVKALMRFAFMVVNNLVAIPSYICYVIILQPLRVLDSKRFWYIEGIMYKWLLGMVASWGWYAGYTVMEWGEDIKAVSKDEAVMLVNHQATGDVCTLMMCLQDKGLVVAQMMWLMDHIFKYTNFGIVSLVHGDFFIRQGRSYRDQQLLLLKKHLENNYRSRDRKWIVLFPEGGFLRKRRETSQAFAKKNNLPFLTNVTLPRSGATKIILNALVAQQKNGSPAGGDAKELDSKSKGLQWIIDTTIAYPKAEPIDIQTWILGYRKPTVTHVHYRIFPIKDVPLETDDLTTWLYQRFVEKEDLLSHFYETGAFPPSKGHKEAVSREMTLSNLWIFLIQSFAFLSGYMWYNIIQYFYHCLF']

In [18]:
# Inspect the last two chunks of the split text
whole_list[-2:]

['|27659542|ref|XP_226578.1| similar to Hypothetical protein KIAA0133 [Rattus norvegicus]MAAVYSGISFKLKSKTTSWEDKLKLAHFAWISHQCFLPNKEQVLLDWARQSLVAFYKKKLELQEDIVERLWVYVDNILHSRRLQNLLKNGKTINLQISLVKIINEKIEEFSLSGSQRSICAILSCCQGILSAPALAVIYTAKPELIVALLSQLCWSACRQPEGAMTAKLFEVIHLALDHYLKLQQQQANPRRVFGDMTGHLFQPCLVLRHLLLGGTWTQASQGQLWQVLSRDIRSKIDAVLRGGVFRYDLLSSYKEELLEQQQENVKMGVLKNLLTPMDTVITRLVEPDYVKSDLHALVVASSVSLLYRLFLDSYLKEENQFLCFQVLPRLFGCLQISHLQEGQMEALSLSDWTTELLAVEQLLNLVATSNIYNVAADRIRHGETQFHFYRRVAELLINHSQASVPAWFRCLKILMSLNHLILEPDLDDLLSSAWIDAEVTEFRAKKAQEVLINTVFQTYAKLRQMPQLFQELLEVICRPAAEALRQPLLASGLSMALCACFLELPLSQILDTWSLVLDKFQSLVMPCLQSDTDMAFKAMSLSSLLHCIMFNMQSLDNNMPLPIIRRTQCMMERMLRELVKPLLGLLLDLWSPEPEPWQQKVSDSALLLSYTWAQVDTTLSLHCSQYYSLAVSLARAALDSSNLPLLLPGVETELWKKVEKCIAQSRSLSRYCLEQLYLQKVKRTLIRTSSQSKEALQTLRFDTAHILDSSRDCLSQKTVTAWDRQVSTMNESLYPVAHWHLIVSNLTVLIPYLCLNDVRYVATVLLRTLPTSKAQGSLAPGESYVTLEKISTALLHSPLFPEMQSLYSAFLTCIIAKCSNILCSGAHNDLSLSQQLPWLFGKDYHTLVAHWETRLAKVGSEGVEPRGEIAQNFLSMVKSGFPIKLHEDQLKDLLELFDVISALHLDSLWP

### 4. Translate the NCBI list to Uniprot ID

In [17]:
# Display all IDs as one space-separated string (copied from here into the ID-mapping step)
' '.join(ncbi_id_list)

'NP_055688 NP_056473 NP_057086 NP_060531 NP_060557 NP_659471 NP_060221 XP_087089 NP_061900 XP_298567 NP_057688 NP_061891 XP_166338 NP_116231 XP_291222 NP_116117 NP_775961 NP_056348 NP_775857 NP_060634 NP_009107 XP_027330 NP_056158 NP_060640 XP_292696 NP_067038 XP_129726 XP_129627 XP_129651 NP_766431 XP_130243 XP_130605 NP_694766 XP_131120 XP_131247 XP_283952 XP_149561 NP_064382 NP_082631 NP_081113 NP_083003 NP_067278 XP_110572 NP_705798 NP_084313 XP_194313 XP_285910 NP_620096 XP_163618 NP_080922 XP_150109 NP_705820 NP_080950 XP_125636 XP_137321 XP_125972 NP_080293 NP_081625 NP_067421 NP_766088 XP_127498 NP_079663 NP_083179 NP_666167 XP_127968 XP_128377 NP_084221 NP_666084 XP_128762 NP_080446 NP_666187 XP_128954 NP_659110 XP_129087 NP_663471 NP_766428 XP_219484 XP_226713 XP_226886 XP_227591 XP_230334 XP_230798 XP_231194 XP_231568 XP_232987 XP_233684 XP_233702 XP_233726 XP_234304 XP_216756 XP_234559 XP_234768 XP_234929 XP_235233 XP_235527 XP_243957 XP_213272 XP_213339 XP_221193 XP_221266

#### Copied the IDs above and converted them to Uniprot IDs on Retrieve/ID mapping (3/20/22)
#### Obtained the mapped and unmapped IDs, saved as an XLSX and a CSV file, respectively

In [4]:
# Load the two ID-mapping result files saved next to the source PDF
mapped_xlsx = './SourceData/Papers/Schirmer2003/Mapped.xlsx'
unmapped_csv = './SourceData/Papers/Schirmer2003/Unmapped.csv'

df_mapped = pd.read_excel(mapped_xlsx)
df_unmapped = pd.read_csv(unmapped_csv)

In [5]:
# Report the number of rows in each ID-mapping result table
mapped_count, unmapped_count = len(df_mapped), len(df_unmapped)
print(f'Mapped IDs: {mapped_count}')
print(f'Unmapped IDs: {unmapped_count}')

Mapped IDs: 62
Unmapped IDs: 61


In [22]:
# Pull the unmapped IDs out of the 'not mapped' column and preview a slice of them
unmapped_ncbi_ids = df_unmapped['not mapped'].to_list()
unmapped_ncbi_ids[2:10]

['XP_166338',
 'XP_291222',
 'XP_027330',
 'XP_292696',
 'XP_129726',
 'XP_129627',
 'XP_129651',
 'XP_130243']

#### Quick random check of unmapped IDs on NCBI DB revealed all of examined IDs were 'obsolete' (3/20/22)
#### For now it seems unproductive to further look into these unmapped genes
#### Let us focus on mapped IDs

In [6]:
# The ID-mapping export names two of its columns 'isomap:<token>' and
# 'yourlist:<token>'. The token looks job-specific (presumably it changes when the
# mapping is redone -- confirm), so the columns are matched by prefix rather than
# by their full hard-coded names. This also makes the cell safe to re-run: a second
# run finds nothing left to drop or rename instead of raising KeyError.
isomap_columns = [col for col in df_mapped.columns if str(col).startswith('isomap:')]
submitted_id_columns = {
    col: 'NCBI ID' for col in df_mapped.columns if str(col).startswith('yourlist:')
}

# drop the unnecessary column(s), then rename the submitted-ID column to NCBI ID
df_mapped = df_mapped.drop(columns=isomap_columns).rename(columns=submitted_id_columns)

### 5. Convert mouse and rat entries to their human counterparts

In [7]:
# Keep every mapped row whose organism is anything other than human
is_human = df_mapped['Organism'] == 'Homo sapiens (Human)'
df_mapped_nonhuman = df_mapped[~is_human]
print('Non-human entries: ', len(df_mapped_nonhuman))

Non-human entries:  35


In [8]:
# For every non-human entry: look up its gene name, then search for a human entry
# with that gene name. A value stays 'Not_found' when its lookup fails.
my_dict = {'entry_mr': [], 'gene_name': [], 'entry_h': []}

for position, entry in enumerate(df_mapped_nonhuman.Entry):
    # placeholders that survive if the corresponding lookup below fails
    gene_name = 'Not_found'
    entry_hs = 'Not_found'

    try:
        # step 1: ask for the gene names of this accession and take the first one
        gene_query = {
            "query": f'accession:{entry}',
            "fields": "gene_names",
            "format": "json",
        }
        gene_response = get_url(my_config.WEBSITE_API, params=gene_query)
        gene_name = gene_response.json()['results'][0]['genes'][0]['geneName']['value']
        sleep(1)  # pause between requests (presumably to throttle API calls)

        # step 2: search the human entries for that gene name and take the first hit
        human_id = my_config.organism_id_list["Homo sapiens"]
        human_query = {
            "query": f'gene:{gene_name} AND organism_id:{human_id}',
            "fields": "accession, gene_names",
            "format": "json",
        }
        human_response = get_url(my_config.WEBSITE_API, params=human_query)
        entry_hs = human_response.json()['results'][0]['primaryAccession']
        sleep(1)

    except Exception as e:
        # keep going: whatever was not resolved stays 'Not_found'
        print(f'Error in fetching {entry}: {e}')

    # progress report on every 10th entry (positions 0, 10, 20, ...)
    if position % 10 == 0:
        print(entry, gene_name, entry_hs)

    my_dict['entry_mr'].append(entry)
    my_dict['gene_name'].append(gene_name)
    my_dict['entry_h'].append(entry_hs)

_df = pd.DataFrame(my_dict)

Q8BYU6 Tor1aip2 Q8NFQ8
Q05DT5 Mospd3 O75425
Error in fetching Q9DCZ9: list index out of range
Q4FZC9 Syne3 Q6ZMZ3
Q8VDS4 Rprd1a Q96P16


#### Which genes were not found?
#### Uniprot search revealed this gene exists only in mouse

In [9]:
# Show the rows whose human entry is still the 'Not_found' placeholder
_df[_df.entry_h == 'Not_found']

Unnamed: 0,entry_mr,gene_name,entry_h
16,Q9DCZ9,Aph1c,Not_found


#### Any duplicates between mouse and rat?

In [10]:
# List the human entries that find_duplicate reports among the converted entries
duplicated_human_entries = find_duplicate(_df.entry_h.to_list())
print(duplicated_human_entries)

['Q9Y3T9', 'O75425', 'O75425', 'Q9H936', 'Q6NW34', 'Q969X5', 'Q5GFL6', 'Q5SY16']


In [11]:
# Cleanup: one row per human entry, drop the mouse/rat entry column, rename the
# remaining columns to 'Gene names' / 'Entry', and discard the failed lookups
_df_cleaned = (
    _df
    .drop_duplicates(subset='entry_h')
    .drop(columns=['entry_mr'])
    .rename(columns={'gene_name': 'Gene names', 'entry_h': 'Entry'})
    .loc[lambda frame: frame.Entry != 'Not_found']
)

In [12]:
# Report how many human entries remain after the cleanup
obtained_ids = _df_cleaned.Entry.to_list()
print('Number of obtained IDs: ', len(obtained_ids))

Number of obtained IDs:  26


#### Combine the converted df with the original Human df

In [13]:
# Human rows of the mapped table, reduced to the two columns shared with _df_cleaned
human_rows = df_mapped.Organism == 'Homo sapiens (Human)'
df_mapped_human = df_mapped.loc[human_rows, ['Entry', 'Gene names']]

# Stack the original human entries on top of the converted mouse/rat ones
df = pd.concat([df_mapped_human, _df_cleaned])

In [14]:
# Check the combined table for repeated entries and repeated gene-name strings
entry_duplicates = find_duplicate(df.Entry.to_list())
print('UniprotID duplicate: ', entry_duplicates)
gene_name_duplicates = find_duplicate(df['Gene names'].to_list())
print('Gene name duplicate: ', gene_name_duplicates)

UniprotID duplicate:  ['Q8NFQ8', 'Q9BTX1', 'Q9Y3T9', 'Q9NVM9', 'O95476', 'Q9UKR5', 'Q9NXE4', 'Q96P16', 'Q3LXA3']
Gene name duplicate:  [nan, nan, nan]


#### Are those duplicates between human and mouse/rat?

In [15]:
# For each duplicated Uniprot entry, print all rows of the combined table that share it
for duplicated_entry in find_duplicate(df.Entry.to_list()):
    matching_rows = df.loc[df.Entry == duplicated_entry]
    print(matching_rows)

    Entry             Gene names
6  Q8NFQ8  TOR1AIP2 IFRG15 LULL1
0  Q8NFQ8               Tor1aip2
    Entry   Gene names
5  Q9BTX1  NDC1 TMEM48
3  Q9BTX1         Ndc1
    Entry Gene names
1  Q9Y3T9  NOC2L NIR
6  Q9Y3T9      Noc2l
     Entry                 Gene names
19  Q9NVM9  INTS13 ASUN C12orf11 GCT1
12  Q9NVM9                     IntS13
     Entry       Gene names
23  O95476  CTDNEP1 DULLARD
17  O95476          Ctdnep1
     Entry                          Gene names
20  Q9UKR5  ERG28 C14orf1 AD-011 HSPC288 x0006
19  Q9UKR5                               Erg28
     Entry           Gene names
7   Q9NXE4  SMPD4 KIAA1418 SKNY
24  Q9NXE4                Smpd4
     Entry    Gene names
24  Q96P16  RPRD1A P15RS
30  Q96P16        Rprd1a
     Entry Gene names
16  Q3LXA3   TKFC DAK
31  Q3LXA3       Tkfc


#### It seems the duplicates occur between the original human proteins and the human-converted mouse/rat proteins
#### It should be ok to remove those duplicates

In [18]:
# Keep only the first row for each Uniprot entry
df_unique = df.drop_duplicates(subset='Entry', keep='first')

#### Which entries have no gene name?

In [19]:
# Rows whose 'Gene names' value is missing (NaN)
df_unique[df_unique['Gene names'].isna()]

Unnamed: 0,Entry,Gene names
12,A0A140VJX5,
14,A0A384NPM3,
17,A0A140VJH7,
22,Q86TW5,


#### They are all non-reviewed, transcription only entries
#### Make them "Unknown" and keep them for now

In [20]:
# Replace every missing value in the table with the placeholder 'Unknown'
df_unique = df_unique.fillna(value='Unknown')

In [22]:
# Report the final number of rows
gene_count = len(df_unique)
print('Number of genes: ', gene_count)

Number of genes:  44


#### Export

In [23]:
# Write the final table without the index column. The output folder is created
# first so the export does not fail when the directory does not exist yet.
output_path = './Output/Schirmer2003/Output.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_unique.to_csv(output_path, index=False)