In [1]:
import pandas as pd
from my_utils import find_duplicate

# 1. Datasets import

In [22]:
# Load the five input tables (one CSV per source paper) from ./Output.
# The files are read in the order listed, and each result is bound to the
# name in the same position of the target tuple.
(
    df_schirmer2003,
    df_korfali2010,
    df_wilkie2010,
    df_korfali2012,
    df_cheng2019,
) = map(
    pd.read_csv,
    (
        './Output/Schirmer2003/Output.csv',
        './Output/Korfali_2010.csv',
        './Output/Wilkie_2010.csv',
        './Output/Korfali2012/Korfali2012_Hs.csv',
        './Output/Cheng2019.csv',
    ),
)

# 2. Merge datasets

## 2-1. Schirmer 2003 + Korfali 2010

In [23]:
# Outer-join the two tables on their entry columns so that rows present in
# only one of them are kept (with NaN on the other side).
df = pd.merge(
    df_schirmer2003,
    df_korfali2010,
    how='outer',
    left_on='Entry',
    right_on='Entry_Korfali_2010',
)

In [24]:
# Coalesce each pair of per-paper columns into one merged column: take the
# first-listed column and fall back to the second where the first is missing.
for merged_col, primary_col, fallback_col in (
    ('UniprotID', 'Entry', 'Entry_Korfali_2010'),
    ('GeneName', 'Gene names', 'Gene_name_obtained'),
):
    df[merged_col] = df[primary_col].fillna(df[fallback_col])

# Display the resulting column labels.
df.columns

Index(['Entry', 'Gene names', 'Entry_Korfali_2010', 'Gene_name_obtained',
       'UniprotID', 'GeneName'],
      dtype='object')

In [25]:
# Keep the merged identifier columns plus the two per-paper key columns.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of the merged frame (avoids SettingWithCopyWarning).
df = df[['UniprotID', 'GeneName', 'Entry', 'Entry_Korfali_2010']].copy()

# Make a binary mark for presence/absence of record in each source paper:
# 1 where the paper's key column is populated for the row, 0 where it is missing.
for source_key in ['Entry', 'Entry_Korfali_2010']:
    df[source_key] = df[source_key].notna().astype(pd.Int8Dtype())

# A missing UniprotID keeps the 0 placeholder it had before. GeneName is
# deliberately NOT zero-filled: a 0 placeholder is not NaN, so it would stop the
# `GeneName.fillna(...)` calls in the later merge steps from backfilling a name
# that a later paper provides.
df['UniprotID'] = df['UniprotID'].fillna(0)

df = df.rename(columns={'Entry':'Schirmer_2003', 'Entry_Korfali_2010':'Korfali_2010'})

## 2-2. Add Wilkie 2010

In [26]:
# Outer-join the running table with the Wilkie 2010 table on the ID columns,
# keeping rows that appear on only one side.
df = pd.merge(
    df,
    df_wilkie2010,
    how='outer',
    left_on='UniprotID',
    right_on='Entry_Wilkie',
)

In [27]:
# Rows that exist only in the Wilkie table have no UniprotID / GeneName yet:
# take them from the Wilkie columns.
df['UniprotID'] = df['UniprotID'].fillna(df['Entry_Wilkie'])
gene_names = df['GeneName'].fillna(df['Gene_name_obtained'])

# Make a binary mark for presence/absence of record in the source paper:
# 1 where the Wilkie key is populated for the row, 0 where it is missing.
df['Entry_Wilkie'] = df['Entry_Wilkie'].notna().astype(pd.Int8Dtype())

# Zero-fill every remaining gap (e.g. the earlier papers' flags on rows that
# are new in this merge), then put GeneName back with its NaNs intact.
# A 0 placeholder in GeneName is not NaN, so it would prevent the
# `GeneName.fillna(...)` calls in the later merge steps from backfilling it.
df = df.fillna(0)
df['GeneName'] = gene_names

In [28]:
# Give the Wilkie presence flag its paper-style name and discard the helper
# gene-name column that came in with the Wilkie table.
df = (
    df.rename(columns={'Entry_Wilkie': 'Wilkie_2010'})
    .drop(columns='Gene_name_obtained')
)

## 2-3. Add Korfali 2012

In [29]:
# Outer-join the running table with the Korfali 2012 table on the ID columns,
# keeping rows that appear on only one side.
df = pd.merge(
    df,
    df_korfali2012,
    how='outer',
    left_on='UniprotID',
    right_on='Uniprot_id',
)

In [30]:
# Rows that exist only in the Korfali 2012 table have no UniprotID / GeneName
# yet: take them from the Korfali 2012 columns.
df['UniprotID'] = df['UniprotID'].fillna(df['Uniprot_id'])
gene_names = df['GeneName'].fillna(df['gene name'])

# Make a binary mark for presence/absence of record in the source paper:
# 1 where the Korfali 2012 key is populated for the row, 0 where it is missing.
df['Uniprot_id'] = df['Uniprot_id'].notna().astype(pd.Int8Dtype())

# Zero-fill every remaining gap (earlier papers' flags on new rows, and the
# Korfali 2012 data columns on rows absent from that paper), then put GeneName
# back with its NaNs intact. A 0 placeholder in GeneName is not NaN, so it
# would prevent the `GeneName.fillna(...)` call in the next merge step from
# backfilling it.
df = df.fillna(0)
df['GeneName'] = gene_names

In [31]:
# Display the current column labels of the merged frame.
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Unnamed: 0', 'tissue', 'gene name', 'alternate names',
       'accession numbers', 'NE:MM ratio by dNSAF', 'reference', 'Uniprot_id'],
      dtype='object')

In [32]:
# Keep the running columns plus the Korfali 2012 flag and NE:MM ratio, rename
# those two columns with a paper prefix, and store the ratio as float64.
kept_columns = [
    'UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
    'Uniprot_id', 'NE:MM ratio by dNSAF',
]
df = (
    df.loc[:, kept_columns]
    .rename(columns={'Uniprot_id':'Korfali_2012', 'NE:MM ratio by dNSAF':'Korfali_2012_NE:MM-ratio'})
    .astype({'Korfali_2012_NE:MM-ratio': 'float64'})
)

## 2-4. Add Cheng 2019

In [33]:
# Outer-join the running table with the Cheng 2019 table on the ID columns,
# keeping rows that appear on only one side.
df = pd.merge(
    df,
    df_cheng2019,
    how='outer',
    left_on='UniprotID',
    right_on='entry_h',
)

In [34]:
# Rows that exist only in the Cheng 2019 table have no UniprotID / GeneName
# yet: take them from the Cheng 2019 columns.
df['UniprotID'] = df['UniprotID'].fillna(df['entry_h'])
df['GeneName'] = df['GeneName'].fillna(df['gene_name'])

# Make a binary mark for presence/absence of record in the source paper.
# Every remaining gap becomes 0 first, so a 0 in `entry_h` means the row had no
# Cheng 2019 entry; anything else is marked 1.
df = df.fillna(0)
df['entry_h'] = df['entry_h'].ne(0).astype(pd.Int8Dtype())

In [35]:
# Display the current column labels of the merged frame.
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'Korfali_2012_NE:MM-ratio', 'gene_name', 'entry_h',
       'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M'],
      dtype='object')

In [36]:
# Discard the Cheng 2019 gene-name helper column and give the Cheng 2019 flag
# and the three enrichment-score columns paper-prefixed names.
df = (
    df.drop(columns='gene_name')
    .rename(columns={
        'entry_h': 'Cheng_2019',
        'NE Enrich Score in U': 'Cheng_2019_Score:Undiff',
        'NE Enrich Score in A': 'Cheng_2019_Score:Adipo',
        'NE Enrich Score in M': 'Cheng_2019_Score:Myo',
    })
)

## 2-5. Scoring proteins by the number of papers

In [37]:
# Number of papers reporting each protein: the sum of the five 0/1 presence flags.
df['#ProteomePapers'] = (
    df['Schirmer_2003']
    .add(df['Korfali_2010'])
    .add(df['Wilkie_2010'])
    .add(df['Korfali_2012'])
    .add(df['Cheng_2019'])
)

In [38]:
# Tally how many proteins are reported by each number of papers.
papers_per_protein = df['#ProteomePapers']
counts = papers_per_protein.value_counts()
print(counts)

1    291
2     52
3     35
4     26
5      6
Name: #ProteomePapers, dtype: Int64


In [39]:
# Break down the proteins reported by exactly one paper by which paper that was.
is_single = df['#ProteomePapers'].eq(1)
for papername in ('Schirmer_2003', 'Korfali_2010', 'Wilkie_2010', 'Korfali_2012', 'Cheng_2019'):
    # Despite its name, this is the number of single-paper proteins for `papername`.
    numberOfPapers = len(df.loc[is_single & df[papername].eq(1)])
    print(f'{papername}: {numberOfPapers}')

Schirmer_2003: 11
Korfali_2010: 46
Wilkie_2010: 14
Korfali_2012: 24
Cheng_2019: 196


In [40]:
# Report the number of rows in the merged table.
print(f'The total number of nuclear membrane proteins from the 5 proteome papers: {len(df)}')

The total number of nuclear membrane proteins from the 5 proteome papers: 410


In [42]:
# Write the merged table to disk without the row index.
df.to_csv(path_or_buf='./Output/ProteomicsPapersMerged.csv', index=False)