In [1]:
import pandas as pd

### 1. Clean up MemBrain proteins

In [2]:
# Load the step-3 results table from the intermediate products folder.
membrain_results_path = './IntermediateProducts/Results_step_3.csv'
df_MB = pd.read_csv(membrain_results_path)

In [4]:
# Keep only rows whose Entry_Hs is neither of the two "not found" placeholders.
unresolved_labels = ["Not_found", "Not_found(similar)"]
df_MB = df_MB.loc[~df_MB['Entry_Hs'].isin(unresolved_labels)]

In [6]:
# Drop the SubCell_Uniprot column.
# errors='ignore' keeps this cell re-runnable: df_MB is reassigned here, so a
# second run would otherwise raise KeyError on the already-removed column.
df_MB = df_MB.drop(columns=['SubCell_Uniprot'], errors='ignore')

In [16]:
# Remove duplicate Entry_Hs rows, preferring rows whose organism is human.
# Rank 'Homo sapiens' rows 1 and every other organism 2, then sort by that rank
# so human rows come first; drop_duplicates keeps the first occurrence, so a row
# with a human ID in Entry_original is retained and a row whose non-human ID was
# converted to the same Hs entry is removed.
# The sort must be stable (mergesort): the default quicksort does not guarantee
# that rows with equal rank keep their input order, which would make the
# surviving row among same-rank duplicates arbitrary.
df_MB['Human?'] = df_MB['Organism'].eq('Homo sapiens').map({True: 1, False: 2})
df_MB_sorted = df_MB.sort_values(by='Human?', kind='mergesort')
df_MB_cleaned = df_MB_sorted.drop_duplicates(subset='Entry_Hs')
df_MB_cleaned.info()

In [30]:
# Count distinct Entry_Hs values (missing values, if any, count as one value).
n_unique_entries = df_MB['Entry_Hs'].nunique(dropna=False)
print("Total number of human proteins in MemBrain: ", n_unique_entries)

Total number of human proteins in MemBrain:  1865


### 2. Merge MemBrain and NE proteins

In [48]:
# Load the final NE protein table from the source data folder.
ne_proteins_path = './SourceData/Final_NE_proteins.csv'
df_NE = pd.read_csv(ne_proteins_path)

In [49]:
# Print column names, non-null counts and dtypes as a quick schema check of df_NE.
df_NE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281 entries, 0 to 280
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   UniprotID                 281 non-null    object 
 1   GeneName                  281 non-null    object 
 2   Schirmer_2003             281 non-null    float64
 3   Korfali_2010              281 non-null    float64
 4   Wilkie_2010               281 non-null    float64
 5   Korfali_2012              281 non-null    float64
 6   Korfali_2012_NE:MM-ratio  281 non-null    float64
 7   Cheng_2019                281 non-null    float64
 8   Cheng_2019_Score:Undiff   281 non-null    float64
 9   Cheng_2019_Score:Adipo    281 non-null    float64
 10  Cheng_2019_Score:Myo      281 non-null    float64
 11  #ProteomePapers           281 non-null    float64
 12  HPA                       281 non-null    int64  
 13  HPA_reliability           281 non-null    object 
 14  HPA_loc   

In [60]:
# Inner join: keep only proteins present in both tables, matching the
# de-duplicated MemBrain Entry_Hs against the NE table's UniprotID.
df_merged = pd.merge(
    df_MB_cleaned,
    df_NE,
    how='inner',
    left_on='Entry_Hs',
    right_on='UniprotID',
)

In [61]:
# Restrict the merged table to the columns of interest, in this order.
merged_columns = [
    'Entry_original',
    'Organism',
    'Entry_Hs',
    'Gene_name',
    'Protein_name',
    'AH_or_Not',
    'AA_sequence',
    'Prediction',
    'Schirmer_2003',
    'Korfali_2010',
    'Wilkie_2010',
    'Korfali_2012',
    'Korfali_2012_NE:MM-ratio',
    'Cheng_2019',
    'Cheng_2019_Score:Undiff',
    'Cheng_2019_Score:Adipo',
    'Cheng_2019_Score:Myo',
    '#ProteomePapers',
    'HPA',
    'HPA_reliability',
    'HPA_loc',
    'Uniprot_NEorER',
    'Uniprot_subcell',
    'Results',
]
df_merged = df_merged.loc[:, merged_columns]

In [58]:
# Report how many rows the merged table contains.
n_merged_rows = df_merged.shape[0]
print("NE proteins with MemBrain result: ", n_merged_rows)

NE proteins with MemBrain result:  120


In [63]:
# Subset of merged proteins whose AH_or_Not label is 'AH'.
is_ah = df_merged['AH_or_Not'].eq('AH')
df_NE_AH = df_merged.loc[is_ah]

print("NE proteins with predicted AH: ", df_NE_AH.shape[0])
print("Breakdown: ")

# Tally the subset by its #ProteomePapers value, for 1 through 5.
ah_paper_counts = df_NE_AH['#ProteomePapers']
for n_papers in range(1, 6):
    n_proteins = int(ah_paper_counts.eq(n_papers).sum())
    print(f"  {n_proteins} proteins are supoorted by {n_papers} proteome papers")

NE proteins with predicted AH:  87
Breakdown: 
  48 proteins are supoorted by 1 proteome papers
  10 proteins are supoorted by 2 proteome papers
  12 proteins are supoorted by 3 proteome papers
  15 proteins are supoorted by 4 proteome papers
  2 proteins are supoorted by 5 proteome papers


In [64]:
# Subset of merged proteins whose AH_or_Not label is 'Non-AH'.
is_non_ah = df_merged['AH_or_Not'].eq('Non-AH')
df_NE_NonAH = df_merged.loc[is_non_ah]

print("NE proteins without predicted AH: ", df_NE_NonAH.shape[0])
print("Breakdown: ")

# Tally the subset by its #ProteomePapers value, for 1 through 5.
non_ah_paper_counts = df_NE_NonAH['#ProteomePapers']
for n_papers in range(1, 6):
    n_proteins = int(non_ah_paper_counts.eq(n_papers).sum())
    print(f"  {n_proteins} proteins are supoorted by {n_papers} proteome papers")

NE proteins without predicted AH:  33
Breakdown: 
  21 proteins are supoorted by 1 proteome papers
  6 proteins are supoorted by 2 proteome papers
  3 proteins are supoorted by 3 proteome papers
  2 proteins are supoorted by 4 proteome papers
  1 proteins are supoorted by 5 proteome papers
