In [1]:
import pandas as pd
from my_utils import find_AH

### 1. Clean up MemBrain proteins

In [2]:
# Load the MemBrain results written by the previous pipeline step.
df_MB = pd.read_csv('./IntermediateProducts/Results_step_3.csv')

# Drop rows whose human entry could not be resolved
# ("Not_found" or "Not_found(similar)").
unresolved_labels = ["Not_found", "Not_found(similar)"]
df_MB = df_MB[~df_MB['Entry_Hs'].isin(unresolved_labels)]

df_MB = df_MB.drop(columns=['SubCell_Uniprot'])

# Remove duplicates on the human entry.
# Rows whose original organism is human get rank 1, all others rank 2, so that
# after sorting the human-origin row comes first and is the one retained by
# drop_duplicates (which keeps the first occurrence). A row is removed if it had
# a non-human ID originally that was converted to the same Hs entry.
df_MB['Human?'] = df_MB['Organism'].apply(lambda x: 1 if x == 'Homo sapiens' else 2)

# The sort must be stable: the default 'quicksort' gives no guarantee about the
# relative order of rows with the same rank, which would make the retained
# duplicate depend on the sort implementation. 'mergesort' keeps file order
# within each rank, so the result is reproducible.
df_MB_sorted = df_MB.sort_values(by='Human?', kind='mergesort')
df_MB_cleaned = df_MB_sorted.drop_duplicates(subset='Entry_Hs')

# One row per Entry_Hs remains, so the row count is the number of unique entries.
print("Total number of human proteins in MemBrain: ", len(df_MB_cleaned))

Total number of human proteins in MemBrain:  1865


In [3]:
# Write the de-duplicated MemBrain table to disk without the index column.
df_MB_cleaned.to_csv(
    path_or_buf='./FinalOutput/MemBrain_Hs_cleaned.csv',
    index=False,
)

### 2. Merge MemBrain and NE proteins

In [24]:
# NE protein list copied from the Nuclear Proteome repository
df_NE = pd.read_csv('./SourceData/Final_NE_proteins.csv')

# Inner join: keep only proteins present in both tables, matching the MemBrain
# human entry against the NE table's UniprotID column.
df_merged = pd.merge(
    df_MB_cleaned,
    df_NE,
    how='inner',
    left_on='Entry_Hs',
    right_on='UniprotID',
)

# Columns carried forward, in the order they should appear.
merged_columns = [
    'Entry_original', 'Organism', 'Entry_Hs', 'Gene_name', 'Protein_name',
    'AH_or_Not', 'AA_sequence', 'Prediction',
    'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
    'Korfali_2012', 'Korfali_2012_NE:MM-ratio',
    'Cheng_2019', 'Cheng_2019_Score:Undiff', 'Cheng_2019_Score:Adipo',
    'Cheng_2019_Score:Myo',
    '#ProteomePapers',
    'HPA', 'HPA_reliability', 'HPA_loc',
    'Uniprot_NEorER', 'Uniprot_subcell', 'Results',
]
df_merged = df_merged[merged_columns]

print("NE proteins with MemBrain result: ", df_merged.shape[0])

NE proteins with MemBrain result:  120


In [29]:
# Subset of merged proteins labelled 'AH'. Copied so that columns can be
# added to it later without touching df_merged.
has_ah = df_merged['AH_or_Not'] == 'AH'
df_NE_AH = df_merged.loc[has_ah].copy()

print("NE proteins with predicted AH: ", df_NE_AH.shape[0])
print("Breakdown: ")

# Report how many of these proteins have each '#ProteomePapers' value from 1 to 5.
ah_paper_counts = df_NE_AH['#ProteomePapers']
for n_papers in range(1, 6):
    n_proteins = (ah_paper_counts == n_papers).sum()
    print(f"  {n_proteins} proteins are supoorted by {n_papers} proteome papers")

NE proteins with predicted AH:  87
Breakdown: 
  48 proteins are supoorted by 1 proteome papers
  10 proteins are supoorted by 2 proteome papers
  12 proteins are supoorted by 3 proteome papers
  15 proteins are supoorted by 4 proteome papers
  2 proteins are supoorted by 5 proteome papers


In [26]:
# Subset of merged proteins labelled 'Non-AH'.
is_non_ah = df_merged['AH_or_Not'].eq('Non-AH')
df_NE_NonAH = df_merged[is_non_ah]

print("NE proteins without predicted AH: ", df_NE_NonAH.shape[0])
print("Breakdown: ")

# Report how many of these proteins have each '#ProteomePapers' value from 1 to 5.
for paper_count in (1, 2, 3, 4, 5):
    supported = df_NE_NonAH.loc[df_NE_NonAH['#ProteomePapers'].eq(paper_count)]
    print(f"  {len(supported)} proteins are supoorted by {paper_count} proteome papers")

NE proteins without predicted AH:  33
Breakdown: 
  21 proteins are supoorted by 1 proteome papers
  6 proteins are supoorted by 2 proteome papers
  3 proteins are supoorted by 3 proteome papers
  2 proteins are supoorted by 4 proteome papers
  1 proteins are supoorted by 5 proteome papers


In [41]:
# Write the Non-AH NE proteins to disk without the index column.
df_NE_NonAH.to_csv(
    path_or_buf='./FinalOutput/Non-AH_NEproteins.csv',
    index=False,
)

### 3. AH analysis and export

In [62]:
def _predicted_ahs_for_row(row):
    """Call find_AH with one row's 'AA_sequence' and 'Prediction' values."""
    return find_AH(row['AA_sequence'], row['Prediction'])


# Per the original note, find_AH is expected to yield a dict of AH residues and
# their location, like {'RMIAQKISVR': '289-298'} (find_AH lives in my_utils and
# is not visible here - confirm against its implementation).
df_NE_AH['Predicted_AHs'] = df_NE_AH.apply(_predicted_ahs_for_row, axis=1)

# Number of predicted AHs per protein = size of each find_AH result.
df_NE_AH['#ofAHs'] = df_NE_AH['Predicted_AHs'].map(len)

# How many proteins have 1, 2 or 3 predicted AHs?
ah_counts = df_NE_AH['#ofAHs']
for n_ah in (1, 2, 3):
    n_proteins = (ah_counts == n_ah).sum()
    print(f"{n_proteins} proteins are predicted to have {n_ah} AH")

63 proteins are predicted to have 1 AH
24 proteins are predicted to have 2 AH
0 proteins are predicted to have 3 AH


In [64]:
# Final column order for the AH export. 'AH_or_Not' is left out here and the two
# AH columns ('Predicted_AHs', '#ofAHs') are placed right after 'Prediction'.
ah_export_columns = [
    'Entry_original', 'Organism', 'Entry_Hs', 'Gene_name', 'Protein_name',
    'AA_sequence', 'Prediction', 'Predicted_AHs', '#ofAHs',
    'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
    'Korfali_2012', 'Korfali_2012_NE:MM-ratio',
    'Cheng_2019', 'Cheng_2019_Score:Undiff', 'Cheng_2019_Score:Adipo',
    'Cheng_2019_Score:Myo',
    '#ProteomePapers',
    'HPA', 'HPA_reliability', 'HPA_loc',
    'Uniprot_NEorER', 'Uniprot_subcell', 'Results',
]

# New header names for two of the exported columns.
ah_column_renames = {
    'Organism': 'Organism_original',
    'Results': 'Uniprot_raw_response',
}

# NOTE: this reassigns df_NE_AH, so re-running the cell without re-running the
# cells above fails with a KeyError ('Organism' and 'Results' no longer exist).
df_NE_AH = df_NE_AH[ah_export_columns].rename(columns=ah_column_renames)

In [65]:
# Write the AH NE proteins to disk without the index column.
df_NE_AH.to_csv(
    path_or_buf='./FinalOutput/AH_NEproteins.csv',
    index=False,
)