### **1. Setup**

In [1]:
import os, requests, sys, json
import numpy as np
import pandas as pd
from time import sleep

In [2]:
# Input table: the product of "MemBrain_2nd Part", i.e. the list of
# proteins in the selected organisms together with their subcellular
# localization info from UniProt.
path = './IntermediateProducts/Result_Selected_Organisms_SubCellLoc.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Sanity check: display the (n_rows, n_columns) of the table loaded from `path`.
df.shape
# df.head()  # uncomment to preview the first rows instead

(2992, 7)

### 2. Merge with proteome data

### 2-1. Schirmer 2003

In [4]:
# Schirmer 2003 table, read from the output folder of the sibling
# Nuclear_proteome project (path is relative to this notebook).
df_shirmer2003 = pd.read_csv(
    filepath_or_buffer='../Nuclear_proteome/Output/Schirmer2003/Output.csv'
)

In [5]:
# Sanity check: display the (n_rows, n_columns) of the Schirmer 2003 table.
df_shirmer2003.shape
# df_shirmer2003.head()  # uncomment to preview the first rows instead

(56, 6)

#### Inner merge: only three proteins match

In [6]:
# Inner join: keep only the proteins found in both tables, matching the
# `Uniprot_ID` column of `df` against the `Entry` column of the Schirmer table.
df_merged_inner = pd.merge(
    df,
    df_shirmer2003,
    how='inner',
    left_on='Uniprot_ID',
    right_on='Entry',
)

In [7]:
# The row count here is the number of proteins shared by the two tables.
df_merged_inner.shape

(3, 13)

#### Outer merge to assign sub-cell-loc info from Schirmer data

In [8]:
# Outer join on `Uniprot_ID` (left) == `Entry` (right): every row of both
# tables is kept, and the columns of the side without a match are NaN.
df_merged_outer = pd.merge(
    df,
    df_shirmer2003,
    how='outer',
    left_on='Uniprot_ID',
    right_on='Entry',
)

In [9]:
# Sanity check: display the (n_rows, n_columns) of the outer-merged table.
df_merged_outer.shape

(3045, 13)

In [56]:
# Preview the first rows of the outer merge; rows without a Schirmer match
# show NaN in the columns that come from the Schirmer table.
df_merged_outer.head()

Unnamed: 0,Uniprot_ID,Organism_x,Protein_name,AH_or_Not,AA_sequence,Prediction,Subcellular_location,NCBI ID,Entry,Entry name,Protein names,Gene names,Organism_y
0,Q8N4K4,Homo sapiens (Human),Reprimo-like protein,Non-AH,MNATFLNHSGLEEVDGVGGGAGAALGNRTHGLGTWLGCCPGGAPLA...,0000000000000000000000000000000000000000000000...,Membrane,,,,,,
1,Q8N4S7,Homo sapiens (Human),Progestin and adipoQ receptor family member 4,Non-AH,MAFLAGPRLLDWASSPPHLQFNKFVLTGYRPASSGSGCLRSLFYLH...,0000000000000000000000000000000000000000000000...,Membrane,,,,,,
2,Q8N5G0,Homo sapiens (Human),Small integral membrane protein 20,Non-AH,MSRNLRTALIFGGFISLIGAAFYPIYFRPLMRLEEYKKEQAINRAG...,0000000000000000000000000000000000000000000000...,Mitochondrion inner membrane,,,,,,
3,Q8N614,Homo sapiens (Human),Transmembrane protein 156,AH,MTKTALLKLFVAIVITFILILPEYFKTPKERTLELSCLEVCLQSNF...,0011111100000000000000000000000000000000000000...,Membrane,,,,,,
4,Q8N7C4,Homo sapiens (Human),Transmembrane protein 217,Non-AH,MKQQQWCGMTAKMGTVLSGVFTIMAVDMYLIFEQKHLGNGSCTEIT...,0000000000000000000000000000000000000000000000...,Membrane,,,,,,


In [10]:
# Columns brought in by the Schirmer table that are not needed downstream.
# 'Entry' is kept for now because it marks which rows matched.
schirmer_columns_to_drop = ['NCBI ID', 'Entry name', 'Protein names',
                            'Gene names', 'Organism_y']

# 1) Discard rows whose Uniprot_ID is NaN (rows with no counterpart in `df`).
# 2) Remove the unneeded Schirmer columns.
df_merged_outer = (
    df_merged_outer
    .dropna(subset=['Uniprot_ID'])
    .drop(columns=schirmer_columns_to_drop)
)

# Flag the proteins that also appear in the Schirmer data: wherever 'Entry'
# is present, the new column 'Subcellular_location_Schirmer2003' gets the
# value 'Nucleus membrane'; all other rows stay NaN in that column.
has_schirmer_entry = df_merged_outer['Entry'].notna()
df_merged_outer.loc[has_schirmer_entry, 'Subcellular_location_Schirmer2003'] = 'Nucleus membrane'

In [12]:
# Print the column list with non-null counts; the non-null count of
# 'Subcellular_location_Schirmer2003' is the number of flagged proteins.
df_merged_outer.info()
# df_merged_outer.head()  # uncomment to preview the first rows instead

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2992 entries, 0 to 2991
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Uniprot_ID                         2992 non-null   object
 1   Organism_x                         2992 non-null   object
 2   Protein_name                       2992 non-null   object
 3   AH_or_Not                          2992 non-null   object
 4   AA_sequence                        2992 non-null   object
 5   Prediction                         2992 non-null   object
 6   Subcellular_location               2992 non-null   object
 7   Entry                              3 non-null      object
 8   Subcellular_location_Schirmer2003  3 non-null      object
dtypes: object(9)
memory usage: 233.8+ KB


In [14]:
# 'Entry' was only needed to set the Schirmer flag column, so it is removed here.
df_merged_outer = df_merged_outer.drop(columns=['Entry'])