In [1]:
# proteinGroups.txt 전처리 구문
# v0: proteinGroups.txt 로드 후, contam, reverse, only ident 필터링
# v1.0: 선택 column 제외 필터링
# v1.1: 선택 column제외 필터링 + ';'으로 구분된 데이터 요소 split

# 패키지 불러오기
import os
import pandas as pd
from time import localtime, strftime

# output 파일 위한 naming 함수: 경로+버전+현재시간
def op_address(vers):
    """Build the relative output path for one version of the processed table.

    Args:
        vers: Version label embedded in the file name (inserted via str()).

    Returns:
        A string of the form
        '/outputs/proteinGroups_v<vers>-<YYYYmmdd_HHMMSS>.txt', stamped with
        the current local time. The path starts with '/', so callers prefix
        it with '.' to make it relative to the working directory.

    Note:
        Every call reads the clock again, so two calls may return different
        paths; call once and reuse the result when the same path is needed
        more than once.
    """
    timestamp = strftime('%Y%m%d_%H%M%S', localtime())
    return '/outputs/proteinGroups_v' + str(vers) + '-' + timestamp + '.txt'


# Load proteinGroups.txt as a tab-separated table.
# The file must be located at ./RawData/TextFiles/proteinGroups.txt.
df = pd.read_csv('./RawData/TextFiles/proteinGroups.txt', sep='\t')



# version 0
# Remove every row marked '+' in any of the three flag columns, then drop the
# flag columns themselves.
flag_columns = ['Potential contaminant', 'Reverse', 'Only identified by site']
flagged_rows = (df[flag_columns] == '+').any(axis=1)
df.drop(index=df.index[flagged_rows], inplace=True)
df.drop(columns=['Only identified by site', 'Reverse', 'Potential contaminant'], inplace=True)
print('message! >>> Dropped 3 columns: Only identified by site, Reverse, Potential contaminant')

# df.copy() defaults to deep=True: the snapshot is an independent DataFrame
# that is not affected by the later in-place edits made to df.
delContam = df.copy()

# Build the output path once so the file written and the path reported below
# carry the same timestamp (op_address reads the clock on every call).
v0_path = '.' + op_address('0')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(v0_path), exist_ok=True)
delContam.to_csv(path_or_buf=v0_path, sep='\t', index=False, encoding='utf-8')
print('message! >>> ' + v0_path + ' added.')



# Keep only the column labels listed in `rest`; drop every other column.
column_names = list(df.columns)
rest = {
    'Protein IDs',
    'Peptide counts (razor+unique)',
    'Protein names',
    'Gene names',
    'Number of proteins',
    'Razor + unique peptides',
    'Unique sequence coverage [%]',
    'Sequence lengths',
    'Score',
    'Intensity',
    'MS/MS count',
    'id',
    'Best MS/MS',
}
# Labels present in the table but absent from the keep-set, in table order.
filt = list(filter(lambda label: label not in rest, column_names))
df.drop(labels=filt, axis=1, inplace=True)

print('message! >>> Dropped ' + str(len(filt)))

# version 1.0 (optional output, currently disabled)
# df.to_csv(path_or_buf='.'+op_address('1.0'), sep='\t', index=False, encoding='utf-8')
# print('message! >>> '+'.'+op_address('1.0')+' added.')



# version 1.1
# For ';'-separated cells, keep only the first element.
def _first_field(value, sep=';'):
    """Return the text before the first `sep`; pass non-strings through.

    Non-string cells (e.g. NaN for a missing value) have no .split method,
    so they are returned unchanged instead of raising.
    """
    return value.split(sep)[0] if isinstance(value, str) else value


# The cleaned column is assigned back to df explicitly. Mutating a separate
# Series in place only reaches df when the two happen to share memory, which
# is not the case under pandas Copy-on-Write. A single map() pass also avoids
# rescanning the whole column once per element.
# (1) Protein IDs
prot = df["Protein IDs"].map(_first_field)
df["Protein IDs"] = prot

# (2) Best MS/MS
bmsms = df["Best MS/MS"].map(_first_field)
df["Best MS/MS"] = bmsms

# Renumber the index from 0 after the earlier row drops.
df.reset_index(drop=True, inplace=True)

# Build the output path once so the file written and the path reported below
# carry the same timestamp (op_address reads the clock on every call).
v11_path = '.' + op_address('1.1')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(v11_path), exist_ok=True)
df.to_csv(path_or_buf=v11_path, sep='\t', index=False, encoding='utf-8')
print('message! >>> ' + v11_path + ' added.')

df.info()

message! >>> Dropped 3 columns: Only identified by site, Reverse, Potential contaminant
message! >>> ./outputs/proteinGroups_v0-20220622_123907.txt added.
message! >>> Dropped 18
message! >>> ./outputs/proteinGroups_v1.1-20220622_123908.txt added.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2790 entries, 0 to 2789
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Protein IDs                    2790 non-null   object 
 1   Peptide counts (razor+unique)  2790 non-null   object 
 2   Protein names                  2783 non-null   object 
 3   Gene names                     2786 non-null   object 
 4   Number of proteins             2790 non-null   int64  
 5   Razor + unique peptides        2790 non-null   int64  
 6   Unique sequence coverage [%]   2790 non-null   float64
 7   Sequence lengths               2790 non-null   object 
 8   Score                          2790 non-