In [2]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import re

# Preprocessing mutations.tsv

In [3]:
# Load the raw mutation table as tab-separated text.
# keep_default_na=False keeps empty fields as '' instead of converting them to NaN.
# NOTE(review): lineterminator="\n" is presumably there to stop stray '\r'
# characters inside fields from splitting rows -- confirm against the raw file.
fileName = r'data/raw/mutations.tsv'
df = pd.read_csv(fileName, sep='\t', keep_default_na=False, lineterminator="\n")
print(df.shape)
df.head()

(77830, 15)


Unnamed: 0,#Feature AC,Feature short label,Feature range(s),Original sequence,Resulting sequence,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,Affected protein organism,Interaction participants,PubMedID,Figure legend,Interaction AC
0,EBI-494382,P42573:p.Cys358Ala,358-358,C,A,mutation(MI:0118),,uniprotkb:P42573,ced-3,,6239 - Caenorhabditis elegans,"uniprotkb:P42573(protein(MI:0326), 6239 - Caen...",9651369,,EBI-494380
1,EBI-516912,Q24306:p.Gly88Ser,88-88,G,S,mutation(MI:0118),,uniprotkb:Q24306,Diap1,,7227 - Drosophila melanogaster (Fruit fly),"uniprotkb:Q24306(protein(MI:0326), 7227 - Dros...",10675328,,EBI-516905
2,EBI-526200,Q99558:p.[Lys429Ala;Lys430Ala],429-429,K,A,mutation(MI:0118),,uniprotkb:Q99558,MAP3K14,,9606 - Homo sapiens,"uniprotkb:Q99558(protein(MI:0326), 9606 - Homo...",10094049,,EBI-526182
3,EBI-526200,Q99558:p.[Lys429Ala;Lys430Ala],430-430,K,A,mutation(MI:0118),,uniprotkb:Q99558,MAP3K14,,9606 - Homo sapiens,"uniprotkb:Q99558(protein(MI:0326), 9606 - Homo...",10094049,,EBI-526182
4,EBI-537514,P07591:p.Cys107Ser,107-107,C,S,mutation(MI:0118),,uniprotkb:P07591,trxm_spiol,,3562 - Spinacia oleracea (Spinach),"uniprotkb:P07591(protein(MI:0326), 3562 - Spin...",11553771,,EBI-537510


In [4]:
# Drop every row whose 'Feature annotation' text contains 'high-throughput'.
df = df[~(df['Feature annotation'].str.contains('high-throughput'))]
df.shape

(66802, 15)

## drop entries with more than 2 participants, and entries whose participant count does not match the number of extracted UniProt accessions (keeps binary protein-protein interactions only)

In [5]:
# Extract the UniProt accessions of each interaction's participants, then keep
# only rows with at most two participants that were all resolved to accessions.

# An accession is the (non-greedy) text between 'uniprotkb:' and the next '('.
p = re.compile(r'uniprotkb:(.*?)[(]', re.S)

participants = df['Interaction participants']
partner = [p.findall(entry) for entry in participants]
# The participant count is taken as the number of '|' separators plus one.
n_partner = [entry.count('|') + 1 for entry in participants]

# assign() returns a new frame, so the columns are not written onto a filtered
# slice of the original table (which triggers SettingWithCopyWarning).
df = df.assign(partners=partner, n_partner=n_partner)

df = df[df['n_partner'] < 3]
print('after delete items with more than 2 partners {}'.format(df.shape))

# Rows where fewer/more accessions were extracted than participants counted
# contain at least one participant without a 'uniprotkb:' accession.
df = df[df['partners'].map(len) == df['n_partner']]
print('after delete items with not identical number of partners and n_partner {}'.format(df.shape))

after delete items with more than 2 partners (61232, 17)
after delete items with not identical number of partners and n_partner (57954, 17)


## drop entries with same interactionAC but different affected protein AC (drop same interaction with multiple mutations)

In [6]:
# df1: rows whose (Affected protein AC, Interaction AC) pair occurs more than once.
# df2: rows whose Interaction AC occurs exactly once in the table.
# The two selections cannot overlap, so the concat does not duplicate rows.
# NOTE(review): an interaction that has protein A on several rows and protein B
# on a single row keeps its A rows (via df1) and loses only the B row -- confirm
# that this is the intended handling of mixed interactions.
df1 = df[df.duplicated(['Affected protein AC', 'Interaction AC'], keep=False)] # rows sharing the same affected-protein / interaction pair
df2 = df.drop_duplicates(['Interaction AC'], keep=False) # rows whose interaction AC appears only once
df = pd.concat([df1, df2])
print(df.shape)

(57185, 17)


## drop entries without uniprotAC

In [7]:
# Keep only rows whose affected protein carries a 'uniprotkb:' accession.
df = df[df['Affected protein AC'].str.contains('uniprotkb:', na=False)]
print(df.shape)

(57169, 17)


## simplify uniprotkb label

In [8]:
# Remove the 'uniprotkb:' database prefix so the column holds the bare accession.
df['Affected protein AC'] = df['Affected protein AC'].str.replace('uniprotkb:', '')
df.head()

Unnamed: 0,#Feature AC,Feature short label,Feature range(s),Original sequence,Resulting sequence,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,Affected protein organism,Interaction participants,PubMedID,Figure legend,Interaction AC,partners,n_partner
2,EBI-526200,Q99558:p.[Lys429Ala;Lys430Ala],429-429,K,A,mutation(MI:0118),,Q99558,MAP3K14,,9606 - Homo sapiens,"uniprotkb:Q99558(protein(MI:0326), 9606 - Homo...",10094049,,EBI-526182,"[Q99558, O43318-3]",2
3,EBI-526200,Q99558:p.[Lys429Ala;Lys430Ala],430-430,K,A,mutation(MI:0118),,Q99558,MAP3K14,,9606 - Homo sapiens,"uniprotkb:Q99558(protein(MI:0326), 9606 - Homo...",10094049,,EBI-526182,"[Q99558, O43318-3]",2
8,EBI-990886,P00441:p.Gly94Ala,94-94,G,A,mutation(MI:0118),,P00441,SOD1,,9606 - Homo sapiens,"uniprotkb:P16014(protein(MI:0326), 10090 - Mus...",16369483,Figure 1a,EBI-990883,"[P16014, P00441]",2
9,EBI-990898,P00441:p.Ala5Val,5-5,A,V,mutation(MI:0118),,P00441,SOD1,,9606 - Homo sapiens,"uniprotkb:P16014(protein(MI:0326), 10090 - Mus...",16369483,Figure 1a,EBI-990883,"[P16014, P00441]",2
10,EBI-984604,P20536:p.Asp68Asn,68-68,D,N,mutation(MI:0118),,P20536,OPG116,,10249 - Vaccinia virus (strain Copenhagen) (VACV),"uniprotkb:P20536(protein(MI:0326), 10249 - Vac...",16326701,Figure 3,EBI-984592,"[P20536, P20995]",2


## delete 'mutation' feature type

In [9]:
# Drop rows typed exactly as the generic 'mutation(MI:0118)'. isin() compares
# whole strings, so more specific types (e.g. 'mutation disrupting(MI:0573)') stay.
df = df[~df['Feature type'].isin(['mutation(MI:0118)'])]
print(df.shape)

(52194, 17)


## delete entries containing non-standard amino acids (all rows sharing the same Feature AC)

In [10]:
# Collect the Feature ACs that have at least one row whose resulting sequence
# contains B, J, O or Z, then drop every row that shares one of those Feature ACs.
f_ = df[df['Resulting sequence'].str.contains('B|J|O|Z', na=False)]['#Feature AC'].tolist()
df = df[~df['#Feature AC'].isin(f_)]
print(df.shape)
print(f_)

(52193, 17)
['EBI-8291032']


## delete 'PRO_' uniprotAC in table

In [11]:
# Drop rows whose affected protein accession contains 'PRO_'.
df = df[~df['Affected protein AC'].str.contains('PRO_')]
df.shape

(51536, 17)

## get all sequence from uniprot (prepare for uniprot fasta retrieve https://www.uniprot.org/uploadlists/)

In [12]:
def flatlist(acList):
    """Flatten one nesting level: concatenate the items of every sub-iterable.

    Args:
        acList: iterable of iterables (e.g. a list of accession lists).

    Returns:
        A new list holding the items of each sub-iterable, in order.
    """
    flattened = []
    for sublist in acList:
        flattened.extend(sublist)
    return flattened

# Union of every accession that appears as an interaction partner or as the
# affected protein; this is the ID list submitted to the UniProt mapping service.
ac1 = set(flatlist(df['partners'].values.tolist()))
ac2 = set(df['Affected protein AC'].values.tolist())
acAll = ac1 | ac2

# Write the IDs in sorted order: iterating a set of strings is not stable across
# interpreter runs (string hash randomization), so the unsorted file changed
# from run to run even for identical input.
with open('acAll.txt', 'w') as f:
    f.writelines(x + '\n' for x in sorted(acAll))


In [13]:
# Discard every accession that contains 'PRO_'; acAll becomes a list here.
p = re.compile('PRO_')
acAll = [x for x in acAll if p.search(x) is None]
len(acAll)

9358

In [14]:
# Parse the UniProt FASTA export into three parallel lists (accession, header
# line, sequence) and assemble them into fastaTable.
ac = []
info = []
seq = []
fastaFile = 'data/raw/idmapping_2024_02_03.fasta' # from uniprot website mapping, download with canonical and isoform

# Sequence lines of the record currently being read; None until the first
# header is seen, so stray text before it is not attached to any record.
seq_chunks = None
with open(fastaFile, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # A FASTA header starts with '>'; testing the first character only
        # avoids misreading a body line that happens to contain '>'.
        if line.startswith('>'):
            if seq_chunks is not None:
                seq.append(''.join(seq_chunks))
            # The accession is the field between the first two '|'.
            # [^|]+ stops at the next pipe, so additional '|' later in the
            # description can no longer be swallowed into the accession.
            header_match = re.search(r'\|([^|]+)\|', line)
            if header_match is None:
                raise ValueError('FASTA header without |accession| field: ' + line)
            ac.append(header_match.group(1))
            info.append(line)
            seq_chunks = []
        elif seq_chunks is not None:
            seq_chunks.append(line)

# Flush the last record (nothing to flush when the file had no header at all,
# which keeps the three lists the same length).
if seq_chunks is not None:
    seq.append(''.join(seq_chunks))

fastaTable = pd.DataFrame({'ac': ac, 'info': info, 'seq': seq})

## select valid uniprotAC to make following selection

In [15]:
# validAC1: FASTA records whose accession was requested verbatim.
direct_hit = fastaTable['ac'].isin(acAll)
validAC1 = fastaTable[direct_hit]

# validAC2: remaining FASTA records whose accession equals the part before the
# first '-' of some requested accession (e.g. an isoform ID was requested and
# the record carries the un-suffixed accession).
acAll_series = pd.Series(list(acAll))
base_accessions = acAll_series.str.split('-', expand=True)[0]
validAC2 = fastaTable[~direct_hit]
validAC2 = validAC2[validAC2['ac'].isin(base_accessions)]

validAC = pd.concat([validAC1, validAC2])
for frame in (validAC1, validAC2, validAC):
    print(frame.shape)

(8916, 3)
(416, 3)
(9332, 3)


## make the 'affected protein AC' - 'uniprotAC' dict. Some have 'multiple key' -> 'single value' relationship eg: apac['P19838-1'] = 'P19838', apac['P19838'] = 'P19838'

In [16]:
# Map every requested accession to the accession under which its sequence is
# stored in validAC: itself when present, otherwise the part before the first
# '-' (isoform suffix removed). Accessions with neither form get no entry.

# Build the lookup set once; testing membership against the numpy array inside
# the loop rescanned the whole column for every accession.
valid_accessions = set(validAC['ac'])

apacKey = []
acValue = []
# The loop variable is deliberately not called `ac`, which is the accession
# list produced by the FASTA-parsing cell.
for requested_ac in acAll:
    base_ac = requested_ac.split('-')[0]
    if requested_ac in valid_accessions:
        apacKey.append(requested_ac)
        acValue.append(requested_ac)
    elif base_ac in valid_accessions:
        apacKey.append(requested_ac)
        acValue.append(base_ac)
apac2ac = dict(zip(apacKey, acValue))


### transform all isoform AC in table into real uniprotAC(canonical with no isoform '-'), eg: O43889-2 ->O43889, O43889-3 -> O43889-3

In [17]:
# Keep rows whose affected protein accession is a key of apac2ac.
df = df[df['Affected protein AC'].isin(apac2ac.keys())]

In [18]:
# Keep rows whose partners are all keys of apac2ac.
# The key set is built once instead of once per row, and `<=` (subset) replaces
# `<` (proper subset), which would reject a row whose partners are exactly the
# full key set.
known_accessions = set(apac2ac)
df = df[df['partners'].apply(lambda x: set(x) <= known_accessions)]

## make interaction with multi position mutations into one 

In [19]:
# Collapse all rows of one Feature AC into a single row: the three per-position
# columns become lists, every other column takes the value of the first row.
value_cols = ['Feature range(s)', 'Original sequence', 'Resulting sequence']
comCol = [col for col in df.columns if col not in value_cols]

# Select the columns with a list: indexing a groupby with a bare tuple of names
# is deprecated and raises in pandas >= 2.0.
df_1 = df.groupby('#Feature AC')[value_cols].agg(list)
df_2 = df[comCol].drop_duplicates('#Feature AC', keep='first')

# df_1 carries '#Feature AC' as its index level, df_2 as a column; merge
# accepts both under `on`. The index is then reset to 0..n-1.
df = pd.merge(df_1, df_2, on='#Feature AC').reset_index(drop=True)
df.shape

  # Remove the CWD from sys.path while we load stuff.


(44412, 17)

In [20]:
# Preview the first rows of the current table.
df.head()

Unnamed: 0,#Feature AC,Feature range(s),Original sequence,Resulting sequence,Feature short label,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,Affected protein organism,Interaction participants,PubMedID,Figure legend,Interaction AC,partners,n_partner
0,EBI-1002144,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,9606 - Homo sapiens,"uniprotkb:P55957(protein(MI:0326), 9606 - Homo...",16697956,1 B,EBI-1002129,"[P55957, P10415]",2
1,EBI-1002635,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,9606 - Homo sapiens,"uniprotkb:Q92843(protein(MI:0326), 9606 - Homo...",16697956,1 B,EBI-1002627,"[Q92843, P55957]",2
2,EBI-1003351,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,9606 - Homo sapiens,"uniprotkb:Q07817(protein(MI:0326), 9606 - Homo...",16697956,1 B,EBI-1003344,"[Q07817, P55957]",2
3,EBI-1003469,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,9606 - Homo sapiens,"uniprotkb:Q07820(protein(MI:0326), 9606 - Homo...",16697956,1 B,EBI-1003461,"[Q07820, P55957]",2
4,EBI-1003574,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,9606 - Homo sapiens,"uniprotkb:P55957(protein(MI:0326), 9606 - Homo...",16697956,1 B,EBI-1003566,"[P55957, Q16548]",2


## make mutprotein seq and participants seq

In [21]:
# Accession-indexed view of validAC for .loc lookups by accession.
# set_index returns a new frame, so validAC itself is left untouched.
validAC_index = validAC.set_index('ac')

In [22]:
# Resolve each affected protein to the accession its sequence is stored under,
# then fetch that sequence.
mutAC = [apac2ac[x] for x in df['Affected protein AC']]
mut0 = [validAC_index.loc[accession, 'seq'] for accession in mutAC]

In [23]:
# For every row, collect the sequence and accession of the interaction partner,
# i.e. the participant that is not the affected protein.
par = []
parAC = []

for row_idx in df.index:
    partner_ids = df.loc[row_idx, 'partners']
    if len(partner_ids) > 1:
        affected_id = df.loc[row_idx, 'Affected protein AC']
        affected_skipped = False
        for partner_id in partner_ids:
            # Skip only the first occurrence of the affected protein; a second
            # occurrence (protein interacting with itself) is kept as partner.
            if partner_id == affected_id and not affected_skipped:
                affected_skipped = True
                continue
            stored_ac = apac2ac[partner_id]
            par.append(validAC_index.loc[stored_ac, 'seq'])
            parAC.append(stored_ac)
    elif len(partner_ids) == 1:
        # Single participant: it is used as its own partner.
        stored_ac = apac2ac[partner_ids[0]]
        par.append(validAC_index.loc[stored_ac, 'seq'])
        parAC.append(stored_ac)

# The two numbers must agree for the lists to line up with df's rows.
print(len(par))
print(df.shape)

44412
(44412, 17)


In [24]:
# Attach the per-row lists as columns: accession and sequence of the affected
# protein (mutAC, mut0) and of its partner (parAC, par0).
# NOTE(review): parAC/par only line up with df when every row contributed
# exactly one partner -- verify len(par) == len(df) before running this cell.
df['mutAC'] = mutAC
df['mut0'] = mut0
df['parAC'] = parAC
df['par0'] = par

In [25]:
# Build the mutated sequence of every row by applying its substitutions to the
# wild-type sequence (mut0).
mut1 = []
for row_idx in df.index:
    sequence = df.loc[row_idx, 'mut0']
    feature_ranges = df.loc[row_idx, 'Feature range(s)']
    originals = df.loc[row_idx, 'Original sequence']
    replacements = df.loc[row_idx, 'Resulting sequence']
    for k, feature_range in enumerate(feature_ranges):
        bounds = feature_range.split('-')
        start = int(bounds[0])
        end = int(bounds[1])
        # Ranges are used as 1-based and inclusive: residues start..end are
        # the slice [start - 1:end].
        if sequence[start - 1:end] == originals[k]:
            sequence = sequence[:start - 1] + replacements[k] + sequence[end:]
        else:
            # The stored sequence does not match the annotated residues: report
            # the protein and leave this position unchanged.
            print(df.loc[row_idx, 'Affected protein AC'])
    # '.' characters are removed only after all substitutions of the row.
    mut1.append(sequence.replace('.', ''))
print(len(mut1))

P12104
P12104
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q6NSX1
Q6NSX1
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9HCJ0
Q9NYZ3
Q9HCJ0
Q9HCJ0
44412


In [26]:
# Attach the per-row mutated sequences (list mut1) as a column.
df['mut1'] = mut1

In [27]:
# Derive an integer class label from the 'Feature type' text. Rows matching
# none of the keywords keep the default label 2. Keywords are applied in this
# order, so a later match would overwrite an earlier one.
keyword_labels = (
    ('disrupting', 0),
    ('decreasing', 1),
    ('increasing', 3),
    ('causing', 4),
)
df['label'] = 2
for keyword, label_value in keyword_labels:
    df.loc[df['Feature type'].str.contains(keyword), 'label'] = label_value

In [28]:
# Build a per-mutation identifier from the stored accession and the feature
# short label, replacing the characters '[', ']' and ';'.
df['mutAC1'] = df['mutAC'] + '_' + df['Feature short label']

# Every pattern is meant literally. regex=False makes that explicit: the default
# of `regex` differs between pandas versions, and as a regular expression
# '_p.' would also consume any character following '_p'.
literal_replacements = (
    ('_p.', '_'),
    ('[', '-'),
    (']', '-'),
    (';', '_'),
)
for old, new in literal_replacements:
    df['mutAC1'] = df['mutAC1'].str.replace(old, new, regex=False)

In [29]:
# Number of rows per raw 'Feature type' value.
df['Feature type'].value_counts()

mutation with no effect(MI:2226)         13740
mutation disrupting strength(MI:1128)    11793
mutation disrupting(MI:0573)              5426
mutation decreasing(MI:0119)              5152
mutation decreasing strength(MI:1133)     3573
mutation causing(MI:2227)                 1445
mutation increasing(MI:0382)              1311
mutation increasing strength(MI:1132)     1032
mutation disrupting rate(MI:1129)          427
mutation decreasing rate(MI:1130)          362
mutation increasing rate(MI:1131)          151
Name: Feature type, dtype: int64

In [30]:
# Number of rows per integer class label.
df['label'].value_counts()

0    17646
2    13740
1     9087
3     2494
4     1445
Name: label, dtype: int64

In [31]:
# Preview the first rows of the current table.
df.head()

Unnamed: 0,#Feature AC,Feature range(s),Original sequence,Resulting sequence,Feature short label,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,...,Interaction AC,partners,n_partner,mutAC,mut0,parAC,par0,mut1,label,mutAC1
0,EBI-1002144,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,...,EBI-1002129,"[P55957, P10415]",2,P55957,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,P10415,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,0,P55957_P55957:p.-Leu90Ala_Asp95Ala-
1,EBI-1002635,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,...,EBI-1002627,"[Q92843, P55957]",2,P55957,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,Q92843,MATPASAPDTRALVADFVGYKLRQKGYVCGAGPGEGPAADPLHQAM...,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,0,P55957_P55957:p.-Leu90Ala_Asp95Ala-
2,EBI-1003351,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,...,EBI-1003344,"[Q07817, P55957]",2,P55957,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,Q07817,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,0,P55957_P55957:p.-Leu90Ala_Asp95Ala-
3,EBI-1003469,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,...,EBI-1003461,"[Q07820, P55957]",2,P55957,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,0,P55957_P55957:p.-Leu90Ala_Asp95Ala-
4,EBI-1003574,"[90-90, 95-95]","[L, D]","[A, A]",P55957:p.[Leu90Ala;Asp95Ala],mutation disrupting(MI:0573),,P55957,BID,,...,EBI-1003566,"[P55957, Q16548]",2,P55957,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,Q16548,MTDCEFGYIYRLAQDYLQCVLQIPQPGSGPSKTSRVLQNVAFSVQK...,MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPV...,0,P55957_P55957:p.-Leu90Ala_Asp95Ala-


## delete items with same participants (for unknown mutated or wildtype in the interaction)

In [32]:
# Keep only rows where the affected protein and its partner are stored under
# different accessions.
df = df[df['mutAC'] != df['parAC']]
print('after drop same participants items: {}'.format(df.shape))

after drop same participants items: (40871, 24)


## drop sequences containing non-standard amino acids

In [33]:
# Drop rows in which the wild-type, mutated or partner sequence contains any of
# the letters B, J, O, U, X or Z.
irregular_letters = 'B|J|O|U|X|Z'
for seq_col in ('mut0', 'mut1', 'par0'):
    df = df[~df[seq_col].str.contains(irregular_letters)]
print('after drop unregular aa: {}'.format(df.shape))

after drop unregular aa: (40830, 24)


## step for add PSSM(full size) into table

In [34]:
# def load_file(file_path):
#     try:
#         with open(file_path) as f_in:
#             lines = f_in.readlines()
#         return lines
#     except IOError as err:
#         print('Can not open file: ' + file_path)
#         return 'nan'

# def parse_pssm(filelines, winsize=51, pssm_root=None, mutated_pos=None, most1024=False):
# #     pssm_root = '/lustre/home/acct-bmelgn/bmelgn-2/QianWei/app/psipred_file/psipred/BLAST+/v20200727/pssm'
#     filelines = load_file(os.path.join(pssm_root, filelines + '.pssm'))
#     if filelines == 'nan':
#         return 'nan'
#     pssmvalue = np.array([])
#     for line in filelines:
#         if len(line.split()) == 44:
#             pssmvalue = np.r_[pssmvalue, np.array(line.split()[2:22]).astype(float)]
#     pssmvalue = np.reshape(pssmvalue, (-1, 20))
#     if pssmvalue.shape[0] < 1024:
#         pssmvalue = np.r_[pssmvalue, np.zeros([1024 - pssmvalue.shape[0], 20])]
#     if most1024:
#         if pssmvalue.shape[0] > 1024:
#             pssmvalue = pssmvalue[:1024, :]
#     if mutated_pos != None:
#         pssmvalue = np.r_[np.zeros([25, 20]), pssmvalue, np.zeros([25, 20])]
#         pssmvalue = pssmvalue[mutated_pos - 1: mutated_pos + 50, :]
    
#     return pssmvalue

## pssm_root holds the PSSM files produced by the PSI-BLAST batch jobs

In [35]:
# # please change it into your customized directory
# mut_pssm_root = '/lustre/home/acct-bmelgn/bmelgn-2/QianWei/app/psipred_file/psipred/BLAST+/v20200727/pssm'
# ori_pssm_root = '/lustre/home/acct-bmelgn/bmelgn-2/QianWei/app/psipred_file/psipred/BLAST+/v20200731/pssm'

In [36]:
# pssm_par0 = [parse_pssm(x, pssm_root=ori_pssm_root) for x in df['parAC']]
# # pssm 51 window only available to single mutation items
# df['pssm_par0'] = pssm_par0
# df = df[~(df['pssm_par0'] == 'nan')]
# print('after pssm_par0: {}'.format(df.shape))

## select only single-point items to support slide window strategy (no need for length strict of mutated protein)

In [37]:
# Work on a copy so df keeps the multi-point entries.
df_s = df.copy()
print('df_copy shape: {}'.format(df_s.shape))

# A row is multi-point when its list of feature ranges has more than one entry.
is_multi_point = df_s['Feature range(s)'].str.len() > 1
df_s = df_s[~is_multi_point]
print('after drop multi-point mutation items: {}'.format(df_s.shape))

df_copy shape: (40830, 24)
after drop multi-point mutation items: (37211, 24)


In [38]:
# Cut a 51-character window out of the wild-type and mutated sequences around
# the mutation position, padding both ends with '0'.
flank_51 = '0' * 51
mut0_win = flank_51 + df_s['mut0'] + flank_51
mut1_win = flank_51 + df_s['mut1'] + flank_51

# Start of the (single) feature range of every row.
positions_51 = [int(ranges[0].split('-')[0]) for ranges in df_s['Feature range(s)']]

# With a 51-character pad, a 1-based position pos sits at index pos + 50 of the
# padded string, so [pos + 25, pos + 76) is the 51-wide slice centred on it.
mut0_51 = [padded[pos + 25:pos + 76] for padded, pos in zip(mut0_win, positions_51)]
mut1_51 = [padded[pos + 25:pos + 76] for padded, pos in zip(mut1_win, positions_51)]

df_s['mut0_51'] = mut0_51
df_s['mut1_51'] = mut1_51

In [39]:
# Cut a 1025-character window out of the wild-type and mutated sequences around
# the mutation position, padding both ends with '0'.
flank_1025 = '0' * 1025
mut0_win = flank_1025 + df_s['mut0'] + flank_1025
mut1_win = flank_1025 + df_s['mut1'] + flank_1025

# Start of the (single) feature range of every row.
positions_1025 = [int(ranges[0].split('-')[0]) for ranges in df_s['Feature range(s)']]

# With a 1025-character pad, a 1-based position pos sits at index pos + 1024 of
# the padded string, so [pos + 512, pos + 1537) is the 1025-wide slice centred on it.
mut0_1025 = [padded[pos + 512:pos + 1537] for padded, pos in zip(mut0_win, positions_1025)]
mut1_1025 = [padded[pos + 512:pos + 1537] for padded, pos in zip(mut1_win, positions_1025)]

df_s['mut0_1025'] = mut0_1025
df_s['mut1_1025'] = mut1_1025

In [40]:
# pssm_win = [parse_pssm(df_s.loc[i, 'mutAC1'], pssm_root=mut_pssm_root, mutated_pos=int(df_s.loc[i, 'Feature range(s)'][0].split('-')[0])) for i in df_s.index]
# # pssm 51 window only available to single mutation items
# df_s['pssm_win_mut1'] = pssm_win
# df_s = df_s[~(df_s['pssm_win_mut1'] == 'nan')]
# print('after pssm_win_mut1: {}'.format(df_s.shape))

# pssm_win = [parse_pssm(df_s.loc[i, 'mutAC'], pssm_root=ori_pssm_root, mutated_pos=int(df_s.loc[i, 'Feature range(s)'][0].split('-')[0])) for i in df_s.index]
# # pssm 51 window only available to single mutation items
# df_s['pssm_win_mut0'] = pssm_win
# df_s = df_s[~(df_s['pssm_win_mut0'] == 'nan')]
# print('after pssm_win_mut0: {}'.format(df_s.shape))

In [41]:
# Drop duplicates and conflicts.
# A sample is identified by its sequence windows plus the partner sequence.
sample_key = ['mut0_51', 'par0', 'mut1_51', 'mut0_1025', 'mut1_1025']

# Same sample, same label: keep the first occurrence.
df_s = df_s.drop_duplicates(sample_key + ['label'], keep='first')
print('after drop duplicates: {}'.format(df_s.shape))

# Same sample still present more than once means the labels disagree: drop all of them.
df_s = df_s.drop_duplicates(sample_key, keep=False)
print('after drop conflicts: {}'.format(df_s.shape))

after drop duplicates: (25613, 28)
after drop conflicts: (23724, 28)


In [42]:
# Persist the final single-point table as a pandas pickle.
# NOTE(review): pickle files must only be loaded back from trusted sources.
df_s.to_pickle('data/processed_mutations.dataset')

In [43]:
# Preview the first rows of the saved table.
df_s.head()

Unnamed: 0,#Feature AC,Feature range(s),Original sequence,Resulting sequence,Feature short label,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,...,mut0,parAC,par0,mut1,label,mutAC1,mut0_51,mut1_51,mut0_1025,mut1_1025
5,EBI-10039489,[81-81],[V],[E],P28795:p.Val81Glu,mutation disrupting(MI:0573),,P28795,PEX3,,...,MAPNQRSRSLLQRHRGKVLISLTGIAALFTTGSVVVFFVKRWLYKQ...,Q03694,MVLSRGETKKNSVRLTAKQEKKPQSTFQTLKQSLKLSNNKKLKQDS...,MAPNQRSRSLLQRHRGKVLISLTGIAALFTTGSVVVFFVKRWLYKQ...,0,P28795_P28795:p.Val81Glu,IKEQIKRRFEQTQEDSLYTIYELLPVWRMVLNENDLNLDSIVTQLK...,IKEQIKRRFEQTQEDSLYTIYELLPEWRMVLNENDLNLDSIVTQLK...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...
6,EBI-10039495,[188-188],[N],[I],P28795:p.Asn188Ile,mutation decreasing(MI:0119),,P28795,PEX3,,...,MAPNQRSRSLLQRHRGKVLISLTGIAALFTTGSVVVFFVKRWLYKQ...,Q03694,MVLSRGETKKNSVRLTAKQEKKPQSTFQTLKQSLKLSNNKKLKQDS...,MAPNQRSRSLLQRHRGKVLISLTGIAALFTTGSVVVFFVKRWLYKQ...,1,P28795_P28795:p.Asn188Ile,NEYLDSAIKLTMQQENCNKLQNRFYNWVTSWWSDPEDKADDAMVMA...,NEYLDSAIKLTMQQENCNKLQNRFYIWVTSWWSDPEDKADDAMVMA...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...
10,EBI-10039873,[146-146],[K],[R],O95863:p.Lys146Arg,mutation decreasing strength(MI:1133),,O95863,SNAI1,,...,MPRSFLVRKPSDPNRKPNYSELQDSNPEFTFQQPYDQAHLLAAIPP...,Q09472,MAENVVEPGPPSAKRPKLSSPALSASASDGTDFGSLFDLEHDLPDE...,MPRSFLVRKPSDPNRKPNYSELQDSNPEFTFQQPYDQAHLLAAIPP...,1,O95863_O95863:p.Lys146Arg,LEAEAYAAFPGLGQVPKQLAQLSEAKDLQARKAFNCKYCNKEYLSL...,LEAEAYAAFPGLGQVPKQLAQLSEARDLQARKAFNCKYCNKEYLSL...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...
11,EBI-10039883,[187-187],[K],[R],O95863:p.Lys187Arg,mutation decreasing strength(MI:1133),,O95863,SNAI1,,...,MPRSFLVRKPSDPNRKPNYSELQDSNPEFTFQQPYDQAHLLAAIPP...,Q09472,MAENVVEPGPPSAKRPKLSSPALSASASDGTDFGSLFDLEHDLPDE...,MPRSFLVRKPSDPNRKPNYSELQDSNPEFTFQQPYDQAHLLAAIPP...,1,O95863_O95863:p.Lys187Arg,EYLSLGALKMHIRSHTLPCVCGTCGKAFSRPWLLQGHVRTHTGEKP...,EYLSLGALKMHIRSHTLPCVCGTCGRAFSRPWLLQGHVRTHTGEKP...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...
15,EBI-10041181,[368-368],[W],[A],Q9UKV5:p.Trp368Ala,mutation decreasing(MI:0119),,Q9UKV5,AMFR,,...,MPLLFLERFPWPSLRTYTGLSGLALLGTIISAYRALSQPEAGPGEP...,P60604,MAGTALKRLMAEYKQLTLNPPEGIVAGPMNEENFFEWEALIMGPED...,MPLLFLERFPWPSLRTYTGLSGLALLGTIISAYRALSQPEAGPGEP...,1,Q9UKV5_Q9UKV5:p.Trp368Ala,ICWDSMQAARKLPCGHLFHNSCLRSWLEQDTSCPTCRMSLNIADNN...,ICWDSMQAARKLPCGHLFHNSCLRSALEQDTSCPTCRMSLNIADNN...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...


In [44]:
import pandas as pd  # also imported at the top of the notebook

# Bin edges for the wild-type sequence length and the label of each bin
# (pd.cut treats the right edge of every bin as inclusive by default).
bins = [0, 200, 400, 600, 800, 1000, float('inf')]
length_labels = ['0-200', '200-400', '400-600', '600-800', '800-1000', '1000+']

# New column 'length': number of characters of each 'mut0' string.
df_s['length'] = df_s['mut0'].map(len)

# New column 'length_range': the bin each length falls into.
df_s['length_range'] = pd.cut(df_s['length'], bins=bins, labels=length_labels)

# Number of rows per bin, listed in bin order.
length_distribution = df_s['length_range'].value_counts().sort_index()

print(length_distribution)


0-200       2716
200-400     5484
400-600     5592
600-800     3548
800-1000    2017
1000+       4367
Name: length_range, dtype: int64


In [54]:
# Inspect the rows whose affected protein accession is P42166.
# NOTE(review): this cell's execution count (54) does not follow the previous
# one (44); re-run the notebook top to bottom before relying on its output.
df_s[df_s['Affected protein AC'] == 'P42166']

Unnamed: 0,#Feature AC,Feature range(s),Original sequence,Resulting sequence,Feature short label,Feature type,Feature annotation,Affected protein AC,Affected protein symbol,Affected protein full name,...,par0,mut1,label,mutAC1,mut0_51,mut1_51,mut0_1025,mut1_1025,length,length_range
118,EBI-10091531,[690-690],[R],[C],P42166:p.Arg690Cys,mutation decreasing(MI:0119),MI:0612 (comment): variant in CMD1T (Cardiomyo...,P42166,TMPO,,...,METPSQRRATRSGAQASSTPLSPTRITRLQEKEDLQELNDRLAVYI...,MPEFLEDPSVLTKDKLKSELVANNVTLPAGEQRKDVYVQLYLQHLT...,1,P42166_P42166:p.Arg690Cys,KNKLASTPFKGGTLFGGEVCKVIKKRGNKH0000000000000000...,KNKLASTPFKGGTLFGGEVCKVIKKCGNKH0000000000000000...,NDSDRYSDNEEGKKKEHKKVKSTRDIVPFSELGTTPSGGGFFQGIS...,NDSDRYSDNEEGKKKEHKKVKSTRDIVPFSELGTTPSGGGFFQGIS...,694,600-800
