### Input data

1. Download `Mutation_perturbation_model.csv` (training set) from this [repo](https://github.com/jishnu-lab/SWING/tree/main/Data/MutInt_Model). 
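2. Download all UniProt (Swiss-Prot) sequences and their identifiers from this [link](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz), then decompress the archive so that `uniprot_sprot.fasta` sits one directory above this notebook (it is read from `../uniprot_sprot.fasta`). 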
3. Download the experimentally validated oncoPPI data (`Table S3.xlsx`, validation set) from the Cheng et al. 2021 [repo](https://github.com/ChengF-Lab/oncoPPIs/blob/master/Table%20S3.xlsx). 

In [9]:
import pandas as pd
from Bio import SeqIO
import re

In [10]:
# Map each FASTA record to its sequence string, keyed by the second
# '|'-separated field of the record id (the UniProt accession in
# Swiss-Prot style ids such as "sp|<accession>|<entry name>").
parsed_records = SeqIO.to_dict(SeqIO.parse("../uniprot_sprot.fasta", "fasta"))
record_dict = {
    record_id.split('|')[1]: str(record.seq)
    for record_id, record in parsed_records.items()
}

In [11]:
# Training set: one row per (mutated protein, interactor, mutation) record.
training_csv = 'Mutation_perturbation_model.csv'
df = pd.read_csv(training_csv)

In [12]:
# Build (wild-type, mutant) sequence pairs for the training set.
#
# For every row of `df` the mutation string is parsed as
# <wild-type residue><1-based position><mutant residue> and applied to the
# sequence of the protein named in 'Mutation UPID'. The interactor sequence
# is carried through unchanged for both the wild-type and the mutant pair,
# and 'Y2H_score' is kept as the target.
#
# Rows are dropped (with a message) when:
#   * "Failed"   - the mutation cannot be parsed, its position lies outside
#                  the sequence, or the stated wild-type residue does not
#                  match the residue found at that position;
#   * "Skipping" - the substitution leaves the sequence unchanged.
mut_pattern = r'([A-Z])(\d+)([A-Z])'

wt_seq1s = []
wt_seq2s = []
mut_seq1s = []
mut_seq2s = []
targets = []

for i, row in df.iterrows():

    id1 = row['Mutation UPID']
    id2 = row['Interactor UPID']
    target = row['Y2H_score']

    mut = row['Mutation']

    seq1 = record_dict[id1]
    seq2 = record_dict[id2]

    # re.match returns None when the text does not start with a point
    # mutation; str() also keeps a missing (NaN) cell from raising TypeError.
    mut_match = re.match(mut_pattern, str(mut))
    if mut_match is None:
        print(f'Failed on index {i}')
        continue

    wt_res = mut_match.group(1)
    res_num = int(mut_match.group(2))
    mut_res = mut_match.group(3)
    pos = res_num - 1  # convert the 1-based residue number to a 0-based index

    # The bounds check must come first: position 0 would otherwise index
    # seq1[-1] (the last residue) and a position past the end would raise
    # IndexError.
    if not (0 <= pos < len(seq1)) or wt_res != seq1[pos]:
        print(f'Failed on index {i}')
        continue

    # The wild-type residue is already confirmed at `pos`, so the mutant
    # sequence equals the original exactly when both residues are the same.
    if mut_res == wt_res:
        print(f'Skipping on index {i}')
        continue

    mut_seq1 = seq1[:pos] + mut_res + seq1[pos + 1:]

    wt_seq1s.append(seq1)
    wt_seq2s.append(seq2)
    mut_seq1s.append(mut_seq1)
    mut_seq2s.append(seq2)
    targets.append(target)

Failed on index 34
Skipping on index 166
Skipping on index 391
Failed on index 582
Failed on index 1100
Skipping on index 1293
Failed on index 1409
Skipping on index 1670
Skipping on index 1703
Failed on index 1882
Skipping on index 1898
Skipping on index 2413
Skipping on index 2517
Skipping on index 2999
Skipping on index 3114
Skipping on index 3365


In [13]:
# Assemble the training table: one row per retained mutation, holding the
# wild-type pair, the mutant pair and the target value.
training_columns = {
    'seq1': wt_seq1s,
    'seq2': wt_seq2s,
    'seq1_mut': mut_seq1s,
    'seq2_mut': mut_seq2s,
    'target': targets,
}
mut_df = pd.DataFrame(training_columns)

In [14]:
# Persist the processed training pairs (the row index is written as well,
# which is the pandas default).
training_out_csv = 'processed_data_cs.csv'
mut_df.to_csv(training_out_csv)

In [16]:
# Validation set: the oncoPPI table downloaded as `Table S3.xlsx`.
onco_ppi_xlsx = 'Table S3.xlsx'
onco_ppi = pd.read_excel(onco_ppi_xlsx)

In [18]:
# Build (wild-type, mutant) sequence pairs for the oncoPPI table.
#
# 'UniProt_ID_a' is split on '-': the first part is used as the sequence
# identifier and the second part, when present, is parsed as a point mutation
# <wild-type residue><1-based position><mutant residue>. Rows without a
# second part carry no mutation and are dropped silently. 'Growth_score' is
# kept as the raw target.
#
# Rows are dropped (with a message) when:
#   * "Failed"   - the part after '-' is not a point mutation, its position
#                  lies outside the sequence, or the stated wild-type residue
#                  does not match the residue found at that position;
#   * "Skipping" - the substitution leaves the sequence unchanged.
mut_pattern = r'([A-Z])(\d+)([A-Z])'

wt_seq1s = []
wt_seq2s = []
mut_seq1s = []
mut_seq2s = []
id1s = []
id2s = []
targets = []

for i, row in onco_ppi.iterrows():
    full_id1 = row['UniProt_ID_a']
    id2 = row['UniProt_ID_b']
    target = row['Growth_score']

    full_id1_split = full_id1.split('-')

    id1 = full_id1_split[0]
    seq1 = record_dict[id1]
    seq2 = record_dict[id2]

    # No '-' suffix means there is no mutation to apply for this row.
    if len(full_id1_split) <= 1:
        continue

    mut = full_id1_split[1]

    # re.match returns None when the suffix is not a point mutation; report
    # the row as failed instead of raising AttributeError on `.group`.
    mut_match = re.match(mut_pattern, mut)
    if mut_match is None:
        print(f'Failed on index {i}')
        continue

    wt_res = mut_match.group(1)
    res_num = int(mut_match.group(2))
    mut_res = mut_match.group(3)
    pos = res_num - 1  # convert the 1-based residue number to a 0-based index

    # The bounds check must come first: position 0 would otherwise index
    # seq1[-1] (the last residue) and a position past the end would raise
    # IndexError.
    if not (0 <= pos < len(seq1)) or wt_res != seq1[pos]:
        print(f'Failed on index {i}')
        continue

    # The wild-type residue is already confirmed at `pos`, so the mutant
    # sequence equals the original exactly when both residues are the same.
    if mut_res == wt_res:
        print(f'Skipping on index {i}')
        continue

    mut_seq1 = seq1[:pos] + mut_res + seq1[pos + 1:]

    id1s.append(id1)
    id2s.append(id2)
    wt_seq1s.append(seq1)
    wt_seq2s.append(seq2)
    mut_seq1s.append(mut_seq1)
    mut_seq2s.append(seq2)
    targets.append(target)

In [19]:
# Assemble the oncoPPI table: identifiers of both partners, the wild-type
# pair, the mutant pair and the untouched growth score.
onco_ppi_columns = {
    'id1': id1s,
    'id2': id2s,
    'seq1': wt_seq1s,
    'seq2': wt_seq2s,
    'seq1_mut': mut_seq1s,
    'seq2_mut': mut_seq2s,
    'target_og': targets,
}
onco_ppi_df = pd.DataFrame(onco_ppi_columns)

In [20]:
def _growth_score_to_label(score):
    """Return 1 when the growth score is at least 3, otherwise 0."""
    if score >= 3:
        return 1
    return 0


# Derive the binary target from the raw growth score, element by element.
onco_ppi_df['target'] = onco_ppi_df['target_og'].apply(_growth_score_to_label)

In [21]:
# Persist the full processed oncoPPI table (the row index is written as well,
# which is the pandas default).
validation_out_csv = 'processed_data_val_cs.csv'
onco_ppi_df.to_csv(validation_out_csv)

In [22]:
# Keep only the rows whose two partner identifiers differ, i.e. drop pairs
# where 'id1' and 'id2' are the same.
partners_differ = onco_ppi_df['id1'] != onco_ppi_df['id2']
onco_ppi_df_un = onco_ppi_df.loc[partners_differ]

In [23]:
# Persist the filtered oncoPPI table (the row index is written as well,
# which is the pandas default).
test_out_csv = 'processed_data_test_cs.csv'
onco_ppi_df_un.to_csv(test_out_csv)