In [2]:
import os
import re
import pandas as pd

In [3]:
# Directory that holds the raw sequence files read below and receives the generated CSVs.
# The location can be overridden with the FUSON_PLM_SEQUENCES_DIR environment variable so
# the notebook is not tied to a single machine; the original location stays the default.
path = os.environ.get("FUSON_PLM_SEQUENCES_DIR", "/home/a03-sgoel/FusOn-pLM/Sequences")

### Test Sequences (CAID-2 Disorder-NOX)

In [16]:
def parse_caid_txt(fast_file):
    '''
    Parse a FASTA-like text file whose records carry a label line.

    Each record is expected to look like:
    Line 1: ">" followed by the ID
    Line 2: Sequence (consecutive sequence lines are concatenated)
    Line 3: Label, a line made up only of the characters "0", "1" and "-"

    Blank lines are skipped. Only the first label line of a record is used, and
    anything before the first ">" header is ignored.

    Args:
        fast_file: Path of the text file to read.

    Returns:
        dict mapping each sequence string to a ``(label, seq_id)`` tuple.
        Records lacking a sequence or a label are left out. Because the dict is
        keyed by sequence, a later record with an identical sequence replaces
        the earlier one.
    '''

    seq_to_label = {}

    with open(fast_file, 'r') as file:
        label = None
        sequence = ""
        seq_id = None
        reading_sequence = False
        for line in file:
            line = line.strip()
            if not line:
                # all() over an empty string is True, so a blank line would be
                # mistaken for an (empty) label and end the record prematurely.
                continue
            if line.startswith(">"):
                # A new header closes the previous record, if it was complete.
                if label is not None and sequence:
                    seq_to_label[sequence] = (label, seq_id)
                seq_id = line[1:]  # Capture the ID without the '>'
                label = None
                sequence = ""
                reading_sequence = True
            elif reading_sequence:
                if all(c in "01-" for c in line):
                    # First label-looking line ends the sequence part of the record.
                    label = line
                    reading_sequence = False
                else:
                    sequence += line
        # The final record has no following header to flush it.
        if label is not None and sequence:
            seq_to_label[sequence] = (label, seq_id)

    return seq_to_label

In [17]:
# Parse the CAID-2 Disorder-NOX test file and report how many entries the
# sequence-keyed dictionary holds.
caid_path = f"{path}/CAID-2_Disorder_NOX_Testing_Sequences.fasta"
caid_dict = parse_caid_txt(caid_path)
n_caid_entries = len(caid_dict)
print(n_caid_entries)

210


In [18]:
# Re-key the parsed records by ID and turn every label string into a list of ints.
caid_seqs = {}
counter = 0

for sequence, (label_str, seq_id) in caid_dict.items():
    if len(sequence) != len(label_str):
        # A label string that does not line up with its sequence cannot serve as a
        # per-position annotation: report it and keep it out of the dataset, the same
        # way mismatched IDP-CRF records are excluded further down.
        print(sequence, label_str)
        counter += 1
        continue
    # NOTE(review): parse_caid_txt also accepts "-" inside label lines; int("-") would
    # raise here -- confirm such labels cannot occur in this file.
    caid_seqs[seq_id] = (sequence, list(map(int, label_str)))

print(f"Mismatched lengths/labels: {counter}")

Mismatched lengths/labels: 0


In [19]:
# One row per test record: its ID, its sequence, its list of integer labels,
# plus a constant 'Split' column marking the whole frame as test data.
caid_df = pd.DataFrame(
    {
        'ID': [seq_id for seq_id in caid_seqs],
        'Sequence': [entry[0] for entry in caid_seqs.values()],
        'Label': [entry[1] for entry in caid_seqs.values()],
        'Split': 'Test',
    }
)

caid_df

Unnamed: 0,ID,Sequence,Label,Split
0,DP02342,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Test
1,DP02348,MAPVSGSRSPDREASGSGGRRRSSSKSPKPSKSARSPRGRRSRSHS...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Test
2,DP02361,MSALRRSGYGPSDGPSYGRYYGPGGGDVPVHPPPPLYPLRPEPPQP...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test
3,DP02376,MDGPGASAVVVRVGIPDLQQTKCLRLDPTAPVWAAKQRVLCALNHS...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test
4,DP02411,MKTSPRRPLILKRRRLPLPVQNAPSETSEEEPKRSPAQQESNQAEA...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Test
...,...,...,...,...
205,DP03740,MITAADFYHVMTAMVPLYVAMILAYGSVKWWKIFTPDQCSGINRFV...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test
206,DP03741,MGAQVSSQKVGAHENSNRAYGGSTINYTTINYYRDSASNAASKQDF...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test
207,DP03742,MGARNSVLRGKKADELERIRLRPGGKKKYRLKHIVWAANKLDRFGL...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test
208,DP03743,MSKVIQKKNHWTSRVHECTVKRGPQGELGVTVLGGAEHGEFPYVGA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Test


In [20]:
# Sanity check: print every row whose label list differs in length from its sequence.
# No output means all rows are consistent.
for idx, seq, label in zip(caid_df.index, caid_df['Sequence'], caid_df['Label']):
    if len(seq) == len(label):
        continue
    print(idx, len(seq), len(label))

In [21]:
# Write the test set to CSV without the index column.
caid_df.to_csv(f"{path}/CAID_Testing_Dataset_july2024.csv", index=False)

### Training Sequences (flDPnn and IDP-CRF)

#### flDPnn Training Sequences

In [22]:
def parse_improper_fasta(file_path):
    '''
    Parse a loosely FASTA-formatted file in which each record has a ">" header,
    one or more sequence lines and one or more label lines.

    A line consisting solely of upper-case letters A-Z is treated as sequence;
    any other non-blank, non-header line is treated as label text. All sequence
    lines of a record are concatenated, and so are all of its label lines.
    Header contents are not kept. Blank lines are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        ``(sequences, labels)``: two parallel lists of strings. Records that
        lack either a sequence or any label text are left out.
    '''
    sequences = []
    labels = []

    current_sequence = ""
    current_labels = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line is neither sequence nor label. Without this guard it
                # would land in the label branch and make a label-less record look
                # complete (with an empty label string).
                continue
            if line.startswith('>'):
                # A new header closes the previous record, if it was complete.
                if current_sequence and current_labels:
                    sequences.append(current_sequence)
                    labels.append(''.join(current_labels))
                current_sequence = ""
                current_labels = []
            elif re.match('^[A-Z]+$', line):  # Sequence lines
                current_sequence += line
            else:  # Label lines
                current_labels.append(line)

    # Add the last sequence and labels
    if current_sequence and current_labels:
        sequences.append(current_sequence)
        labels.append(''.join(current_labels))

    return sequences, labels

In [23]:
# Read the flDPnn training file into parallel lists of sequences and raw label strings.
fidpnn_path = f"{path}/flDPnn_Training_Dataset.txt"
fidpnn_parsed = parse_improper_fasta(fidpnn_path)
fidpnn_seqs, fidpnn_labels = fidpnn_parsed

In [24]:
# For every record keep only the first len(sequence) label characters and
# convert them to ints; anything beyond the sequence length is discarded.
cleaned_fidpnn_labels = [
    [int(ch) for ch in fidpnn_labels[i][:len(seq)]]
    for i, seq in enumerate(fidpnn_seqs)
]

In [25]:
# Report how many label lists and sequences there are, then list every record
# whose cleaned label list does not match its sequence in length.
print(len(cleaned_fidpnn_labels), len(fidpnn_seqs), sep="\n")

counter = 0
for i, label in enumerate(cleaned_fidpnn_labels):
    if len(label) == len(fidpnn_seqs[i]):
        continue
    print(i, len(label), len(fidpnn_seqs[i]))
    counter += 1

print(f"Mismatched lengths/labels: {counter}")

445
445
Mismatched lengths/labels: 0


In [26]:
# Assemble the flDPnn training frame; every row is tagged with Split == "Train".
fidpnn_df = pd.DataFrame(
    {
        'Sequence': fidpnn_seqs,
        'Label': cleaned_fidpnn_labels,
        "Split": "Train",
    }
)
fidpnn_df

Unnamed: 0,Sequence,Label,Split
0,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
1,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
2,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
3,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
4,MLWQKSTAPEQAPAPPRPYQGVRVKEPVKELLRRKRGHTSVGAAGP...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
...,...,...,...
440,MLRVPEPRPGEAKAEGAAPPTPSKPLTSFLIQDILRDGAQRQGGRT...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
441,MDTKGILLVAVLTALLCLQSGDTLGASWHRPDKCCLGYQKRPLPQV...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
442,MPAENKKVRFENTTSDKGKIPSKVIKSYYGTMDIKKINEGLLDSKI...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
443,MLGIRSSVKTCFKPMSLTSKRLISQSLLASKSTYRTPNFDDVLKEN...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train


In [27]:
# Write the flDPnn training set to CSV without the index column.
fidpnn_df.to_csv(f"{path}/fIDPnn_Training_Dataset.csv", index=False)

#### IDP-CRF Training Sequences

In [28]:
# Read the IDP-CRF training file into parallel lists of sequences and raw label strings.
idp_crf_path = f"{path}/IDP-CRF_Training_Dataset.txt"
idp_crf_parsed = parse_improper_fasta(idp_crf_path)
idp_crf_seqs, idp_crf_labels = idp_crf_parsed

In [29]:
# Report the parsed counts, then keep only the records whose label string has
# exactly one character per sequence position; mismatches are printed and dropped.
print(len(idp_crf_seqs), len(idp_crf_labels), sep="\n")

cleaned_idp_seqs, cleaned_idp_labels = [], []

counter = 0
for i, raw_label in enumerate(idp_crf_labels):
    seq = idp_crf_seqs[i]
    if len(seq) != len(raw_label):
        print(i, len(seq), len(raw_label))
        counter += 1
        continue

    # Convert before appending so a bad label character leaves both lists untouched.
    label_ints = [int(ch) for ch in raw_label]
    cleaned_idp_labels.append(label_ints)
    cleaned_idp_seqs.append(seq)

print(f"Mismatched lengths/labels: {counter}")

5273
5273
4 238 321
Mismatched lengths/labels: 1


In [30]:
# Assemble the IDP-CRF training frame from the records that passed the length
# check; every row is tagged with Split == "Train".
idp_crf_df = pd.DataFrame(
    {
        'Sequence': cleaned_idp_seqs,
        'Label': cleaned_idp_labels,
        "Split": "Train",
    }
)
idp_crf_df

Unnamed: 0,Sequence,Label,Split
0,MSKTGKKIAIVTGAGSGVGRAVAVALAGAGYGVALAGRRLDALQET...,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
1,SESRRDGTPRVTRMQVIPVAGRDSMLLNLCGAHAPYFTRNLVILDD...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
2,MANSGEEKLKLYSYWRSSCAHRVRIALALKGLDYEYIPVNLLKGDQ...,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
3,MVAPAGEQGRSSTALSDNPFDAKAWRLVDGFDDLTDITYHRHVDDA...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
4,MPRVKLGTQGLEVSKLGFGCMGLSGDYNDALPEEQGIAVIKEAFNC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
...,...,...,...
5267,DEAQFKECYDTCHKECSDKGNGFTFCEMKCDTDCSVKDVKEKLENY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
5268,YLDSGLGAPVPYPDPLEPKREVCELNPNCDELADHIGFQEAYQRFYGPV,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
5269,GPLSCGRNGGVCIPIRCPVPMRQIGTCFGRPVKCCRSW,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",Train
5270,SGSDGGVCPKILKKCRRDSDCPGACICRGNGYCG,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train


In [31]:
# Sanity check: print every row whose label list differs in length from its sequence.
# No output means all rows are consistent.
for idx, seq, label in zip(idp_crf_df.index, idp_crf_df['Sequence'], idp_crf_df['Label']):
    if len(seq) == len(label):
        continue
    print(idx, len(seq), len(label))

In [32]:
# Write the IDP-CRF training set to CSV without the index column.
idp_crf_df.to_csv(f"{path}/IDP-CRF_Training_Dataset.csv", index=False)

#### Save all train dfs together, removing duplicates

In [33]:
# Stack both training sets, IDP-CRF rows first. Row indices are not renumbered here;
# that happens after de-duplication.
train_df = pd.concat(objs=[idp_crf_df, fidpnn_df], axis=0)
train_df

Unnamed: 0,Sequence,Label,Split
0,MSKTGKKIAIVTGAGSGVGRAVAVALAGAGYGVALAGRRLDALQET...,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
1,SESRRDGTPRVTRMQVIPVAGRDSMLLNLCGAHAPYFTRNLVILDD...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
2,MANSGEEKLKLYSYWRSSCAHRVRIALALKGLDYEYIPVNLLKGDQ...,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
3,MVAPAGEQGRSSTALSDNPFDAKAWRLVDGFDDLTDITYHRHVDDA...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
4,MPRVKLGTQGLEVSKLGFGCMGLSGDYNDALPEEQGIAVIKEAFNC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
...,...,...,...
440,MLRVPEPRPGEAKAEGAAPPTPSKPLTSFLIQDILRDGAQRQGGRT...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
441,MDTKGILLVAVLTALLCLQSGDTLGASWHRPDKCCLGYQKRPLPQV...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
442,MPAENKKVRFENTTSDKGKIPSKVIKSYYGTMDIKKINEGLLDSKI...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
443,MLGIRSSVKTCFKPMSLTSKRLISQSLLASKSTYRTPNFDDVLKEN...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train


In [34]:
# Drop rows that repeat an already-seen sequence (pandas' default keeps the first
# occurrence), then renumber the rows from 0.
train_df = (
    train_df
    .drop_duplicates(subset=['Sequence'])
    .reset_index(drop=True)
)
train_df

Unnamed: 0,Sequence,Label,Split
0,MSKTGKKIAIVTGAGSGVGRAVAVALAGAGYGVALAGRRLDALQET...,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
1,SESRRDGTPRVTRMQVIPVAGRDSMLLNLCGAHAPYFTRNLVILDD...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
2,MANSGEEKLKLYSYWRSSCAHRVRIALALKGLDYEYIPVNLLKGDQ...,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
3,MVAPAGEQGRSSTALSDNPFDAKAWRLVDGFDDLTDITYHRHVDDA...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
4,MPRVKLGTQGLEVSKLGFGCMGLSGDYNDALPEEQGIAVIKEAFNC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
...,...,...,...
5300,MDVFMKGLSMAKEGVVAAAEKTKQGVTEAAEKTKEGVLYVGSKTRE...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
5301,MAEEYKNTVPEQETPKVATEESSAPEIKERGMFDFLKKKEEVKPQE...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Train
5302,MATLLRSLALFKRNKDKPPITSGSGGAIRGIKHIIIVPIPGDSSIT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train
5303,MGDWSFLGEFLEEVHKHSTVIGKVWLTVLFIFRMLVLGTAAESSWG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Train


In [35]:
# Sanity check on the merged frame: print every row whose label list differs in
# length from its sequence. No output means all rows are consistent.
for idx, seq, label in zip(train_df.index, train_df['Sequence'], train_df['Label']):
    if len(seq) == len(label):
        continue
    print(idx, len(seq), len(label))

In [36]:
# Write the combined, de-duplicated training set to CSV without the index column.
train_df.to_csv(f"{path}/CAID_Training_Dataset_july2024.csv", index=False)