In [2]:
import sys
import pandas as pd
sys.path.append(r'..\src')
import numpy as np
from GMfold import gmfold, gm_dot_bracket, gm_s_matrix
from tqdm import tqdm
import time

# Load and process data

In [3]:
# load sequences
df = pd.read_csv(r'..\data\published_clean_files\merged_clean_data.csv')
print(len(df))
# Filter the DataFrame to keep only sequences with lengths between the 5th and 95th percentiles
percentile_95 = np.percentile(df['Sequence'].apply(len), 95) 
percentile_5 = np.percentile(df['Sequence'].apply(len), 5)
df = df[df['Sequence'].apply(len).between(np.floor(percentile_5)+1, np.ceil(percentile_95))]
df = df.reset_index(drop=True)
print('Amount of unique sequences in the dataset:', len(df))
lengths = df['Sequence'].apply(len)
max_length = lengths.max()
print('max length:', lengths.max())
print('min length:', lengths.min())

4933
Amount of unique sequences in the dataset: 4450
max length: 83
min length: 33


In [8]:
# For each of the sequences in the dataset compute the secondary structure, faces and associated energies and structural matrix
d_y= { 'Energy': [], 'time': [], 'Sequence': [] , 'Count': [], 'File':[], 'd_b': [], 'faces': [], 'energy_faces': [], 's_matrix':[]}
for count, seq in tqdm(enumerate(df['Sequence'])):
        start_time = time.time()
        structs = gmfold(seq, l_fix =5)
        d_y['time'].append(time.time()-start_time)
        sss = 0
        for s in structs:     
            sss =s.e
        d_b = gm_dot_bracket( seq, structs)
        d_y['faces'].append([s.desc for s in structs])
        d_y['energy_faces'].append([s.e for s in structs])
        d_y['Energy'].append(sss)
        d_y['Sequence'].append(seq)
        d_y['Count'].append(df['Count'][count])
        d_y['File'].append(df['File'][count])
        d_y['d_b'].append(d_b)
        d_y['s_matrix'].append(gm_s_matrix(seq, structs, max_length).tolist())
df2 = pd.DataFrame(d_y)
df2.to_csv(r'.\test_fold_published.csv', index=False)
df2.head()


4450it [02:09, 34.46it/s]


Unnamed: 0,Energy,time,Sequence,Count,File,d_b,faces,energy_faces,s_matrix
0,1.5,0.020875,ACGACGGGGCACATTGTGCTGTTCATCTGTTCCGCAGGAGAGTCGT,77352,N48 after 13th,(((((...((((...))))............((...))...))))),"[STACK:AC/TG, STACK:CG/GC, STACK:GA/CT, STACK:...","[-1.5, -2.2, -1.3, -1.5, 5.3, -2.2, -1.5, -1.5...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,4.6,0.012468,ACGACCACCTAGGTATTCATGACCGTCTAGAGCTTTCATTGGTCGT,67049,N58 after 16th,(((((((...........((((.............))))))))))),"[STACK:AC/TG, STACK:CG/GC, STACK:GA/CT, STACK:...","[-1.5, -2.2, -1.3, -1.5, -1.8, -1.5, 4.9, -0.9...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,1.5,0.015914,ACGACGGGGCACATTGTGCTATTCAGTTGTTCCGCAGGAGAGTCGT,48708,N58 after 12th,(((((...((((...))))............((...))...))))),"[STACK:AC/TG, STACK:CG/GC, STACK:GA/CT, STACK:...","[-1.5, -2.2, -1.3, -1.5, 5.3, -2.2, -1.5, -1.5...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,1.5,0.016003,ACGACGGGGCACATTGTGCTGTCCATCTGTTCCGCAGGAGAGTCGT,28432,N58 after 16th,(((((...((((...))))............((...))...))))),"[STACK:AC/TG, STACK:CG/GC, STACK:GA/CT, STACK:...","[-1.5, -2.2, -1.3, -1.5, 5.3, -2.2, -1.5, -1.5...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,2.4,0.014,ACGACGGGGCACATTGTGCTGTTCACCTGTTCCGCAGGAGAGTCGT,12126,N48 after 9th,(((((...((((...))))...((.((((.....)))).))))))),"[STACK:AC/TG, STACK:CG/GC, STACK:GA/CT, STACK:...","[-1.5, -2.2, -1.3, -1.5, 4.1, -2.2, -1.5, -1.5...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [4]:
#check that the sequences, associated secondary structures and per face energies in the generated file are the same as those used for the experiments
# in the file fold_published.csv. This check is check is done because order of the sequences within the files may be different.
df1 = pd.read_csv(r'../data/fold_published.csv')
df2 = pd.read_csv(r'.\test_fold_published.csv')
error = []
error1 =[]
error2 = []
for i in range(len(df1)-1):
    if (df1['d_b'][i] != df2['d_b'].loc[df2['Sequence']== df1['Sequence'][i]]).values[0]:
        error.append(i)
    if (df1['faces'][i] != df2['faces'].loc[df2['Sequence']== df1['Sequence'][i]]).values[0]:
        error1.append(i)
    if (df1['energy_faces'][i] != df2['energy_faces'].loc[df2['Sequence']== df1['Sequence'][i]]).values[0]:
        error2.append(i)
print(error, error1,  error2 )

[] [] []
