In [1]:
import sys
import os
import numpy as np 
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from rouskinhf import convert

In [2]:
# Import ct file: run the rouskinhf converter in 'ct' mode on the 'data/'
# folder. The returned object is wrapped in a DataFrame in the next cell.
# NOTE(review): `name`, `path_out` and `filter` are passed through unchanged;
# their exact semantics are defined by rouskinhf.convert -- confirm there.
data = convert('ct', 'data/', name='SARS2', path_out='data', filter=False)

Parsing ct files:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing ct files: 100%|██████████| 1/1 [00:00<00:00, 47.17it/s]

Drop 0 datapoints with None values (null sequence or reference)





In [3]:
# Extract the sequence and the base-pair list of the 'SARS' entry from the
# converted ct data (one row per reference after the transpose).
df = pd.DataFrame(data).T
sequence = df.loc['SARS', 'sequence']
# Later cells index this as a two-column array (paired_bases[:, 0] / [:, 1]).
paired_bases = np.array(df.loc['SARS', 'structure'])


In [4]:
# Load the DMS signal from the spreadsheet and normalise it to [0, 1].
UKN = -1000  # sentinel written wherever the spreadsheet has no value
data_dms = pd.read_excel('data/SARS_DMS.xlsx').fillna(UKN)

# Take an explicit, writable float copy of the column. Mutating the array
# returned by `.values` would silently rewrite `data_dms` as well, and with
# pandas Copy-on-Write that array is read-only, so the in-place
# normalisation below would raise.
full_dms = data_dms['Vero (filtered)'].to_numpy(dtype=float, copy=True)

# Positions that carry a real measurement (everything except the sentinel).
is_known = full_dms != UKN
known_dms = full_dms[is_known]

# Normalisation constant: median of the measured values lying above their
# own 95th percentile.
top_dms = known_dms[known_dms > np.percentile(known_dms, 95)]
max_dms = np.median(top_dms)

# Scale the measured values only, then clip at 1; sentinels stay at UKN.
full_dms[is_known] = known_dms / max_dms
full_dms[full_dms > 1] = 1

# Sanity check (displayed as the cell output): the nucleotide column of the
# spreadsheet must spell exactly the sequence read from the ct file.
''.join(data_dms['Nucleotide'].tolist()) == sequence

True

In [5]:
# Candidate cutting points: take every position that appears in a base pair,
# sort them, and look at each pair of consecutive paired positions.
all_paired_bases = np.sort(paired_bases, axis=None)
left_bases = all_paired_bases[:-1]
right_bases = all_paired_bases[1:]

# Number of positions strictly between two consecutive paired positions.
len_nonPaired_regions = right_bases - left_bases - 1
# Midpoint of each such gap, rounded to an integer index.
cut_points = np.round((left_bases + right_bases) / 2).astype(int)

# Spacing between the candidates whose gap is longer than 14 positions.
is_long_gap = len_nonPaired_regions > 14
distance_between_cut_points = np.diff(cut_points[is_long_gap])

In [6]:
# Report how many spacings were measured, then show their distribution.
n_regions = len(distance_between_cut_points)
print("Number of regions", n_regions)
px.histogram(distance_between_cut_points, log_y=False)

Number of regions 118


In [7]:
# Find unstructured regions
# Build a dot-bracket view of the structure: '(' at the first column of each
# pair, ')' at the second column, '.' everywhere else.
dot_chars = np.full(len(sequence), '.')
dot_chars[paired_bases[:, 0]] = '('
dot_chars[paired_bases[:, 1]] = ')'
dot = ''.join(dot_chars)

# Running count of open brackets after each position (+1 on '(', -1 on ')').
is_open = dot_chars == '('
is_close = dot_chars == ')'
bracket_depth = np.cumsum(is_open.astype(int) - is_close.astype(int))

# A closing bracket must never outnumber the opening ones seen so far.
assert (bracket_depth >= 0).all()

# Unstructured = unpaired position that is not enclosed by any open bracket.
unstructured_regions = (dot_chars == '.') & (bracket_depth == 0)


In [8]:
# Split the sequence into fragments at selected cut points.
sub_seqs, cut_idxs = [], []

cut_prev = -1  # index of the last accepted cut; -1 means "before the start"
for cut_point, gap_len in zip(cut_points, len_nonPaired_regions):
    # A cut is accepted only when it is more than 500 positions past the
    # previous cut, sits in a gap longer than 11 positions, and falls on a
    # position flagged as unstructured.
    far_enough = cut_point - cut_prev > 500
    wide_gap = gap_len > 11
    if not (far_enough and wide_gap and unstructured_regions[cut_point]):
        continue

    # The fragment runs from just after the previous cut up to and
    # including the cut point itself.
    sub_seqs.append(sequence[cut_prev + 1:cut_point + 1])
    cut_idxs.append(cut_point)
    cut_prev = cut_point

# Whatever remains after the last accepted cut forms the final fragment.
sub_seqs.append(sequence[cut_prev + 1:])
cut_idxs.append(len(sequence))

In [9]:
# The fragment lengths must add up to the full sequence length (the two
# printed numbers should match); then show the length distribution.
sub_seq_lengths = [len(s) for s in sub_seqs]
print(np.sum(sub_seq_lengths))
print(len(sequence))

hist_title = (
    f'Length of sub sequences ({len(sub_seqs)} sequences'
    f' | min length={min(sub_seq_lengths)}'
    f' | max length={max(sub_seq_lengths)})'
)
px.histogram(sub_seq_lengths, title=hist_title)

29882
29882


In [14]:
## Output json dataset
import json

# One entry per fragment, keyed 'SARS2_<i>', holding the fragment sequence
# and the matching slice of the normalised DMS signal (base pairs are not
# exported).
assert len(sub_seqs) == len(cut_idxs), "one cut index is expected per fragment"

data_struct = {}
start_idx = -1  # index of the previous cut; -1 means "before the start"
for i, (sub_seq, end_idx) in enumerate(zip(sub_seqs, cut_idxs)):
    # Same bounds as the sequence slicing: just after the previous cut up to
    # and including this cut (the last end_idx is len(sequence), which the
    # slice clamps to the end of the array).
    sub_dms = full_dms[start_idx + 1:end_idx + 1]
    assert len(sub_dms) == len(sub_seq), (
        f"fragment {i}: {len(sub_dms)} DMS values for {len(sub_seq)} nucleotides"
    )

    data_struct['SARS2_' + str(i)] = {'sequence': sub_seq, 'dms': sub_dms.tolist()}
    start_idx = end_idx

# Use a context manager so the file is flushed and closed deterministically.
with open('data/sars_dms_fragments.json', 'w') as json_file:
    json.dump(data_struct, json_file, indent=2)
    

In [13]:
# Preview the exported fragments as a table: one row per fragment key, one
# column per field stored for that fragment.
pd.DataFrame(data_struct).T

Unnamed: 0,sequence,dms
SARS2_0,AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGA...,"[0.0, -1000.0, -1000.0, 0.011850711812235473, ..."
SARS2_1,AAACUGGAACACUAAACAUAGCAGUGGUGUUACCCGUGAACUCAUG...,"[0.17606771835321278, 0.4972681800692573, 0.62..."
SARS2_2,AUUCAGAAGUAGGACCUGAGCAUAGUCUUGCCGAAUACCAUAAUGA...,"[0.533589842247018, -1000.0, -1000.0, 0.375836..."
SARS2_3,AUUAAGGAGAGUGUUCAGACAUUCUUUAAGCUUGUAAAUAAAUUUU...,"[0.44032320123124274, -1000.0, -1000.0, 0.4172..."
SARS2_4,UUACACCACUGGGCAUUGAUUUAGAUGAGUGGAGUAUGGCUACAUA...,"[-1000.0, -1000.0, 0.6525586764140053, 0.66040..."
SARS2_5,GAGGAGGUGUUGCAGGAGCCUUAAAUAAGGCUACUAACAAUGCCAU...,"[-1000.0, 0.36490958060792605, -1000.0, -1000...."
SARS2_6,UCUAUUAUCUCUAAUGAGAAGCAAGAAAUUCUUGGAACUGUUUCUU...,"[-1000.0, 0.2954982685648326, -1000.0, 0.36629..."
SARS2_7,AUAAAACCUCAUAAUUCACAUGAAGGUAAAACAUUUUAUGUUUUAC...,"[0.44155444401692956, -1000.0, 0.3141208156983..."
SARS2_8,GCUUCAAGAGAGCUUAAAGUUACAUUUUUCCCUGACUUAAAUGGUG...,"[-1000.0, 0.21608310888803384, -1000.0, -1000...."
SARS2_9,CAAAGUUGUUAGUACAACUACUAACAUAGUUACACGGUGUUUAAAC...,"[0.9325125048095421, 0.4848018468641785, 0.389..."
