In [41]:
# Source the .env file located in the parent directory through envbash.
# NOTE(review): presumably this has to run before `rouskinhf` is imported so
# that the settings it needs are already in the environment — confirm.
import envbash
envbash.load.load_envbash('../.env')
from rouskinhf import convert, upload_dataset, get_dataset, dump_json
import pandas as pd

## RouskinHF filtering

In [56]:
# Options shared by both conversion passes, and the intermediate file that
# links the first pass to the second.
_convert_options = dict(name='PDB', path_out='data', filter=True)
_deduplicated_json = 'data/bpSEQ_files_pdbee_filtered.json'

# Pass 1: parse the raw bpseq files.
data = convert('bpseq', 'data/bpSEQ_files_pdbee', **_convert_options)

# Keep one row per distinct sequence (the first occurrence wins), then write
# the survivors back out with the datapoints as the top-level JSON keys.
data_dict = pd.DataFrame(data).T.drop_duplicates(subset=['sequence'])
data_dict.T.to_json(_deduplicated_json, indent=2)

# Pass 2: run the conversion again, this time on the de-duplicated JSON.
data = convert('json', _deduplicated_json, **_convert_options)

Parsing bpseq files: 100%|██████████| 442/442 [00:00<00:00, 12933.73it/s]


Over a total of 442 datapoints, there are:
### OUTPUT
- 392 valid datapoints
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
- 36 duplicate sequences with different structure / dms / shape
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 50 duplicate sequences with the same structure / dms / shape


Parsing json file: 100%|██████████| 356/356 [00:00<00:00, 220981.53it/s]

Over a total of 356 datapoints, there are:
### OUTPUT
- 356 valid datapoints
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
- 0 duplicate sequences with different structure / dms / shape
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape





## Upload to HuggingFace

In [59]:
# Publish the converted PDB dataset file.
# NOTE(review): exist_ok=True presumably permits updating a dataset that is
# already published — confirm against rouskinhf.
_pdb_upload_options = {
    'datapath': 'data/PDB/data.json',
    'exist_ok': True,
    'commit_message': 'updated with new rouskinhf + Supermodels-data',
}
upload_dataset(**_pdb_upload_options)

## Filter PDB against the efold_train set

In [18]:
import pandas as pd

# Fetch both datasets first ...
pdb_data, efold_train = get_dataset('PDB'), get_dataset('efold_train')

# ... then turn each one into a frame with one row per top-level key.
df_pdb, df_efold = (
    pd.DataFrame.from_dict(dataset, orient='index')
    for dataset in (pdb_data, efold_train)
)


In [20]:
# Flag PDB rows whose sequence also occurs verbatim in efold_train, keep the
# remaining rows, and report how many were dropped.
_in_efold_train = df_pdb['sequence'].isin(df_efold['sequence'])
df_pdb_filtered = df_pdb[~_in_efold_train]

_n_total = len(df_pdb)
_n_removed = _n_total - len(df_pdb_filtered)
print("removed {}/{}".format(_n_removed, _n_total))



removed 285/356


In [24]:
def df_to_fastas(df, path_out):
    """Write every row of ``df`` to ``path_out`` as a FASTA record.

    Each record is a ``>`` header line carrying the row's index label,
    followed by one line holding the row's ``sequence`` value. The file is
    opened in write mode, so any existing content is replaced.

    Args:
        df: DataFrame whose rows provide a ``sequence`` entry.
        path_out: Destination file path.
    """
    with open(path_out, 'w') as fasta:
        fasta.writelines(
            '>{}\n{}\n'.format(label, record['sequence'])
            for label, record in df.iterrows()
        )

# Export the BLAST inputs: the pre-filtered PDB set, then the efold_train set.
for _frame, _fasta_path in (
    (df_pdb_filtered, 'data/PDB/data_filtered.fasta'),
    (df_efold, 'data/efold_train/data.fasta'),
):
    df_to_fastas(_frame, _fasta_path)

In [29]:
# Build a nucleotide BLAST database (-dbtype nucl) from the efold_train FASTA;
# the database files are written under the db/efold_train prefix.
!makeblastdb -in data/efold_train/data.fasta -dbtype nucl -out db/efold_train

82960.30s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Building a new DB, current time: 05/09/2024 14:14:30
New DB name:   /Users/yvesmartin/src/supermodels-data/PDB/db/efold_train
New DB title:  data/efold_train/data.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 339247 sequences in 3.17595 seconds.




In [31]:
# Query the pre-filtered PDB sequences against the efold_train database and
# write the hits to results.txt in tabular form (-outfmt 6) using 4 threads.
# No e-value or identity threshold flag is passed, so BLAST's defaults apply.
!blastn -query data/PDB/data_filtered.fasta -db db/efold_train -out results.txt -outfmt 6 -num_threads 4


83018.49s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [47]:
# Column names for the 12 tab-separated fields of results.txt, in file order.
_blast_columns = ['query', 'subject', 'identity', 'length', 'mismatches', 'gapopenings',
                  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

# Load the BLAST hits. An empty results file just means that no hit was
# reported, so treat it as an empty table instead of failing.
try:
    results = pd.read_csv('results.txt', sep='\t', header=None)
    results.columns = _blast_columns
except pd.errors.EmptyDataError:
    results = pd.DataFrame(columns=_blast_columns)

# Keep only the best (lowest e-value) hit for each query.
results = results.sort_values('evalue', ascending=True).drop_duplicates('query', keep='first')

# remove matches from pdb_filtered
# The FASTA headers written by df_to_fastas are the DataFrame index labels, so
# the 'query' column holds those labels rather than sequences. The rows must
# therefore be matched on the index; comparing against the 'sequence' column
# never matches and silently removes nothing.
_n_before_blast = len(df_pdb_filtered)
_has_blast_hit = df_pdb_filtered.index.astype(str).isin(results['query'].astype(str))
df_pdb_filtered = df_pdb_filtered[~_has_blast_hit]
print("removed {}/{}".format(_n_before_blast - len(df_pdb_filtered), _n_before_blast))

# push pdb_filtered to huggingface
dump_json(df_pdb_filtered.to_dict(orient='index'), 'data/PDB/pdb_filtered.json')
upload_dataset(
    datapath='data/PDB/pdb_filtered.json',
    exist_ok=True,
    commit_message='updated with new rouskinhf + Supermodels-data',
)

In [43]:
# Display the filtered PDB frame (index label, sequence, structure) for a
# visual check.
df_pdb_filtered

Unnamed: 0,sequence,structure
8S95-2D,CGCCCGGAUAGCUCAGUCGGUAGAGCAGCGGCUAAAACAGCUCUGG...,"[[1, 152], [2, 151], [3, 150], [4, 149], [5, 1..."
1OSW-2D,GGAGGCGCUACGGCGAGGCUCC,"[[0, 21], [1, 20], [2, 19], [3, 18], [5, 14], ..."
8TNS-2D,GUGUGUGUGUGUGUGUGUGUGUGU,[]
7PTK-2D,GGGAGAGUACUAUUCAGAUGCAGACCGCAAGUUCAGAGCGGUUUGC...,"[[0, 354], [1, 353], [2, 352], [3, 351], [4, 3..."
8CLR-2D,GGCAGCUUGCUGCC,"[[0, 13], [1, 12], [2, 11], [3, 10], [4, 9], [..."
...,...,...
6MXQ-2D,GGCAGUAUAGUCCGAACUGCAACUUCGGUUCACCUUCUCUCUAACUGCC,"[[0, 48], [1, 47], [2, 46], [3, 45], [4, 44], ..."
5V17-2D,GGAUCAACCCCAGGUGUGGCACACCAGUCAUACCUUGAUCC,"[[0, 40], [1, 39], [2, 38], [3, 37], [4, 36], ..."
5V16-2D,GGAUCAAUAGCAGGUGUGGCACACCAGUCAUACCUUGAUCC,"[[0, 40], [1, 39], [2, 38], [3, 37], [4, 36], ..."
2MNC-2D,GGGUUGACCGUUGAAUCUCACGG,"[[7, 22], [8, 21], [9, 20], [10, 19], [12, 18]]"
