In [None]:
import json
import os

import numpy as np
import pandas as pd
import rouskinhf

## Make a blast database

In [None]:
def json_to_fasta(data, path, n_5prime=26, n_3prime=21):
    """Write every sequence of ``data`` to a FASTA file with both ends masked.

    The first ``n_5prime`` and the last ``n_3prime`` nucleotides of each
    sequence are replaced by ``N`` so that only the inner part of the
    sequence is written as real nucleotides.
    NOTE(review): presumably the masked ends are constant flanking regions
    shared by all sequences -- confirm.

    Args:
        data: mapping ``reference name -> attributes``; each attributes
            mapping must provide a ``'sequence'`` string.
        path: destination FASTA file (overwritten if it exists).
        n_5prime: number of leading nucleotides to mask (default 26).
        n_3prime: number of trailing nucleotides to mask (default 21).
    """
    with open(path, 'w') as f:
        for ref, attr in data.items():
            seq = attr['sequence']
            # Use an explicit end index instead of ``-n_3prime`` so that
            # ``n_3prime=0`` keeps the tail rather than producing an empty slice.
            core = seq[n_5prime:max(len(seq) - n_3prime, 0)]
            f.write('>' + ref + '\n')
            f.write('N' * n_5prime + core + 'N' * n_3prime + '\n')

# Load the ribo500 dataset; it is used below as a mapping
# reference name -> attributes (including the 'sequence' string).
data = rouskinhf.get_dataset('ribo500')

# open(..., 'w') does not create parent directories, so make sure the
# database folder exists before writing the FASTA file into it.
os.makedirs('db', exist_ok=True)
json_to_fasta(data, 'db/ribo500.fasta')

In [None]:
# Build a nucleotide (-dbtype nucl) BLAST database from the masked FASTA file;
# the database files are written with the prefix db/ribo500.
!makeblastdb -in db/ribo500.fasta -dbtype nucl -out db/ribo500

## Run BLAST on ribo500 vs itself

In [None]:
!blastn -db db/ribo500 -query db/ribo500.fasta -outfmt 6 -out blast_out/ribo500.csv

## Interpret the results
 
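- Goal: keep a minimal subset of the dataset such that every element of the dataset is either in the subset or has a BLAST match in the subset. The selection below is greedy (references with the most reads are visited first), so the subset is small but not guaranteed to be the smallest possible.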

In [None]:
import pandas as pd
from tqdm import tqdm

# Column names of the BLAST tabular output (-outfmt 6), in file order.
blast_columns = [
    'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
]

# The BLAST output is tab-separated and has no header row.
df = pd.read_csv('blast_out/ribo500.csv', sep='\t', header=None)
df.columns = blast_columns

# Discard hits of a sequence against itself.
is_self_hit = df['qseqid'].eq(df['sseqid'])
df = df.loc[~is_self_hit]

# Optional stricter filter, currently disabled:
# only keep matches with 80% identity on 80% of the minimal length
# df = df[(df['pident'] >= 80)&(df['length'] >= 112)]

df

In [None]:
# Collapse the hit table to one row per query: qseqid -> list of matched references.
df = df.groupby('qseqid')['sseqid'].apply(list).reset_index()

# Attach the read count of every query reference.
df_reads = (
    pd.read_csv('~/data/ribonanza/train_data.csv')[['sequence_id', 'reads']]
    .drop_duplicates()
)

# Left merge: a reference that is missing from train_data.csv keeps its list
# of matches (with NaN reads) instead of being dropped and later re-added as
# if it had no BLAST hit at all.
df = (
    pd.merge(df, df_reads, left_on='qseqid', right_on='sequence_id', how='left')
    # a sequence_id may appear several times in train_data.csv; keep its first row
    .drop_duplicates('qseqid')
    # highest read count first; rows without a read count are sorted last
    .sort_values('reads', ascending=False)
)

# References without any remaining BLAST hit are appended at the end with no
# match list (sseqid is NaN). Sorting makes the row order reproducible.
non_aligned = set(data.keys()) - set(df['qseqid'])
df = pd.concat([df, pd.DataFrame({'qseqid': sorted(non_aligned)})])
df

In [None]:
# Greedy selection: walk through the references in the order of df (highest
# read count first). A reference that has not been covered yet is kept, and
# all of its BLAST matches become covered. Every reference therefore ends up
# either kept or matched by a kept reference; the result is small but not
# guaranteed to be the smallest possible set.
keep = set()
seen = set()
# Iterate over the two needed columns directly instead of df.iterrows(),
# which builds a Series for every row.
for ref, matches in tqdm(zip(df['qseqid'], df['sseqid']), total=len(df)):
    if ref in seen:
        continue
    seen.add(ref)
    keep.add(ref)
    # References without BLAST hits carry NaN instead of a list of matches.
    if isinstance(matches, list):
        seen.update(matches)

print("Number of references to keep:", len(keep))
print("Number of references to remove:", len(df) - len(keep))

In [None]:
# Every selected reference must exist in the dataset (same failure mode as a
# plain data[ref] lookup, but reported before anything is built).
missing = keep - set(data.keys())
if missing:
    raise KeyError(f"selected references absent from the dataset: {sorted(missing)[:5]}")

# Build the output in dataset order rather than by iterating the `keep` set,
# whose iteration order can change between interpreter runs; the exported
# file is then reproducible.
data_out = {ref: attr for ref, attr in data.items() if ref in keep}

# Show the size only; displaying the whole dictionary floods the notebook.
len(data_out)

In [None]:
import rouskinhf

# Save the filtered dataset with rouskinhf's JSON writer.
# NOTE(review): presumably the parent folder 'data/' must already exist -- confirm.
rouskinhf.dump_json(data_out, 'data/ribo500-blast.json')

In [None]:
# Run rouskinhf's conversion on the JSON file written above.
# NOTE(review): presumably this produces the files expected by upload_dataset -- confirm.
rouskinhf.convert('json', 'data/ribo500-blast.json')

In [None]:
# Upload the filtered dataset.
# NOTE(review): exist_ok=True presumably allows updating an already existing
# remote dataset -- confirm.
rouskinhf.upload_dataset('data/ribo500-blast.json', exist_ok=True)