# Set up the training data for the model

In [1]:
import sys

# Add the parent directory to the module search path -- presumably so the
# project-local `adna` package imported by a later cell resolves when the
# notebook is run from its own folder (TODO confirm the working directory).
sys.path.append('..')

In [2]:
import sqlite3
import gzip
from pathlib import Path

import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

from adna.pylib import consts, utils

In [12]:
# All raw inputs share the <DATA_DIR>/raw root.
_RAW_ROOT = consts.DATA_DIR / 'raw'

# Gzipped FASTA files whose record IDs are treated as the positive set.
GOOD_DIR = _RAW_ROOT / 'mostly_on_target'
# Gzipped FASTA files holding every read to be labelled.
RAW_DIR = _RAW_ROOT / 'raw_data'
# Reference mitogenome directory (not read by the cells visible here).
MT_DIR = _RAW_ROOT / 'reference_mitogenome'

# SQLite database file that receives the `seqs` table.
SQL = consts.DATA_DIR / 'UF46992.sqlite'

## Get the positives

In [4]:
# Collect the title line of every FASTA record found under GOOD_DIR.
# SimpleFastaParser yields (title, sequence) tuples; only the title is kept.
GOOD = set()
for fasta_path in GOOD_DIR.glob('*.gz'):
    with utils.open_file(fasta_path) as handle:
        GOOD.update(title for title, _ in SimpleFastaParser(handle))

len(GOOD)

2553721

## Read raw data

In [5]:
# Build one record per raw read. A read is flagged `label` when its
# space-normalised ID is in GOOD, and `rev` when that ID with the
# '_(reversed)' suffix is in GOOD.
SEQS = []
for fasta_path in RAW_DIR.glob('*.gz'):
    with utils.open_file(fasta_path) as handle:
        for title, bases in SimpleFastaParser(handle):
            seq_id = title.replace(' ', '_')
            SEQS.append({
                'id': seq_id,
                'seq': bases,
                'label': int(seq_id in GOOD),
                'rev': int(seq_id + '_(reversed)' in GOOD),
            })

len(SEQS)

10996536

In [6]:
# Sanity check: count the reads matched directly (`label`) and via the
# '_(reversed)' suffix (`rev`); compare the total against len(GOOD).
labels, revs = (
    sum(entry[flag] for entry in SEQS)
    for flag in ('label', 'rev')
)
labels + revs

2553720

## Write data to database

In [10]:
# One row per read, plus an empty `split` column for later cells or tools
# to fill in (nothing visible here assigns it).
df = pd.DataFrame(SEQS).assign(split='')
df.head()

Unnamed: 0,id,seq,label,rev,split
0,A00916:157:HLNFGDSX2:2:1101:8377:1000_1:N:0:CG...,GGGTGCACTAATAACTAGCTCAGTGTGTCTACGCCAAATTGACCTA...,1,0,
1,A00916:157:HLNFGDSX2:2:1101:12825:1000_1:N:0:C...,GCATTTCATCAAACTGCGACAAAATCCCATTCCACCCCTACTTCTC...,1,0,
2,A00916:157:HLNFGDSX2:2:1101:13675:1000_1:N:0:C...,TTTTTTGGCCTTCAAGGATGAATTAATGATACGGTTTCGGGTGTAA...,0,0,
3,A00916:157:HLNFGDSX2:2:1101:18539:1000_1:N:0:C...,CTATTCTTCTACCTACGCCTGGCGTACTGCTCCACTATCACACTTT...,0,0,
4,A00916:157:HLNFGDSX2:2:1101:20943:1000_1:N:0:C...,TTTACTGCCTATTTTATCAATTGTCACGAAACAACGTTCCACTTAA...,0,0,


In [11]:
# Write the labelled reads to the `seqs` table, replacing any existing table.
#
# NOTE: using a sqlite3 connection as a context manager only commits (or rolls
# back) the transaction -- it does NOT close the connection. Close it
# explicitly so the database file handle is released even if the write fails.
cxn = sqlite3.connect(SQL)
try:
    with cxn:  # commit on success, roll back on error
        df.to_sql('seqs', cxn, if_exists='replace', index=False)
finally:
    cxn.close()