In [1]:
import os

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

# When the kernel's working directory ends with 'notebook', move one level up
# before continuing. The data paths built later in this notebook are joined
# onto os.getcwd(); presumably the `rna_learn` import below relies on this
# too — confirm.
launched_from_notebook_dir = os.getcwd().endswith('notebook')
if launched_from_notebook_dir:
    os.chdir('..')

from rna_learn.load import load_dataset
from rna_learn.transform import (
    sequence_embedding, 
    normalize, denormalize,
    make_dataset_balanced,
    one_hot_encode_classes,
    split_train_test_set,
)

In [2]:
# Global seaborn styling applied to every figure drawn after this cell.
sns.set(
    palette='colorblind',
    font_scale=1.3,
)

In [3]:
# Input CSV, resolved against the current working directory.
input_path = os.path.join(
    os.getcwd(),
    'data/ncbi/dataset.csv',
)

# Symbols handed to `load_dataset` and `sequence_embedding` further down.
alphabet = [
    'A',
    'T',
    'G',
    'C',
]

# Values of the `temperature_range` column used for balancing and for
# one-hot encoding the targets.
classes = [
    'psychrophilic',
    'mesophilic',
    'thermophilic',
]

In [4]:
# Read the dataset CSV through the project loader.
# NOTE(review): `alphabet` is presumably used to validate or filter the
# sequences — confirm against `rna_learn.load.load_dataset`.
dataset_df = load_dataset(
    input_path,
    alphabet,
)

In [5]:
# Preview the first five rows to check the loaded columns.
dataset_df.head(n=5)

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,strand,temperature,temperature_range,sequence,gc_content
0,Acetobacter aceti,NZ_CP014692.1,rnpB,888876,889266,390,-,26.0,mesophilic,CCAGACGGTCGGGCGATCGCTGTTGCCTTTCAGGTGATGGAGGAAA...,0.638462
1,Acetobacter aceti,NZ_CP014692.1,ssrA,1260173,1260502,329,+,26.0,mesophilic,GACCTTGCGGAAGGTGATGCATACCCCTATCTTCATGAGTGCAGGA...,0.580547
2,Acetobacter aceti,NZ_CP014692.1,ffs,2223427,2223522,95,-,26.0,mesophilic,AGAGGCCTGTGATGGACGGGCGCCTTGCCAACCCGGTCAGATCCGG...,0.610526
3,Acetobacter aceti,NZ_CP014692.1,rrf,2846207,2846323,116,-,26.0,mesophilic,CCTGGTGGCTATGGCGGGGAGAGATCCACCCGATCCCATCCCGAAC...,0.62931
4,Acetobacter cerevisiae,NZ_LHZA01000109.1,efp,9288,9855,567,+,26.0,mesophilic,ATGAAACAGCAGGCGAACCTGATCCGGGCCGGACAGGTCATCGAGC...,0.569665


In [6]:
# Balance the dataset over the `temperature_range` categories in `classes`.
# NOTE(review): `y_str` is presumably the per-row class label of the balanced
# frame — confirm against `make_dataset_balanced`.
# NOTE(review): no random seed is set anywhere in this notebook; if balancing
# or splitting is stochastic, the rows selected here change on every run.
y_str, balanced_dataset_df = make_dataset_balanced(
    dataset_df,
    classes=classes,
    cat_name='temperature_range',
)

# Targets: one-hot encode the labels using `classes`.
y = one_hot_encode_classes(y_str, classes)

# Inputs: embed the sequence strings of the balanced rows.
sequences = balanced_dataset_df['sequence'].values
x = sequence_embedding(sequences, alphabet)

# Split into train and test parts (test_ratio=0.2) and also keep the row
# indices of each part.
(
    x_train, y_train,
    x_test, y_test,
    train_idx, test_idx,
) = split_train_test_set(x, y, return_indices=True, test_ratio=0.2)

In [7]:
# Destination CSV paths for the train rows, the test rows and the indices of
# unused rows, each resolved against the current working directory.
output_train_path, output_test_path, unused_indices_path = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        'data/dataset_train.csv',
        'data/dataset_test.csv',
        'data/dataset_unused_indices.csv',
    )
)

In [8]:
# Disabled export: would write the train and test rows of the balanced
# dataset to `output_train_path` / `output_test_path`.
# NOTE(review): presumably kept commented out so that re-running the notebook
# does not overwrite previously saved splits — confirm before re-enabling.
# balanced_dataset_df.iloc[train_idx].reset_index(drop=True).to_csv(output_train_path, index=False)
# balanced_dataset_df.iloc[test_idx].reset_index(drop=True).to_csv(output_test_path, index=False)

In [9]:
# Every index that ended up in either the train or the test part.
idx_set = set(train_idx) | set(test_idx)

# Row numbers in [0, len(dataset_df)) that are in neither part, as a column
# vector of shape (n_unused, 1). `np.setdiff1d` returns the sorted unique
# difference, matching the ascending order of the previous Python loop, and
# keeps an integer dtype even when no row is unused.
# NOTE(review): `train_idx` / `test_idx` come from splitting `x`, which was
# built from `balanced_dataset_df`, while the candidates here are row numbers
# of `dataset_df`. Confirm that both refer to the same row numbering;
# otherwise map the split indices back to `dataset_df` rows first.
unused_indices = np.setdiff1d(
    np.arange(len(dataset_df)),
    np.fromiter(idx_set, dtype=np.intp),
)[:, np.newaxis]

In [10]:
# Sanity check: (number of unused rows, 1).
np.shape(unused_indices)

(40791, 1)

In [11]:
# Disabled export: would write `unused_indices` as a single-column CSV
# (column name 'index') to `unused_indices_path`.
# NOTE(review): presumably kept commented out to avoid overwriting an existing
# file on re-run — confirm before re-enabling.
# pd.DataFrame(unused_indices, columns=['index']).to_csv(unused_indices_path, index=False)

In [13]:
# Length of every sequence in the full (unbalanced) dataset.
l = list(map(len, dataset_df['sequence'].values))

# Summary statistics of the sequence lengths, one per line.
for label, stat in (
    ('min', np.min),
    ('max', np.max),
    ('mean', np.mean),
    ('std', np.std),
):
    print(label, stat(l))

min 41
max 4413
mean 936.2871911628299
std 707.6570982159337
