In [1]:
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import collections
import argparse
from Bio import SeqIO
from itertools import product

In [2]:
# Load the train and test datasets. The directory lives in a single constant so
# that pointing this cell at another location is a one-line change.
DATA_DIR = '/Users/eesoonhang/Desktop/capstone_data'

train_X = pd.read_csv(DATA_DIR + '/train_in.csv')
train_y = pd.read_csv(DATA_DIR + '/train_out.csv')
test_X = pd.read_csv(DATA_DIR + '/test_in.csv')
test_y = pd.read_csv(DATA_DIR + '/test_out.csv')

# Check the shape of all datasets: one "name = shape" line each, then a blank line.
print('train_X =', train_X.shape)
print('train_y =', train_y.shape)
print('test_X =', test_X.shape)
print('test_y =', test_y.shape)
print()

train_X = (304661, 1001)
train_y = (304661, 12)
test_X = (1200, 1001)
test_y = (1200, 12)



In [7]:
# Add up every cell of the first training row; with string cells this joins
# them into one long sequence that is easy to eyeball.
train_X.iloc[0].sum()

'CCCCATACCCCTAGTCATCCTCGGCAGGTCTCAGTCCCGGCTCCATCTGTGCCCTCGCCCCAGCCGCAGCTATGTTGCACACCGAGGGCCACGCTCTTCTTCGGGCGGTGGGTCAGGGTAAGCTACGCTTGGCCCGTTTGCTTCTGGAGGGAGGCGCCTACGTGAATGAGGGTGATGCGCAGGGGGAGACTGCGCTAATGGCAGCCTGTCGGGCCCGCTACGACGACCCCCAGAACAAGGCACGCATGGTACGCTACCTCCTGGAGCAAGGCGCGGACCCCAATATCGCAGACCGATTAGGGCGCACGGCGCTCATGCACGCTTGCGCCGGGGGTGGGGGCGCCGCGGTGGCCTCGCTGCTCCTTGCCCACGGCGCAGACCCCTCAGTCCGAGATCACGCGGGCGCCTCGGCTCTTGTCCACGCCCTGGACCGCGGGGACCGCGAGACCCTTGCCACACTGCTGGACGCCTGCAAGGCCAAGGGTACGGAGGTCATCATCATCACCACCGATACCTCGCCCTCAGGCACCAAGAAGACCCGGCAGTATCTCAATTCTCCACCATCCCCAGGGGTGGAGGACCCTGCTCCCGCCTCTCCTAGCCCGGGGTTCTGCACGTCGCCTTCGGAAATCCAACTGCAGACCGCTGGAGGAGGAGGGCGTGGGATGTTATCCCCTCGCGCCCAGGAAGAAGAGGAGAAGCGGGACGTATTTGAATTCCCTCTTCCTAAGCCCCCCGATGACCCATCCCCTTCCGAGCCGCTCCCCAAACCACCACGCCATCCCCCAAAACCACTCAAAAGGCTCAACTCCGAGCCCTGGGGCCTAGTGGCCCCTCCTCAACCAGTCCCACCCACTGAAGGGAGACCGGGGATCGAGCGCTTGACTGCCGAATTCAATGGCCTGACCCTGACCGGTCGACCCCGTCTTTCCCGACGTCACAGCACCGAAGGCCCTGAGGACCCGCCCCCATGGGCGGAGAAAGTGACTAGCGGGGGTC

# Perform feature extraction with NAC, DAC & TAC

# Note: The code below assumes the input data is always an RNA sequence written with the letters A, C, G and T

In [5]:
# define a function to read the RNA sequence based on specified chunk-length
def read_seq_by_chunksize(sample, window):
    """Return every full-length sliding window of ``sample``.

    A slice of length ``window`` is taken at each start index 0, 1, 2, ...
    (stride 1). Slices that come out shorter than ``window`` -- the ones that
    run past the end of ``sample`` -- are dropped.

    Parameters
    ----------
    sample : sequence (e.g. str)
        The sequence to cut into overlapping chunks.
    window : int
        Length of each chunk.

    Returns
    -------
    list
        The chunks, in order of their start position.
    """
    chunks = []
    for start in range(len(sample)):
        piece = sample[start:start + window]
        # keep only complete windows; the tail of the sequence yields short slices
        if len(piece) == window:
            chunks.append(piece)
    return chunks

def vectorized_inputData(sample, combinations):
    """Count how often each entry of ``combinations`` occurs in ``sample``.

    Parameters
    ----------
    sample : list of str, or str
        Usually the list of chunks produced by ``read_seq_by_chunksize``.
        Elements must be hashable. A plain ``str`` is also accepted and is
        counted with ``str.count`` (non-overlapping substring matches).
    combinations : iterable of str
        The words to look up.

    Returns
    -------
    list of int
        One count per entry of ``combinations``, in the same order; words
        that never occur get 0.
    """
    if isinstance(sample, str):
        # str.count matches substrings, which a per-element tally cannot
        # reproduce, so strings keep the direct per-word count.
        return [sample.count(c) for c in combinations]

    # Tally the sample once, then look each word up, instead of rescanning
    # the whole sample for every word.
    tally = collections.Counter(sample)
    return [tally[c] for c in combinations]

# extract the features
def extractFeature_via_nacSeq(k, inputData):
    """Count every length-``k`` nucleotide word in each row of ``inputData``.

    Each row is joined into one long string, cut into overlapping chunks of
    length ``k`` (stride 1) and the occurrences of every possible word over
    the alphabet C, G, A, T are counted. Chunks containing any other symbol
    match no word and therefore contribute to no count.

    Parameters
    ----------
    k : int
        Word length (1, 2 and 3 are the values used in this notebook).
    inputData : pandas.DataFrame
        One sequence per row. Assumes every cell holds a string, so that the
        row-wise sum concatenates them -- confirm for new data sources.

    Returns
    -------
    vectors : list of list of int
        One count vector per row, aligned with ``p_combinations``.
    p_combinations : list of str
        All 4**k words of length ``k`` in sorted order.
    """
    # Alphabet used to build the words; note it contains 'T', not 'U'.
    nucleotide = ['C', 'G', 'A', 'T']

    # Cartesian product of the alphabet with itself k times, as sorted strings.
    p_combinations = sorted(''.join(word) for word in product(nucleotide, repeat=k))

    # Join the cells of every row into one long string per row.
    sequences = inputData.sum(axis=1).tolist()

    # Chunk and count one sequence at a time so each intermediate chunk list
    # can be released before the next is built; only the small count vectors
    # are kept for all rows.
    vectors = [
        vectorized_inputData(read_seq_by_chunksize(sequence, k), p_combinations)
        for sequence in sequences
    ]

    return vectors, p_combinations

In [4]:
# Build the count vectors for word lengths 1 (NAC), 2 (DAC) and 3 (TAC)
# from the training sequences.
vectors_NAC, combinations_NAC = extractFeature_via_nacSeq(k=1, inputData=train_X)
vectors_DAC, combinations_DAC = extractFeature_via_nacSeq(k=2, inputData=train_X)
vectors_TAC, combinations_TAC = extractFeature_via_nacSeq(k=3, inputData=train_X)

In [5]:
# Wrap each list of count vectors in a DataFrame whose columns are the word labels.
df_nac = pd.DataFrame(data=vectors_NAC, columns=combinations_NAC)
df_dac = pd.DataFrame(data=vectors_DAC, columns=combinations_DAC)
df_tac = pd.DataFrame(data=vectors_TAC, columns=combinations_TAC)

In [6]:
# Show the first five rows of the single-nucleotide counts.
df_nac.head(n=5)

Unnamed: 0,A,C,G,T
0,187,363,289,162
1,257,220,225,299
2,271,239,214,277
3,301,193,231,276
4,324,135,210,332


In [7]:
# Show the first five rows of the di-nucleotide counts.
df_dac.head(n=5)

Unnamed: 0,AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT
0,36,60,59,32,67,144,80,72,63,92,101,33,21,66,49,25
1,73,49,64,70,71,62,12,75,56,53,65,51,57,56,84,102
2,83,56,75,57,92,53,5,89,53,60,53,48,43,69,81,83
3,116,47,82,56,72,48,10,62,54,49,60,68,59,49,79,89
4,123,36,71,94,47,29,6,53,68,29,52,60,85,41,81,125


In [8]:
# Show the first five rows of the tri-nucleotide counts.
df_tac.head(n=5)

Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,7,6,15,8,4,30,18,8,14,13,...,13,12,11,16,17,5,2,11,9,3
1,35,13,5,20,17,16,3,13,15,16,...,3,22,25,15,32,12,24,18,18,42
2,35,16,16,16,26,12,1,17,15,17,...,1,28,24,27,14,16,16,23,17,26
3,43,20,29,24,21,11,0,14,21,14,...,4,19,22,15,20,22,22,15,16,36
4,51,11,27,34,13,9,1,13,25,5,...,2,17,20,16,23,21,28,19,25,53


In [9]:
# Place the NAC, DAC and TAC count tables side by side in a single DataFrame.
df = pd.concat(objs=[df_nac, df_dac, df_tac], axis='columns')

In [11]:
# Write the combined training features to a CSV file, leaving out the row index.
df.to_csv(path_or_buf='/Users/eesoonhang/Desktop/capstone_data/train_in_encodedToNacSeq.csv', index=False)

# Extract features from the test_X dataset

In [6]:
# Build the count vectors for word lengths 1, 2 and 3 from the test sequences.
# NOTE: this rebinds the same variable names that held the training vectors.
vectors_NAC, combinations_NAC = extractFeature_via_nacSeq(k=1, inputData=test_X)
vectors_DAC, combinations_DAC = extractFeature_via_nacSeq(k=2, inputData=test_X)
vectors_TAC, combinations_TAC = extractFeature_via_nacSeq(k=3, inputData=test_X)

In [7]:
# Wrap each list of test-set count vectors in a DataFrame labelled by word.
df_nac_test = pd.DataFrame(data=vectors_NAC, columns=combinations_NAC)
df_dac_test = pd.DataFrame(data=vectors_DAC, columns=combinations_DAC)
df_tac_test = pd.DataFrame(data=vectors_TAC, columns=combinations_TAC)

# Join the three tables column-wise into one feature table.
df_test = pd.concat(objs=[df_nac_test, df_dac_test, df_tac_test], axis='columns')

# Write the combined test features to a CSV file, leaving out the row index.
df_test.to_csv(path_or_buf='/Users/eesoonhang/Desktop/capstone_data/test_in_encodedToNacSeq.csv', index=False)