In [11]:
import numpy as np
import pandas as pd
import os, sys

In [12]:
# Identifier used to name the processed output folder and CSV file.
dataset_name = "dna_splice_junction"
# File name of the raw input inside the input directory.
inp_fname = "splice.data"

In [13]:
# Directory holding the raw input file (relative to the working directory).
input_dir = "./data"
# Per-dataset folder that receives the processed output.
output_dir = f"./../../processed/{dataset_name}/"
# Full path of the processed CSV: <output_dir>/<dataset_name>.csv
outp_fname = os.path.join(output_dir, dataset_name + ".csv")

# Read Data

In [14]:
# Column layout: two leading fields followed by one column per position,
# labelled -30..-1 and +1..+30 (there is no position 0). The "+d" format
# spec emits an explicit sign for both negative and positive offsets.
cols = ["Class", "Instance"]
cols.extend(f"Position_{offset:+d}" for offset in range(-30, 31) if offset != 0)

print(cols)

['Class', 'Instance', 'Position_-30', 'Position_-29', 'Position_-28', 'Position_-27', 'Position_-26', 'Position_-25', 'Position_-24', 'Position_-23', 'Position_-22', 'Position_-21', 'Position_-20', 'Position_-19', 'Position_-18', 'Position_-17', 'Position_-16', 'Position_-15', 'Position_-14', 'Position_-13', 'Position_-12', 'Position_-11', 'Position_-10', 'Position_-9', 'Position_-8', 'Position_-7', 'Position_-6', 'Position_-5', 'Position_-4', 'Position_-3', 'Position_-2', 'Position_-1', 'Position_+1', 'Position_+2', 'Position_+3', 'Position_+4', 'Position_+5', 'Position_+6', 'Position_+7', 'Position_+8', 'Position_+9', 'Position_+10', 'Position_+11', 'Position_+12', 'Position_+13', 'Position_+14', 'Position_+15', 'Position_+16', 'Position_+17', 'Position_+18', 'Position_+19', 'Position_+20', 'Position_+21', 'Position_+22', 'Position_+23', 'Position_+24', 'Position_+25', 'Position_+26', 'Position_+27', 'Position_+28', 'Position_+29', 'Position_+30']


In [15]:
# Read the raw file and reshape it into one row per sample.
def split(word):
    """Return the characters of ``word`` as a list, one element per character."""
    return list(word)


# The context manager closes the file handle even if reading fails.
with open(os.path.join(input_dir, inp_fname), "r") as f:
    # Drop trailing newlines; skip blank/whitespace-only lines, which would
    # otherwise yield empty rows that cannot be framed against `cols`.
    lines = [line.rstrip('\n') for line in f if line.strip()]

all_items = []
for line in lines:
    # Remove the space padding, then break the record on commas.
    items = line.replace(" ", "").split(",")
    # The last field is a character string; expand it to one entry per
    # character so each character lands in its own column.
    items = items[:-1] + split(items[-1])
    all_items.append(items)

data_array = np.asarray(all_items)

data = pd.DataFrame(data_array, columns=cols)
data.head()

Unnamed: 0,Class,Instance,Position_-30,Position_-29,Position_-28,Position_-27,Position_-26,Position_-25,Position_-24,Position_-23,...,Position_+21,Position_+22,Position_+23,Position_+24,Position_+25,Position_+26,Position_+27,Position_+28,Position_+29,Position_+30
0,EI,ATRINS-DONOR-521,C,C,A,G,C,T,G,C,...,A,G,C,C,A,G,T,C,T,G
1,EI,ATRINS-DONOR-905,A,G,A,C,C,C,G,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,BABAPOE-DONOR-30,G,A,G,G,T,G,A,A,...,C,A,C,G,G,G,G,A,T,G
3,EI,BABAPOE-DONOR-867,G,G,G,C,T,G,C,G,...,G,G,T,T,T,T,C,C,C,C
4,EI,BABAPOE-DONOR-2817,G,C,T,C,A,G,C,C,...,C,C,T,T,G,A,C,C,C,T


In [16]:
# Display the (rows, columns) dimensions of the parsed frame.
data.shape

(3190, 62)

In [17]:
# Name of the column used as the per-row identifier.
id_col = 'Instance'
# Name of the column holding the class value.
target_col = 'Class'

# Remove duplicate instances

In [18]:
# Values in the id column repeat for some rows. Flag every repeat after the
# first occurrence and remove those rows from the frame in place.
repeat_mask = data.duplicated(subset=id_col, keep="first")
data.drop(index=data.index[repeat_mask], inplace=True)
data.shape

(3178, 62)

# Insert Id Column

In [19]:
# Add a sequential id column at the front only when the frame lacks one.
id_is_missing = id_col not in data.columns
if id_is_missing:
    N = len(data)
    data.insert(0, id_col, np.arange(N))
    print(data.head())

# Store ids as strings regardless of how the column was obtained.
data[id_col] = data[id_col].astype(str)

# Save Main Data File

In [20]:
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists first (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas row index.
data.to_csv(outp_fname, index=False)