In [1]:
import numpy as np
import csv
import json
from sklearn import preprocessing

## List Names and Filenames

In [2]:
# Subject identifiers; each one is the prefix (text before the '-') of that
# subject's recording file names.
names = ['Beam', 'Eye', 'Fluke', 'Fong', 'Joke',
         'Pod', 'Tau', 'Toey', 'Tong']

In [3]:
# Trial numbers available for each subject, in processing order.
# A list of pairs (rather than a dict) keeps the order explicit.
_trials_by_subject = [
    ('Beam', (1, 2, 3, 6, 7, 8)),
    ('Eye', (2, 3, 4, 5, 6, 7)),
    ('Fluke', (20, 21, 22, 23, 24, 25)),
    ('Fong', (3, 4, 5, 6, 7, 8)),
    ('Joke', (1, 2, 3, 4, 5, 6)),
    ('Pod', (21, 23, 24, 25, 26)),
    ('Tau', (13, 14, 15, 16, 17, 18)),
    ('Toey', (1, 2, 3, 4, 5, 6, 7)),
    ('Tong', (1, 2, 3, 4, 5, 6)),
]

# Base names (no directory, no extension) of every recording, e.g.
# 'Beam-t1 (1024Hz)'.
filenames = [
    '{}-t{} (1024Hz)'.format(subject, trial)
    for subject, trials in _trials_by_subject
    for trial in trials
]

## Attach Labels to Raw data

0: Noise<br>
1: MRCP<br>
2: Not used (sample not covered by any labelled segment)<br>

In [4]:
# Directory holding the raw recordings; cells below open
# raw_data_dir + <file name> + '.csv'.
raw_data_dir = './data/csv/'
# Directory holding the segment label files; cells below open
# label_dir + <file name> + '_SAMPLE _ LABEL.txt'.
label_dir = './data/label/MRCP_Noise/'
# Output directory. Every processing cell below reassigns out_dir before
# writing, so this value is only a default.
out_dir = './data/data_label/'

## Standardize Time Series Data

Standardizing a dataset involves rescaling the distribution of values so that the mean of observed values is 0 and the standard deviation is 1.

In [5]:
def standardize(X):
    """Rescale ``X`` to zero mean and unit standard deviation.

    Thin wrapper around ``sklearn.preprocessing.scale`` called with its
    default arguments.

    Parameters
    ----------
    X : array-like
        Data to standardize. The cells in this notebook pass a 2-D array
        of shape (samples, n_col).

    Returns
    -------
    The value returned by ``preprocessing.scale(X)``.
    """
    scaled = preprocessing.scale(X)
    return scaled

## Trim by file

In [8]:
# Label every sample of each recording and drop the first/last 2 seconds.
# Output: one CSV per recording holding the n_col signal columns plus a
# trailing label column (0 = Noise, 1 = MRCP, 2 = sample not covered by any
# labelled segment).
out_dir = './data/trim/'
n_col = 19
# Number of samples removed from each end of a recording (2 s at 1024 Hz).
# Used both for the slice and for shifting the label indices, so the two
# can never drift apart.
trim_padding = 2048
for filename in filenames:
    print('Processing ' + filename + '...')

    # Keep the first n_col fields of every data row and append the default
    # label 2 ("not used").
    with open(raw_data_dir + filename + '.csv', 'r') as f_raw_data:
        next(f_raw_data, None)  # Skip CSV header
        raw_data = []
        for line in f_raw_data:
            fields = line.split(',')
            raw_data.append([float(fields[j]) for j in range(n_col)] + [2.])
    np_raw_data = np.array(raw_data)

    # Trim first and last 2 seconds
    np_raw_data = np_raw_data[trim_padding:-trim_padding, :]
    n_samples = np_raw_data.shape[0]

    # Standardization is disabled for this output:
    # np_raw_data[:, np.arange(0, n_col, 1)] = standardize(np_raw_data[:, np.arange(0, n_col, 1)])

    with open(label_dir + filename + '_SAMPLE _ LABEL.txt', 'r') as f_label_data:
        reader = csv.reader(f_label_data, delimiter='\t')
        next(reader, None)  # Skip TSV header
        for segment, label in reader:
            start, end = segment.strip('[]').split(',')
            # Shift the inclusive [start, end] bounds into the trimmed
            # array's coordinates and clip them to the rows that survived
            # trimming. Without the clip, a negative index would wrap
            # around and label samples at the END of the recording, and an
            # index past the last row would raise IndexError.
            start = max(int(start) - trim_padding, 0)
            end = min(int(end) - trim_padding, n_samples - 1)
            if start > end:
                continue  # Segment lies entirely inside a trimmed region
            label_value = 1. if label == 'MRCP' else 0.
            np_raw_data[start:end + 1, -1] = label_value

    with open(out_dir + filename + '_data_label.csv', 'wb') as f_out:
        np.savetxt(f_out, np_raw_data, fmt='%f', delimiter=',')
print('Done.')

Processing Beam-t1 (1024Hz)...
Processing Beam-t2 (1024Hz)...
Processing Beam-t3 (1024Hz)...
Processing Beam-t6 (1024Hz)...
Processing Beam-t7 (1024Hz)...
Processing Beam-t8 (1024Hz)...
Processing Eye-t2 (1024Hz)...
Processing Eye-t3 (1024Hz)...
Processing Eye-t4 (1024Hz)...
Processing Eye-t5 (1024Hz)...
Processing Eye-t6 (1024Hz)...
Processing Eye-t7 (1024Hz)...
Processing Fluke-t20 (1024Hz)...
Processing Fluke-t21 (1024Hz)...
Processing Fluke-t22 (1024Hz)...
Processing Fluke-t23 (1024Hz)...
Processing Fluke-t24 (1024Hz)...
Processing Fluke-t25 (1024Hz)...
Processing Fong-t3 (1024Hz)...
Processing Fong-t4 (1024Hz)...
Processing Fong-t5 (1024Hz)...
Processing Fong-t6 (1024Hz)...
Processing Fong-t7 (1024Hz)...
Processing Fong-t8 (1024Hz)...
Processing Joke-t1 (1024Hz)...
Processing Joke-t2 (1024Hz)...
Processing Joke-t3 (1024Hz)...
Processing Joke-t4 (1024Hz)...
Processing Joke-t5 (1024Hz)...
Processing Joke-t6 (1024Hz)...
Processing Pod-t21 (1024Hz)...
Processing Pod-t23 (1024Hz)...
Pr

In [None]:
# my_data = np.genfromtxt(out_dir + filenames[0] + '_data_label.csv', delimiter = ',')

## Flatten data by file

In [None]:
# For every recording: standardize the signal columns, cut out each labelled
# segment, flatten it column-major and append its label (1 = MRCP, else 0).
# One output CSV per recording, one row per segment.
out_dir = './data/flatten/'
n_col = 19
for filename in filenames:
    print('Processing ' + filename + '...')

    # Parse the first n_col comma-separated fields of each data row.
    with open(raw_data_dir + filename + '.csv', 'r') as f_raw_data:
        f_raw_data.readline()  # Skip CSV header
        raw_data = [
            [float(fields[k]) for k in range(n_col)]
            for fields in (row.split(',') for row in f_raw_data)
        ]
    np_raw_data = np.array(raw_data)

    # Standardize features
    feature_cols = np.arange(n_col)
    np_raw_data[:, feature_cols] = standardize(np_raw_data[:, feature_cols])

    flatten_data = []
    with open(label_dir + filename + '_SAMPLE _ LABEL.txt', 'r') as f_label_data:
        label_rows = csv.reader(f_label_data, delimiter='\t')
        for index, (segment, label) in enumerate(label_rows):
            if index == 0:
                continue  # Skip TSV header

            # Segment is written as "[start,end]" with both ends inclusive.
            start, end = segment.strip('[]').split(',')
            sample_idx = np.arange(int(start), int(end) + 1)

            if label == 'MRCP':
                label_value = 1.
            else:
                label_value = 0.

            # 'F' flattens column-major: all samples of column 0 first,
            # then column 1, and so on.
            window = np_raw_data[sample_idx, :]
            flatten_data.append(np.append(window.flatten('F'), label_value))

    with open(out_dir + filename + '_flatten.csv', 'wb') as f_out:
        np.savetxt(f_out, flatten_data, fmt='%f', delimiter=',')
print('Done.')

In [None]:
# train_data = np.genfromtxt(out_dir + 'Tong-t6 (1024Hz)' + '_flatten.csv', delimiter = ',')

## Flatten data by name (Plain)

In [None]:
# For every subject: gather the flattened, labelled segments of all of that
# subject's recordings (no trimming) into a single CSV.
# NOTE(review): out_dir and the output file names are the same as in the
# "Flatten data by name (Trim)" cell, so running both cells overwrites one
# result with the other -- confirm whether this cell should write elsewhere.
out_dir = './data/flatten_individual_trim/'
n_col = 19
for subject in names:
    print('Processing ' + subject + '...')
    flatten_data_individual = []
    for filename in filenames:
        # File names look like '<subject>-t<trial> (1024Hz)'.
        name, _ = filename.split('-')
        if name != subject:
            continue
        print('  ' + filename + '...')

        # Parse the first n_col comma-separated fields of each data row.
        with open(raw_data_dir + filename + '.csv', 'r') as f_raw_data:
            f_raw_data.readline()  # Skip CSV header
            raw_data = [
                [float(fields[k]) for k in range(n_col)]
                for fields in (row.split(',') for row in f_raw_data)
            ]
        np_raw_data = np.array(raw_data)

        # Standardize features
        feature_cols = np.arange(n_col)
        np_raw_data[:, feature_cols] = standardize(np_raw_data[:, feature_cols])

        with open(label_dir + filename + '_SAMPLE _ LABEL.txt', 'r') as f_label_data:
            label_rows = csv.reader(f_label_data, delimiter='\t')
            for index, (segment, label) in enumerate(label_rows):
                if index == 0:
                    continue  # Skip TSV header

                # Segment is written as "[start,end]" with both ends inclusive.
                start, end = segment.strip('[]').split(',')
                sample_idx = np.arange(int(start), int(end) + 1)

                if label == 'MRCP':
                    label_value = 1.
                else:
                    label_value = 0.

                # 'F' flattens column-major: all samples of column 0 first,
                # then column 1, and so on.
                window = np_raw_data[sample_idx, :]
                flatten_data_individual.append(
                    np.append(window.flatten('F'), label_value)
                )

    with open(out_dir + subject + '_flatten.csv', 'wb') as f_out:
        np.savetxt(f_out, flatten_data_individual, fmt='%f', delimiter=',')
print('Done.')

## Flatten data by name (Trim)

In [7]:
# For every subject: drop the first/last 2 seconds of each recording,
# standardize the signal columns, then gather the flattened, labelled
# segments of all of that subject's recordings into a single CSV
# (one row per segment, label in the last column: 1 = MRCP, else 0).
out_dir = './data/flatten_individual_trim/'
n_col = 19
# Number of samples removed from each end of a recording (2 s at 1024 Hz).
# Used both for the slice and for shifting the label indices, so the two
# can never drift apart.
trim_padding = 2048
for subject in names:
    print('Processing ' + subject + '...')
    flatten_data_individual = []
    for filename in filenames:
        # File names look like '<subject>-t<trial> (1024Hz)'.
        name, _ = filename.split('-')
        if name != subject:
            continue
        print('  ' + filename + '...')

        # Keep the first n_col fields of every data row.
        with open(raw_data_dir + filename + '.csv', 'r') as f_raw_data:
            next(f_raw_data, None)  # Skip CSV header
            raw_data = []
            for line in f_raw_data:
                fields = line.split(',')
                raw_data.append([float(fields[k]) for k in range(n_col)])
        np_raw_data = np.array(raw_data)

        # Trim first and last 2 seconds
        np_raw_data = np_raw_data[trim_padding:-trim_padding, :]
        n_samples = np_raw_data.shape[0]

        # Standardize features (statistics are computed on the trimmed data)
        np_raw_data[:, :n_col] = standardize(np_raw_data[:, :n_col])

        with open(label_dir + filename + '_SAMPLE _ LABEL.txt', 'r') as f_label_data:
            reader = csv.reader(f_label_data, delimiter='\t')
            next(reader, None)  # Skip TSV header
            for segment, label in reader:
                start, end = segment.strip('[]').split(',')
                # Shift the inclusive [start, end] bounds into the trimmed
                # array's coordinates.
                start = int(start) - trim_padding
                end = int(end) - trim_padding

                # A segment that reaches into a trimmed region cannot be
                # extracted at full length. Indexing it anyway would wrap
                # negative indices around to the end of the recording
                # (silently producing a garbage row) or raise IndexError,
                # so skip it instead.
                if start < 0 or end >= n_samples:
                    print('    Skipping segment ' + segment + ' (outside trimmed range)')
                    continue

                label_value = 1. if label == 'MRCP' else 0.
                # 'F' means to flatten in column-major (Fortran-style) order.
                row = np.append(np_raw_data[start:end + 1, :].flatten('F'), label_value)
                flatten_data_individual.append(row)

    with open(out_dir + subject + '_flatten.csv', 'wb') as f_out:
        np.savetxt(f_out, flatten_data_individual, fmt='%f', delimiter=',')
print('Done.')

Processing Beam...
  Beam-t1 (1024Hz)...
  Beam-t2 (1024Hz)...
  Beam-t3 (1024Hz)...
  Beam-t6 (1024Hz)...
  Beam-t7 (1024Hz)...
  Beam-t8 (1024Hz)...
Processing Eye...
  Eye-t2 (1024Hz)...
  Eye-t3 (1024Hz)...
  Eye-t4 (1024Hz)...
  Eye-t5 (1024Hz)...
  Eye-t6 (1024Hz)...
  Eye-t7 (1024Hz)...
Processing Fluke...
  Fluke-t20 (1024Hz)...
  Fluke-t21 (1024Hz)...
  Fluke-t22 (1024Hz)...
  Fluke-t23 (1024Hz)...
  Fluke-t24 (1024Hz)...
  Fluke-t25 (1024Hz)...
Processing Fong...
  Fong-t3 (1024Hz)...
  Fong-t4 (1024Hz)...
  Fong-t5 (1024Hz)...
  Fong-t6 (1024Hz)...
  Fong-t7 (1024Hz)...
  Fong-t8 (1024Hz)...
Processing Joke...
  Joke-t1 (1024Hz)...
  Joke-t2 (1024Hz)...
  Joke-t3 (1024Hz)...
  Joke-t4 (1024Hz)...
  Joke-t5 (1024Hz)...
  Joke-t6 (1024Hz)...
Processing Pod...
  Pod-t21 (1024Hz)...
  Pod-t23 (1024Hz)...
  Pod-t24 (1024Hz)...
  Pod-t25 (1024Hz)...
  Pod-t26 (1024Hz)...
Processing Tau...
  Tau-t13 (1024Hz)...
  Tau-t14 (1024Hz)...
  Tau-t15 (1024Hz)...
  Tau-t16 (1024Hz)...
  Tau