In [25]:
import os
import random
import statistics as st

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Dropout, Bidirectional, LSTM
from keras.models import Sequential
from scipy import stats
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import class_weight
from tensorflow.python.keras import regularizers

#import sys
#!{sys.executable} -m pip install keras-rectified-adam

In [16]:
def extrapolate_data(X_file, y_file):
    """
        Load one recording and repeat its labels so they line up with X.

        Every value in the first column of the label file is repeated 4
        times. The longer of the two (feature rows vs. repeated labels) is
        then trimmed from the end so both have the same number of rows.

        @param X_file: path to the feature CSV
        @param y_file: path to the label CSV (only its first column is used)
        @return tuple (df_X, labels_df); labels_df has a single column named 0
    """
    # Read in both CSV files
    df_X = pd.read_csv(X_file)
    df_y = pd.read_csv(y_file)

    # Repeat each label 4 times, keeping the original order.
    extrapolated_labels = np.repeat(df_y.iloc[:, 0].to_numpy(), 4)

    # Trim to the common length. Slicing with an explicit row count (instead
    # of a negative offset) also behaves correctly when the lengths already
    # match or when there are more labels than feature rows.
    n_rows = min(df_X.shape[0], extrapolated_labels.shape[0])
    # Copy so callers that modify the returned frame in place (as scale_data
    # does) work on an independent frame rather than on a slice.
    df_X = df_X.iloc[:n_rows, :].copy()
    extrapolated_labels_df = pd.DataFrame(extrapolated_labels[:n_rows])

    return df_X, extrapolated_labels_df

"""
    Standardize the values of X (zero mean and unit variance per column).
    
    @param df: input dataframe
    @param columns: columns to scale
    @return scaled dataframe
"""
def scale_data(df, columns):
    """
        Standardize the given columns of a dataframe in place.

        A StandardScaler is fitted on the selected columns of `df` itself and
        the scaled values are written back into `df`. This is mean/variance
        standardization; it is not an outlier-robust scaling.

        @param df: input dataframe (its selected columns are overwritten)
        @param columns: columns to scale
        @return the same dataframe object, with the selected columns scaled
    """
    # Fit and transform on the same ndarray so the scaler never sees a
    # DataFrame at fit time and a bare array at transform time.
    values = df[columns].to_numpy()
    df.loc[:, columns] = StandardScaler().fit_transform(values)
    return df

"""
    Create time-series data from our X and y.
"""
def mode_labels(X, y, time_step, step_size):
    """
        Cut X and y into sliding windows and label each window by its mode.

        Windows are `time_step` rows long and start at
        range(0, len(X) - time_step, step_size); the window that would start
        exactly at len(X) - time_step is therefore not produced. The label of
        a window is the most frequent value of y over the same rows.

        @param X: dataframe of features, one row per time step
        @param y: labels aligned row-for-row with X (dataframe or series)
        @param time_step: number of rows per window
        @param step_size: number of rows between consecutive window starts
        @return tuple (windows, labels) with shapes
                (n_windows, time_step, n_columns) and (n_windows, 1)
    """
    starts = range(0, len(X) - time_step, step_size)
    if len(starts) == 0:
        # Keep the documented ranks even when the input is too short for a
        # single window, so results can still be concatenated with others.
        empty_X = np.empty((0, time_step, X.shape[1]), dtype=np.asarray(X).dtype)
        empty_y = np.empty((0, 1), dtype=np.asarray(y).dtype)
        return empty_X, empty_y

    X_values = []
    y_values = []
    for i in starts:
        value = X.iloc[i:(i + time_step)].values
        labels = y.iloc[i:(i + time_step)]
        X_values.append(value)
        # The shape of the mode result differs between SciPy versions and
        # between 1-D and 2-D input; flatten it before taking the value.
        window_mode = stats.mode(np.asarray(labels))[0]
        y_values.append(np.asarray(window_mode).ravel()[0])
    return np.array(X_values), np.array(y_values).reshape(-1, 1)

def create_time_series_data(X_files, y_files, time_step, step_size):
    """
        Build windowed samples and labels from several recordings.

        For each entry of `y_files`, the feature file at the same position in
        `X_files` is loaded with extrapolate_data, scaled on its own with
        scale_data (all columns), and windowed with mode_labels. The results
        of all recordings are concatenated in list order.

        @param X_files: feature CSV paths
        @param y_files: label CSV paths, aligned by position with X_files
        @param time_step: number of rows per window
        @param step_size: number of rows between consecutive window starts
        @return tuple (all windows, all window labels) as numpy arrays
    """
    windows_per_file = []
    labels_per_file = []
    for idx, y_path in enumerate(y_files):
        features, labels = extrapolate_data(X_files[idx], y_path)
        features = scale_data(features, list(features.columns.values))
        windows, window_labels = mode_labels(features, labels, time_step, step_size)
        windows_per_file.append(windows)
        labels_per_file.append(window_labels)
    return np.concatenate(windows_per_file), np.concatenate(labels_per_file)

In [17]:
# Recordings ("<subject>_<session>") assigned to each split. The X and y path
# lists below are both derived from these, so the two lists of a split cannot
# drift out of step with each other.
TRAIN_RECORDINGS = ['001_01', '001_02', '001_03', '001_04', '001_05', '001_06',
                    '001_07', '002_02', '002_03', '002_04', '002_05', '003_01',
                    '003_02', '003_03', '004_01', '004_02', '005_01', '005_02',
                    '005_03', '006_01', '006_02', '007_02', '007_03', '007_04',
                    '008_01']
VAL_RECORDINGS = ['002_01', '001_08']
TEST_RECORDINGS = ['006_03', '007_01']

X_PATH_TEMPLATE = 'TrainingData/subject_{}__x.csv'
Y_PATH_TEMPLATE = 'TrainingData/subject_{}__y.csv'

# List of training, validation, and test X_files
X_files = [X_PATH_TEMPLATE.format(rec) for rec in TRAIN_RECORDINGS]
val_X_files = [X_PATH_TEMPLATE.format(rec) for rec in VAL_RECORDINGS]
test_X_files = [X_PATH_TEMPLATE.format(rec) for rec in TEST_RECORDINGS]

# List of training, validation, and test y_files
y_files = [Y_PATH_TEMPLATE.format(rec) for rec in TRAIN_RECORDINGS]
val_y_files = [Y_PATH_TEMPLATE.format(rec) for rec in VAL_RECORDINGS]
test_y_files = [Y_PATH_TEMPLATE.format(rec) for rec in TEST_RECORDINGS]

In [18]:
# Window every split the same way: 30-row windows, advanced one row at a time.
training_X, training_y = create_time_series_data(
    X_files, y_files, time_step = 30, step_size = 1)
val_X, val_y = create_time_series_data(
    val_X_files, val_y_files, time_step = 30, step_size = 1)
test_X, test_y = create_time_series_data(
    test_X_files, test_y_files, time_step = 30, step_size = 1)

# Report (samples, labels) shapes for training, validation, and test, in that order.
for split_X, split_y in ((training_X, training_y), (val_X, val_y), (test_X, test_y)):
    print(split_X.shape, split_y.shape)

(1126710, 30, 6) (1126710, 1)
(122696, 30, 6) (122696, 1)
(91260, 30, 6) (91260, 1)


In [21]:
# Save the training data to .npy files so we do not have to generate them repeatedly
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('processed_data', exist_ok=True)
np.save('processed_data/training_X.npy', training_X)
np.save('processed_data/training_y.npy', training_y)
np.save('processed_data/val_X.npy', val_X)
np.save('processed_data/val_y.npy', val_y)
np.save('processed_data/test_X.npy', test_X)
np.save('processed_data/test_y.npy', test_y)

In [22]:
# Loading the data back: one array per .npy file, read in the order listed
(training_X, training_y,
 val_X, val_y,
 test_X, test_y) = map(np.load, (
    'processed_data/training_X.npy',
    'processed_data/training_y.npy',
    'processed_data/val_X.npy',
    'processed_data/val_y.npy',
    'processed_data/test_X.npy',
    'processed_data/test_y.npy',
))

In [23]:
"""
    We handle the data imbalance by assigning higher weights to minority classes.

    @param training_X: training X data
    @param training_y: labels for training data
    @return dictionary of labels as key and weights as values
"""
def get_label_weights(training_X, training_y):
    """
        Compute 'balanced' class weights for the training labels.

        The returned keys are the positions 0..k-1 of the classes in the
        sorted array of unique labels, not the label values themselves.

        @param training_X: training X data (not used by the computation)
        @param training_y: labels for training data
        @return dictionary mapping class position to its weight
    """
    labels = training_y.ravel()
    classes = np.unique(labels)
    # Keyword arguments: recent scikit-learn releases no longer accept these
    # parameters positionally.
    weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=labels)
    return dict(enumerate(weights))

"""
    Perform one-hot encoding of the data to feed into our model.

    @param labels: labels of the training data
    @return one-hot encoded version of the labels
"""
def one_hot_encoding(labels, categories=None):
    """
        One-hot encode a column of labels as a dense array.

        A new encoder is fitted on `labels` on every call. By default the
        output columns are the labels present in this particular input, so
        two inputs with different label sets get different column layouts.
        Pass `categories` to pin the layout.

        @param labels: 2-D array of labels with a single column
        @param categories: optional sorted label values that define the
                           output columns; None lets the encoder infer them
        @return dense one-hot encoded version of the labels
    """
    # `sparse=False` is not accepted by current scikit-learn releases; keep
    # the encoder's default output and densify the result instead.
    encoder = OneHotEncoder(
        categories='auto' if categories is None else [np.asarray(categories)],
        handle_unknown='ignore',
    )
    encoded = encoder.fit_transform(labels)
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    return np.asarray(encoded)

In [24]:
# Get training label weights
label_weights = get_label_weights(training_X, training_y)
print(label_weights)

# one_hot_encoding fits a fresh encoder per call, so the encoded columns of
# the three splits only line up when every split contains the same labels.
# Fail loudly instead of training/evaluating against misaligned columns.
training_classes = np.unique(training_y)
for split_name, split_labels in (('validation', val_y), ('test', test_y)):
    split_classes = np.unique(split_labels)
    if not np.array_equal(split_classes, training_classes):
        raise ValueError(
            'The {} labels {} differ from the training labels {}; '
            'their one-hot columns would not line up.'.format(
                split_name, split_classes.tolist(), training_classes.tolist()))

# Perform one-hot encoding on all labels
training_y_encoded = one_hot_encoding(training_y)
val_y_encoded = one_hot_encoding(val_y)
test_y_encoded = one_hot_encoding(test_y)
print(training_y_encoded.shape, val_y_encoded.shape, test_y_encoded.shape)



{0: 0.3258715475333822, 1: 5.860711164745537, 2: 4.427916810764926, 3: 1.8697229376310969}
(1126710, 4) (122696, 4) (91260, 4)


In [26]:
"""
    Calculate recall from predicted and actual values.

    @param y_true: actual y values
    @param y_pred: predicted y values
"""
def recall_measure(y_true, y_pred):
    """
        Recall over all elements of the given tensors.

        The element-wise product y_true * y_pred and y_true itself are each
        clipped to [0, 1] and rounded before being summed.

        @param y_true: actual y values
        @param y_pred: predicted y values
        @return summed true positives / (summed actual positives + K.epsilon())
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

"""
    Calculate precision from predicted and actual values.

    @param y_true: actual y values
    @param y_pred: predicted y values
"""
def precision_measure(y_true, y_pred):
    """
        Precision over all elements of the given tensors.

        The element-wise product y_true * y_pred and y_pred itself are each
        clipped to [0, 1] and rounded before being summed.

        @param y_true: actual y values
        @param y_pred: predicted y values
        @return summed true positives / (summed predicted positives + K.epsilon())
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

"""
    Calculate F1-score from predicted and actual values.

    @param y_true: actual y values
    @param y_pred: predicted y values
"""
def f1(y_true, y_pred):
    """
        F1-score built from precision_measure and recall_measure.

        @param y_true: actual y values
        @param y_pred: predicted y values
        @return 2 * precision * recall / (precision + recall + K.epsilon())
    """
    p = precision_measure(y_true, y_pred)
    r = recall_measure(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [39]:
# Input is (time steps, features) per sample; output is one unit per one-hot column.
n_timesteps, n_features = training_X.shape[1:3]
n_outputs = training_y_encoded.shape[1]

# Bidirectional LSTM -> dropout -> dense ReLU -> softmax over the classes.
model = Sequential([
    Bidirectional(LSTM(units = 125), input_shape = (n_timesteps, n_features)),
    Dropout(rate = 0.5),
    Dense(units = 125, activation = 'relu'),
    Dense(n_outputs, activation = 'softmax'),
])
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['acc', f1, precision_measure, recall_measure],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_6 (Bidirection (None, 250)               132000    
_________________________________________________________________
dropout_6 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 125)               31375     
_________________________________________________________________
dense_12 (Dense)             (None, 4)                 504       
Total params: 163,879
Trainable params: 163,879
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train with per-class weights (label_weights) and report validation metrics
# after every epoch; training samples are shuffled each epoch.
history = model.fit(
    x = training_X,
    y = training_y_encoded,
    epochs = 10,
    batch_size = 64,
    validation_data = (val_X, val_y_encoded),
    class_weight = label_weights,
    verbose = 1,
    shuffle = True,
)

Train on 1126710 samples, validate on 122696 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
# Held-out test split. The result lists the loss first, then the metrics in
# the order they were passed to compile().
model.evaluate(x = test_X, y = test_y_encoded)



[0.3247964921183148,
 0.9340236686390533,
 0.9339926425365145,
 0.9342043334288789,
 0.9337935568704799]

In [43]:
# Validation split. The result lists the loss first, then the metrics in the
# order they were passed to compile().
model.evaluate(x = val_X, y = val_y_encoded)



[3.426590192157689,
 0.6226119840907609,
 0.6218399181694828,
 0.6228983695602109,
 0.6208596857273261]

In [44]:
# Get predictions using test data
test_files = ['TestData/subject_009_01__x.csv', 'TestData/subject_010_01__x.csv', 
              'TestData/subject_011_01__x.csv', 'TestData/subject_012_01__x.csv']

y_files = ['TestData/subject_009_01__y_time.csv', 'TestData/subject_010_01__y_time.csv',
           'TestData/subject_011_01__y_time.csv', 'TestData/subject_012_01__y_time.csv']

prediction_files = ['subject_009_01__y_prediction.csv', 'subject_010_01__y_prediction.csv',
                    'subject_011_01__y_prediction.csv', 'subject_012_01__y_prediction.csv']

In [46]:
def create_dataset(X, time_steps, step):
    """
        Cut a dataframe into sliding windows of rows.

        Windows are `time_steps` rows long and start at
        range(0, len(X) - time_steps, step), so the window that would start
        exactly at len(X) - time_steps is not produced.

        @param X: dataframe of features, one row per time step
        @param time_steps: number of rows per window
        @param step: number of rows between consecutive window starts
        @return numpy array holding one window per entry
    """
    window_starts = range(0, len(X) - time_steps, step)
    return np.array([X.iloc[start:(start + time_steps)].values
                     for start in window_starts])

def get_majority(y):
    """
        Reduce a 1-D array to one value per consecutive group of 4 entries.

        Each group is replaced by its most frequent value; when several values
        are equally frequent, the one that appears first in the group wins.
        A shorter final group is handled the same way.

        @param y: array of predicted class indices
        @return numpy array with one value per group
    """
    groups = (list(y[start:start + 4]) for start in range(0, y.shape[0], 4))
    return np.array([max(group, key = group.count) for group in groups])

# Write one prediction file per hidden-test recording.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("C2_predictions", exist_ok=True)

for test_file, y_time_file, prediction_file in zip(test_files, y_files, prediction_files):
    input_data = pd.read_csv(test_file)
    df = scale_data(input_data, list(input_data.columns.values))
    y_frame = pd.read_csv(y_time_file)

    # create_dataset yields len(df) - 30 windows and get_majority folds every
    # 4 window predictions into one, so 4 * rows(y_frame) + 30 feature rows
    # give exactly one prediction per row of y_frame.
    needed_rows = y_frame.shape[0] * 4 + 30
    addl = needed_rows - df.shape[0]
    if addl > 0:
        # Too few rows: pad by repeating the trailing rows.
        df = pd.concat([df, df.iloc[-addl:]])
    elif addl < 0:
        # Too many rows: keep only what is needed.
        # NOTE(review): confirm that dropping the surplus trailing rows is the
        # desired handling for recordings longer than their y_time file.
        df = df.iloc[:needed_rows]

    X_test = create_dataset(df, 30, 1)
    y_test = model.predict(X_test, batch_size = 64, verbose = 1)
    y_test_bool = np.argmax(y_test, axis = 1)
    y_actual = get_majority(y_test_bool)
    print(y_actual.size)
    y_series = pd.Series(y_actual)
    y_series.to_csv("C2_predictions/" + prediction_file)

9497
12269
12939
11329


finish testing subject_009_01__y_prediction.csv
finish testing subject_010_01__y_prediction.csv
finish testing subject_011_01__y_prediction.csv
finish testing subject_012_01__y_prediction.csv
# Final F1-score on hidden test data is 0.8602899935107109