In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, Input, TimeDistributed
from model.load_data import train_test_split, train_test_split_LSTM

from model.scoring_metrics import get_windiff, get_pk, get_k_kappa

from model_trainer_and_tester import read_in_dataset_lstm, test_set_evaluate_multiple_lstm
from tensorflow import keras

In [8]:
# Preview the symmetric shift windows used as LSTM context further down:
# a context size of i yields the offsets -i..i, centred on 0.
for i in range(4):
    temp = list(range(-i, i + 1))
    print(temp)

[0]
[-1, 0, 1]
[-2, -1, 0, 1, 2]
[-3, -2, -1, 0, 1, 2, 3]


In [11]:
# Context-size sweep: train one stacked bidirectional LSTM per context size
# (0..3 neighbouring segments on each side of the centre segment) and collect
# the mean / std of the test-set metrics for each size in `results_dict`.

# --- Settings shared by every model in the sweep ------------------------------
random_seed = 42       # fixed so a Restart & Run All starts from the same RNG state
batch_size = 64
hidden_units = 100     # units of each LSTM wrapped in a Bidirectional layer
n_epochs = 20
n_context_sizes = 4    # sweep context sizes 0 .. n_context_sizes - 1

# NOTE(review): LSTM_units is not referenced anywhere in this cell (the layers
# below are sized by hidden_units). Kept only in case another cell reads it.
LSTM_units = 20

# Features fed to the model. Per the original note, the remaining dataset columns
# (mainly the segID and the start and end times) are not relevant and are ignored.
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff', 'f0_stds_means']

# Maps context size -> Series of '<metric>_mean' / '<metric>_std' values.
results_dict = {}

# Seed numpy and TensorFlow once, before any model is built.
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

for i in range(n_context_sizes):
    # Symmetric window of i neighbours on each side, e.g. i=2 -> [-2, -1, 0, 1, 2].
    shifts = list(range(-i, i + 1))
    temp_features = all_features

    n_timesteps = len(shifts)
    feature_count = len(temp_features)

    X_train, Y_train = read_in_dataset_lstm(temp_features, shifts, to_read='train')

    # Class re-balancing via per-sample weights.
    # Sum the labels over axis 1 to count the 1s inside each training window.
    # NOTE(review): assumes Y_train is (samples, timesteps, 1) - TODO confirm.
    positives_per_window = np.sum(Y_train, axis=1)

    # Windows containing at least one positive label get n_timesteps times the
    # inverse positive-label frequency; the n_timesteps factor compensates for the
    # final evaluation only predicting with the centre value of each window.
    new_weight = n_timesteps * len(Y_train) / positives_per_window.sum()

    sample_weight = np.ones(shape=(len(Y_train),))
    # flatten() removes the trailing length-1 axis so the boolean mask is 1-D,
    # matching sample_weight.
    sample_weight[(positives_per_window >= 1).flatten()] = new_weight

    # Three stacked bidirectional LSTM layers. Only the first layer declares the
    # input shape; the following layers take their input from the layer before.
    model = Sequential()
    model.add(Bidirectional(LSTM(hidden_units, activation='tanh', return_sequences=True, dropout=0.3),
                            input_shape=(n_timesteps, feature_count)))
    model.add(Bidirectional(LSTM(hidden_units, activation='tanh', return_sequences=True, dropout=0.3)))
    model.add(Bidirectional(LSTM(hidden_units, activation='sigmoid', return_sequences=True, dropout=0.3)))
    # This last time distributed is super important: it emits one sigmoid output
    # per timestep, following the output structure of the paper this model is
    # based on.
    model.add(TimeDistributed(Dense(1, activation='sigmoid')))

    model.compile(loss='binary_crossentropy', optimizer='RMSprop',
                  metrics=[keras.metrics.Precision(name='precision'), keras.metrics.Recall(name='recall')],
                  weighted_metrics=[]
                  )

    # train the model
    print('Fitting model')
    history = model.fit(X_train, Y_train,
                        batch_size=batch_size,
                        epochs=n_epochs,
                        sample_weight=sample_weight,
                        validation_split=0.1,
                        verbose=0
                        )

    temp_results = test_set_evaluate_multiple_lstm(model, temp_features, shifts)

    # Collapse the evaluation results into one Series: column means and standard
    # deviations, suffixed so both can live in the same index.
    results_dict[i] = pd.concat([temp_results.mean().add_suffix('_mean'), temp_results.std().add_suffix('_std')])
    print(f"Finished context size {i}")

Fitting model
Finished context size 0
Fitting model
Finished context size 1
Fitting model
Finished context size 2
Fitting model
Finished context size 3


In [12]:
# Assemble the per-context-size summaries into a single table (one column per
# key of results_dict) and write it out as CSV.
results_path = 'BLSTM_context_results.csv'
results_df = pd.DataFrame(data=results_dict)
results_df.to_csv(results_path)