In [1]:
import os
from pathlib import Path
import IPython.display as ipd
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
from IPython import display
import time
from datetime import datetime, timedelta
import tensorflow as tf
import seaborn as sns
import subprocess
from sklearn import metrics
%matplotlib inline

# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# NOTES
# Free-form run note; not referenced in the visible part of the notebook.
NOTES = "28x28"

# VARS
# Basename (without ".csv") of the training CSV under ROOT/train_csv/.
INPUT_CSV = "keepers_01_01_1234"
# Allowed values of the "Category" column; CATEGORY[0] also picks the run directory name.
CATEGORY = ["no_voice"]
ALL_LABELS = ['zero', 'one', 'two', 'three', 'four', 
            'five', 'six', 'seven', 'eight', 'nine',
            'left', 'right', 'stop', 'go', 'up','down']
LABELS = ALL_LABELS[:] # choose labels to train on
# Channel numbers; channel n is read from the CSV column "Path{n}".
CHANNELS = [1,2,3,4]
# Channel numbers concatenated into a string, used in model/log directory names.
NUMS = ''.join([str(x) for x in CHANNELS])
# Allowed values of the "Month" / "Day" columns; the first entries also name the run directory.
MONTHS = list(range(1,13))
DAYS = list(range(1,32))

# CSV column that holds the class label.
target_label = "Label"
# The next three names are not referenced in the visible part of the notebook.
id_label = "fname"
OUTSTR = "A{:.4f}_Type{}_UDR{}_THRESH{:5.3f}_DROP{}_LR{}_S{}_B{}{}.csv"
IMG_EXT = ".png"
# VERBOSE gates the extra data summaries; DISPLAY gates the loss/accuracy plots.
VERBOSE = True
DISPLAY = True
# TEST switches to the small, fast hyper-parameter set below and truncates the data.
TEST = False
MFCC = False
# TPU selects TPUEstimator / CrossShardOptimizer and disables the histogram summaries.
TPU = False
# When RESIZE is set, decoded images are resized from INPUT_* to 28x28.
RESIZE = True
INPUT_WIDTH = 128
INPUT_HEIGHT = 128
TARGET_WIDTH = 28 if RESIZE else INPUT_WIDTH
TARGET_HEIGHT = 28 if RESIZE else INPUT_HEIGHT
# Decay factor of the exponential learning-rate schedule in model_fn.
DECAY_RATE = 0.9
# Number of colour channels requested when decoding each image file.
IMG_CHANNELS = 3
# Dropout rate applied after the dense layer during training.
DROPOUT = 0.4
TYPE = "CNN"
DEFAULT_BS = 128 # default batch size
UNK_DROP_RATE = 1.0 # drop 100% of unknown categories

# Hyper-parameters. SPP is the number of training steps per reporting period and
# also the decay_steps of the learning-rate schedule. TEST_SIZE exists only in
# the TEST branch; it is read only under `if TEST`.
if TEST:
    LEARNING_STEPS = 100
    SPP = 4
    LEARNING_RATE = .05
    BATCH_SIZE = 32
    VERBOSITY = 1000
    TEST_SIZE = 1000
    SHUFFLE_SIZE = 64
else:
    LEARNING_STEPS = 10000
    SPP = 200
    LEARNING_RATE = .2 # 0.025 for 2 labels
    BATCH_SIZE = 128
    VERBOSITY = 1000
    SHUFFLE_SIZE = 256

def curr_time():
    """Return ``datetime.now()`` shifted back by seven hours.

    NOTE(review): the original comment describes this as a UTC -> PST offset,
    but ``datetime.now()`` is the machine's local time and PST is UTC-8;
    confirm the intended time zone handling.
    """
    shift = timedelta(hours=7)
    local_now = datetime.now()
    return local_now - shift

# Everything produced by a run lives under <cwd>/models/<run directory>/.
ROOT = os.getcwd() + "/"
# The run directory is named after the first category and the first selected
# month/day.
if CATEGORY[0] == "no_voice":
    RUN_ROOT = ROOT+"models/"+"NONVOCAL_RUNS_YN_{:02}_{:02}/".format(MONTHS[0], DAYS[0])
else:
    RUN_ROOT = ROOT + "models/" + "VOCAL_RUNS_YN_{:02}_{:02}/".format(MONTHS[0], DAYS[0])
RUN_ROOT_LOG = RUN_ROOT + "logs/"

# PATHS
# "Training": input CSV; "Model": Estimator model_dir (one per channel set);
# "Logs": per-run directory stamped with curr_time(); "Log": text log inside it.
paths = {
    "Training":ROOT + "train_csv/" + INPUT_CSV + ".csv",
    "Model":RUN_ROOT+"model_dir_{}/".format(NUMS),
    "Logs":RUN_ROOT_LOG+"{}_{}/".format(NUMS, datetime.strftime(curr_time(), "%b%d%Y_%H%M%S"))
}
paths["Log"] = paths["Logs"] + "log.txt"

# paths["Logs"] sits below RUN_ROOT_LOG and RUN_ROOT, so a single makedirs call
# creates the whole chain, including a missing "models/" parent that os.mkdir
# would have failed on. exist_ok makes re-running the cell harmless.
os.makedirs(paths["Logs"], exist_ok=True)


compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6



In [2]:
def make_header(s):
    """Return a three-line banner: a rule of 42 '#', `s` centred in 42 columns, and a second rule."""
    rule = "#" * 42
    return "{}\n{:^42}\n{}".format(rule, s, rule)
    
def print_and_log(s):
    """Append ``str(s)`` plus a newline to the run log (``paths["Log"]``), then print `s`."""
    with open(paths["Log"], 'a') as log:
        log.write(str(s) + "\n")
    print(s)
        
def print_and_log_header(s):
    """Render ``str(s)`` as a banner with make_header, then log and print it."""
    # print_and_log appends the banner plus a newline to the log and prints it,
    # which is exactly what this function used to do inline.
    print_and_log(make_header(str(s)))

In [3]:
def sec_to_str(secs):
    """Format a duration given in seconds as ``DD:HH:MM:SS.mmm``.

    Args:
        secs: Non-negative duration in seconds (int or float).

    Returns:
        A string with zero-padded days, hours, minutes and seconds, followed
        by exactly three millisecond digits.
    """
    # Work in whole milliseconds. The previous "{:.3}" formatting kept three
    # *significant* digits of the fraction, which produced ".0", ".5" or even
    # "-05" (for 1e-5) rather than a fixed-width millisecond field, and lost
    # the carry when the fraction rounded up to 1.0.
    total_ms = int(round(secs * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_secs, 60)
    total_hours, minutes = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)
    return "{:02}:{:02}:{:02}:{:02}.{:03}".format(days, hours, minutes, seconds, millis)

def timer(f, *args):
    """Call ``f(*args)``, logging start time, end time and elapsed duration.

    Returns whatever `f` returns.
    """
    print_and_log("Start: {}".format(curr_time()))
    began = time.time()
    outcome = f(*args)
    elapsed = time.time() - began
    print_and_log("End: {}".format(curr_time()))
    print_and_log("Finished in {}".format(sec_to_str(elapsed)))
    return outcome

def preprocess(samples, sample_rate):
    """Return a float array of exactly `sample_rate` entries.

    `samples` is truncated to its first `sample_rate` entries and, if shorter,
    right-padded with zeros.
    """
    clipped = samples[:sample_rate]
    kept = clipped.shape[0]
    out = np.zeros(sample_rate)
    out[:kept] = clipped
    return out

def select_labels(df, allowed):
    """Keep only the rows of `df` whose 'Label' value is in `allowed`."""
    keep = df['Label'].isin(allowed)
    return df[keep]
    
def select_categories(df, allowed):
    """Keep only the rows of `df` whose 'Category' value is in `allowed`."""
    keep = df['Category'].isin(allowed)
    return df[keep]

def select_channels(df, allowed):
    """Drop the columns 'Path1' .. 'Path8' whose channel number is not in `allowed`."""
    unwanted = ["Path{}".format(number) for number in range(1, 9) if number not in allowed]
    return df.drop(unwanted, axis=1)

def select_days(df, allowed):
    """Keep only the rows of `df` whose 'Day' value is in `allowed`."""
    keep = df['Day'].isin(allowed)
    return df[keep]

def select_months(df, allowed):
    """Keep only the rows of `df` whose 'Month' value is in `allowed`."""
    keep = df['Month'].isin(allowed)
    return df[keep]

def select_sets(df, allowed):
    """Keep only the rows of `df` whose 'Set' value is in `allowed`."""
    keep = df['Set'].isin(allowed)
    return df[keep]

def remove_voice(df):
    """Return `df` without its 'Path4' column.

    NOTE(review): the function name suggests channel 4 carries the voice
    recording; confirm against the data description.
    """
    dropped = ["Path4"]
    return df.drop(dropped, axis=1)

def str_to_l(x):
    """Convert every item of `x` that sorts between '0' and '9' to an int; skip the rest."""
    digits = []
    for item in x:
        if '0' <= item <= '9':
            digits.append(int(item))
    return digits

In [4]:
# Module-level counter advanced by every Python-level call of _parse_function.
count = 0
def _parse_function(label, *filenames):
    """Build the image tensor for one example from its per-channel image files.

    Each file in `filenames` is read, decoded with IMG_CHANNELS colour
    channels, converted to float32, reshaped to
    (1, INPUT_HEIGHT, INPUT_WIDTH, IMG_CHANNELS), converted to grayscale and,
    when RESIZE is set, bicubically resized to (TARGET_HEIGHT, TARGET_WIDTH).
    The resulting single-channel planes are concatenated along axis 3, one
    plane per file, in the order the files were given.

    Args:
        label: Passed through unchanged as the second return value.
        *filenames: One image path tensor per channel.

    Returns:
        ``(image, label)``; `image` is None when no filenames are given.
    """
    global count
    count += 1
    # NOTE(review): Dataset.map normally traces the mapped function once while
    # building the graph instead of calling it per element, so this counter
    # presumably advances once per map() call and the progress message is
    # unlikely to fire -- confirm.
    if count % VERBOSITY == 0:
        print_and_log("\tProcessed {}th image".format(count))
    expected_shape = tf.constant([1, INPUT_HEIGHT, INPUT_WIDTH, IMG_CHANNELS])
    image = None
    for filename in filenames:
        image_string = tf.read_file(filename)
        image_decoded = tf.image.decode_image(image_string, channels=IMG_CHANNELS)
        image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)
        # Reshaping to a constant shape fixes the static shape of the decoded image.
        image_decoded = tf.reshape(image_decoded, expected_shape)
        image_decoded = tf.image.rgb_to_grayscale(image_decoded)
        if RESIZE:
            image_decoded = tf.image.resize_bicubic(image_decoded, [TARGET_HEIGHT, TARGET_WIDTH])
        # Stack this file's grayscale plane behind the planes collected so far.
        if image is not None:
            image = tf.concat([image, image_decoded], 3)
        else:
            image = image_decoded
    return image, label

In [5]:
def model_fn(features, labels, mode):
    """Estimator model function: a two-block CNN classifier.

    Architecture: input reshaped to
    (-1, TARGET_HEIGHT, TARGET_WIDTH, len(CHANNELS)); two blocks of 5x5
    same-padded ReLU convolution (32, then 64 filters), each followed by 2x2
    max pooling with stride 2; flatten; a 1024-unit ReLU dense layer; dropout
    at rate DROPOUT (active only in TRAIN mode); a linear layer producing
    `num_labels` logits. `num_labels` is a module-level name defined in a
    later cell.

    Args:
        features: Batch of image tensors.
        labels: Integer class ids; used only in TRAIN and EVAL modes.
        mode: A ``tf.estimator.ModeKeys`` value.

    Returns:
        A ``tf.estimator.EstimatorSpec``: predictions for PREDICT, loss and
        train_op for TRAIN, loss and an accuracy metric otherwise.
    """
    input_layer = tf.reshape(features, [-1, TARGET_HEIGHT, TARGET_WIDTH, len(CHANNELS)])
    pool = input_layer

    # Convolution + pooling blocks; `pool` carries the running activation.
    for num_filters in [32, 64]:
        conv = tf.layers.conv2d(
            inputs=pool,
            filters=num_filters,
            kernel_size=[5, 5],
            padding="same",
            activation=tf.nn.relu)
        pool = tf.layers.max_pooling2d(inputs=conv, pool_size=[2, 2], strides=2)

    # Dense Layer
    pool = tf.layers.flatten(pool)
    dense = tf.layers.dense(inputs=pool, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=DROPOUT, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=num_labels)
    
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    # Histogram summaries are skipped when running on TPU.
    if not TPU:
        tf.summary.histogram("predictions", predictions["probabilities"])
        tf.summary.histogram("classes", predictions["classes"])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Staircase schedule: the rate is multiplied by DECAY_RATE every SPP global steps.
    learning_rate = tf.train.exponential_decay(LEARNING_RATE, tf.train.get_global_step(), SPP, DECAY_RATE, staircase=True)
    
    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    # Stored in the dict only; `predictions` is not passed to the TRAIN/EVAL specs below.
    predictions["loss"] = loss
    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        if TPU:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [6]:
def create_training_input_fn(dataset, batch_size, num_epochs=None):
    """Build the input function used for training.

    Args:
        dataset: Dataset of ``(features, label)`` elements.
        batch_size: Number of elements per batch.
        num_epochs: Repeat count handed to ``Dataset.repeat``; None repeats
            indefinitely. This value is now the default of the returned
            function; previously the inner default of None shadowed it and the
            argument was silently ignored.

    Returns:
        A callable ``_input_fn(num_epochs=<outer value>, shuffle=True)`` that
        returns one ``(feature_batch, label_batch)`` pair from a one-shot
        iterator.
    """
    default_epochs = num_epochs

    def _input_fn(num_epochs=default_epochs, shuffle=True):
        ds = dataset.batch(batch_size).repeat(num_epochs)
        # NOTE(review): shuffling after batch() reorders whole batches, not
        # individual examples; confirm that this is intended.
        if shuffle:
            ds = ds.shuffle(SHUFFLE_SIZE)
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch
    return _input_fn

def create_predict_input_fn(dataset, batch_size):
    """Build an input function that batches `dataset` once, without repeating or shuffling.

    The returned callable yields one ``(feature_batch, label_batch)`` pair
    from a one-shot iterator.
    """
    def _input_fn():
        iterator = dataset.batch(batch_size).make_one_shot_iterator()
        features, targets = iterator.get_next()
        return features, targets
    return _input_fn

def train_helper(steps_per_period):
    """Train the module-level `classifier` for one period, then evaluate it.

    Evaluation runs on the training set first, then on the validation set.

    Returns:
        ``(classifier, train_loss, valid_loss, train_acc, valid_acc)`` with
        the accuracies expressed as percentages.
    """
    classifier.train(input_fn=train_input_fn, steps=steps_per_period)
    train_metrics = classifier.evaluate(input_fn=training_eval_input_fn)
    valid_metrics = classifier.evaluate(input_fn=validation_eval_input_fn)
    return (
        classifier,
        train_metrics["loss"],
        valid_metrics["loss"],
        100 * train_metrics["accuracy"],
        100 * valid_metrics["accuracy"],
    )

def train():
    """Run the full training schedule and collect per-period metrics.

    LEARNING_STEPS steps are split into periods of roughly SPP steps. After
    each period the model is evaluated on the training and validation sets
    and the results are logged.

    Returns:
        ``(classifier, v_accuracy, t_lls, v_lls, t_accs, v_accs)`` where
        `v_accuracy` is the validation accuracy (percent) of the last period
        and the lists hold one loss / accuracy value per period.

    Raises:
        ValueError: If LEARNING_STEPS is smaller than 1.
    """
    if LEARNING_STEPS < 1:
        raise ValueError("LEARNING_STEPS must be at least 1, got {}".format(LEARNING_STEPS))
    # At least one period, so LEARNING_STEPS < SPP no longer ends in a
    # ZeroDivisionError (and an empty metrics list) below.
    periods = max(1, LEARNING_STEPS // SPP)
    steps_per_period = LEARNING_STEPS // periods
    t_accs, v_accs = [], []
    t_lls, v_lls = [], []
    print_and_log("Training model...\nMetrics:")
    print_and_log("\tPERIOD\tRATE\tTYPE\tTRAIN.\tVALID.\tTIME")
    for period in range(periods):
        # Mirror the staircase schedule in model_fn: one decay per SPP global
        # steps completed before this period starts. This equals `period` when
        # steps_per_period == SPP.
        # NOTE(review): assumes training starts at global step 0; resuming from
        # a checkpoint in model_dir would make the real rate lower.
        decays = (period * steps_per_period) // SPP
        lr = LEARNING_RATE * (DECAY_RATE ** decays)
        classifier, t_ll, v_ll, t_acc, v_acc = train_helper(steps_per_period)
        print_and_log("\t{}\t{:.5f}\tLgLs\t{:.2f}\t{:.2f}\t{}".format(period, lr, t_ll, v_ll, curr_time()))
        print_and_log("\t\t\tAcc.\t{:.2f}%\t{:.2f}%\n".format(t_acc, v_acc))
        t_lls.append(t_ll)
        v_lls.append(v_ll)
        t_accs.append(t_acc)
        v_accs.append(v_acc)
    v_accuracy = v_accs[-1]
    return classifier, v_accuracy, t_lls, v_lls, t_accs, v_accs

In [7]:
# Start a fresh log for this run: mode 'w' discards any previous content.
with open(paths["Log"], 'w') as log:
    # The newline belongs after the banner. Inside the title it was centred as
    # part of the text, which split the title line and left the banner without
    # a trailing newline, so the next log entry ran onto the same line.
    log.write(make_header("Starting Script") + "\n")

In [8]:
# Location of the CSV that lists every example
train_csv = paths["Training"]

# Label vocabulary: label name -> integer id, plus the inverse mapping
all_labels = LABELS
num_labels = len(LABELS)# - 1
labels = {name: idx for idx, name in enumerate(LABELS)}
reverse_lookup = {idx: name for name, idx in labels.items()}

In [9]:
# Show the integer-id -> label-name mapping built from LABELS.
print(reverse_lookup)

{0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'left', 11: 'right', 12: 'stop', 13: 'go', 14: 'up', 15: 'down'}


In [10]:
# Load the example table and narrow it down to the configured subset
print_and_log_header("MAKING TRAINING DATA")

# Filters are applied in order: category, label, month, day. Channel filtering
# (select_channels) and remove_voice are deliberately not applied here. The
# rows are then shuffled and re-indexed from 0.
train_data = (
    pd.read_csv(train_csv)
    .pipe(select_categories, CATEGORY)
    .pipe(select_labels, all_labels)
    .pipe(select_months, MONTHS)
    .pipe(select_days, DAYS)
    .sample(frac=1)
    .reset_index(drop=True)
)

# Replace label names by their integer ids
train_data["Label"] = train_data["Label"].map(labels)

if VERBOSE:
    print_and_log_header("TRAIN DATA")
    print_and_log(train_data.describe())
    print_and_log(train_data.head(10))

##########################################
           MAKING TRAINING DATA           
##########################################
##########################################
                TRAIN DATA                
##########################################
         Unnamed: 0           Day         Month         Label  SequenceNumber
count  10917.000000  10917.000000  10917.000000  10917.000000    10917.000000
mean    5458.000000     16.173491      6.377576      7.391591       31.575799
std     3151.610779     12.143952      5.498889      4.623246       18.176833
min        0.000000      2.000000      1.000000      0.000000        0.000000
25%     2729.000000      4.000000      1.000000      3.000000       16.000000
50%     5458.000000      7.000000      1.000000      7.000000       32.000000
75%     8187.000000     28.000000     12.000000     11.000000       47.000000
max    10916.000000     31.000000     12.000000     15.000000       64.000000
   Unnamed: 0  Category  Day  Month  Lab

In [11]:
# When VERBOSE is set, log and print the first ten rows of train_data.
if VERBOSE:
    print_and_log(train_data.head(10))

   Unnamed: 0  Category  Day  Month  Label  SequenceNumber       Set  \
0        1678  no_voice   28     12     10              41  Training   
1        8962  no_voice    4      1      1               0  Training   
2        5525  no_voice    6      1      5               2  Training   
3       10032  no_voice    2      1      2              31  Training   
4       10265  no_voice   27     12      1              37  Training   
5        2993  no_voice    7      1      3               6  Training   
6          72  no_voice   26     12     15              23  Training   
7        8872  no_voice   30     12      1              50  Training   
8         463  no_voice   27     12      7              30  Training   
9        1485  no_voice   27     12     14              21  Training   

                                               Path1  \
0  /Users/kyy/cerebro_train/images_scaled/subvoca...   
1  /Users/kyy/cerebro_train/images_scaled/subvoca...   
2  /Users/kyy/cerebro_train/images_scal

In [12]:
# In TEST mode, work on the first TEST_SIZE rows only
if TEST:
    train_data = train_data.head(TEST_SIZE)

In [13]:
# Partition the rows by their pre-assigned "Set" column (nominally an 80/10/10
# training/validation/test split). The list is built in full before any name is
# rebound, so all three selections read the unsplit frame.
validation_data, test_data, train_data = [
    select_sets(train_data, [split_name])
    for split_name in ("Validation", "Testing", "Training")
]

In [14]:
# Print the rows kept for training (Set == "Training").
print(train_data)

       Unnamed: 0  Category  Day  Month  Label  SequenceNumber       Set  \
0            1678  no_voice   28     12     10              41  Training   
1            8962  no_voice    4      1      1               0  Training   
2            5525  no_voice    6      1      5               2  Training   
3           10032  no_voice    2      1      2              31  Training   
4           10265  no_voice   27     12      1              37  Training   
5            2993  no_voice    7      1      3               6  Training   
6              72  no_voice   26     12     15              23  Training   
7            8872  no_voice   30     12      1              50  Training   
8             463  no_voice   27     12      7              30  Training   
9            1485  no_voice   27     12     14              21  Training   
10           7626  no_voice    3      1     10              61  Training   
11           1925  no_voice    6      1      1               0  Training   
12          

In [15]:
# Image paths of the first selected channel for the test rows (store the png filenames for output)
ids = test_data["Path{}".format(CHANNELS[0])]
if VERBOSE:
    print_and_log_header("IDS")
    for summary in (ids.describe(), ids.head(10)):
        print_and_log(summary)

##########################################
                   IDS                    
##########################################
count                                                  1036
unique                                                 1036
top       /Users/kyy/cerebro_train/images_scaled/subvoca...
freq                                                      1
Name: Path1, dtype: object
13    /Users/kyy/cerebro_train/images_scaled/subvoca...
31    /Users/kyy/cerebro_train/images_scaled/subvoca...
38    /Users/kyy/cerebro_train/images_scaled/subvoca...
40    /Users/kyy/cerebro_train/images_scaled/subvoca...
43    /Users/kyy/cerebro_train/images_scaled/subvoca...
47    /Users/kyy/cerebro_train/images_scaled/subvoca...
57    /Users/kyy/cerebro_train/images_scaled/subvoca...
63    /Users/kyy/cerebro_train/images_scaled/subvoca...
69    /Users/kyy/cerebro_train/images_scaled/subvoca...
76    /Users/kyy/cerebro_train/images_scaled/subvoca...
Name: Path1, dtype: object


In [16]:
# Columns that hold the image path of each selected channel
img_paths = ["Path{}".format(channel) for channel in CHANNELS]

# For each split: pull the target column out, then keep only the image paths
train_labels = train_data.pop(target_label)
train_data = train_data.loc[:, img_paths]

validation_labels = validation_data.pop(target_label)
validation_data = validation_data.loc[:, img_paths]

l_test_labels = test_data.pop(target_label)
test_data = test_data.loc[:, img_paths]

In [17]:
# One filename tensor per selected channel, for each split
channel_columns = ["Path{}".format(channel) for channel in CHANNELS]
t_f = [tf.constant(train_data[column]) for column in channel_columns]
v_f = [tf.constant(validation_data[column]) for column in channel_columns]
s_f = [tf.constant(test_data[column]) for column in channel_columns]

# Label tensors; entry i is the label of the images named by entry i of each
# filename tensor
train_labels = tf.constant(train_labels)
validation_labels = tf.constant(validation_labels)
test_labels = tf.constant(l_test_labels)

# Datasets whose elements are (label, filename_1, ..., filename_n)
train_data = tf.data.Dataset.from_tensor_slices(tuple([train_labels] + t_f))
validation_data = tf.data.Dataset.from_tensor_slices(tuple([validation_labels] + v_f))
test_data = tf.data.Dataset.from_tensor_slices(tuple([test_labels] + s_f))

# Attach the image-parsing step to every dataset, timing each call
print_and_log_header("Parsing Training Data")
train_data = timer(train_data.map, _parse_function)
print_and_log_header("Parsing Validation Data")
validation_data = timer(validation_data.map, _parse_function)
print_and_log_header("Parsing Testing Data")
test_data = timer(test_data.map, _parse_function)
print_and_log("\nDone!")

##########################################
          Parsing Training Data           
##########################################
Start: 2019-01-06 08:46:38.569938
End: 2019-01-06 08:46:38.864513
Finished in 00:00:00:00.288
##########################################
         Parsing Validation Data          
##########################################
Start: 2019-01-06 08:46:38.871232
End: 2019-01-06 08:46:39.096562
Finished in 00:00:00:00.225
##########################################
           Parsing Testing Data           
##########################################
Start: 2019-01-06 08:46:39.103991
End: 2019-01-06 08:46:39.334254
Finished in 00:00:00:00.229

Done!


In [18]:
# Log each dataset's description and Python type under its own banner
for title, split in (
    ("TRAINING", train_data),
    ("VALIDATION", validation_data),
    ("TESTING", test_data),
):
    print_and_log_header(title)
    print_and_log(split)
    print_and_log(type(split))

##########################################
                 TRAINING                 
##########################################
<MapDataset shapes: ((1, 28, 28, 4), ()), types: (tf.float32, tf.int64)>
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
##########################################
                VALIDATION                
##########################################
<MapDataset shapes: ((1, 28, 28, 4), ()), types: (tf.float32, tf.int64)>
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
##########################################
                 TESTING                  
##########################################
<MapDataset shapes: ((1, 28, 28, 4), ()), types: (tf.float32, tf.int64)>
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [19]:
# Create the Estimator
# Both variants wrap model_fn and keep their checkpoints in paths["Model"];
# the TPU variant is used only when the TPU flag is set.
if TPU:
    classifier = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        model_dir=paths["Model"],
        config=tf.contrib.tpu.RunConfig(),
        use_tpu=TPU)
else:
    classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=paths["Model"])

In [20]:
# Create the input functions.
# The three *_eval_input_fn callables batch their dataset once with DEFAULT_BS
# (no repeat, no shuffle); train_input_fn batches with BATCH_SIZE and uses the
# defaults of create_training_input_fn's inner function.
training_eval_input_fn = create_predict_input_fn(train_data, DEFAULT_BS)
validation_eval_input_fn = create_predict_input_fn(validation_data, DEFAULT_BS)
test_eval_input_fn = create_predict_input_fn(test_data, DEFAULT_BS)
train_input_fn = create_training_input_fn(train_data, BATCH_SIZE)

In [None]:
# Train
# Runs train() under timer(), which logs start time, end time and duration.
# `accuracy` is the last period's validation accuracy in percent; the four
# lists hold one loss / accuracy value per period.
classifier, accuracy, t_lls, v_lls, t_accs, v_accs = timer(train)

Start: 2019-01-06 08:46:39.366574
Training model...
Metrics:
	PERIOD	RATE	TYPE	TRAIN.	VALID.	TIME
	0	0.20000	LgLs	2.77	2.78	2019-01-06 08:48:31.382325
			Acc.	6.49%	6.49%

	1	0.18000	LgLs	2.77	2.78	2019-01-06 08:50:17.657847
			Acc.	6.49%	6.49%

	2	0.16200	LgLs	2.76	2.77	2019-01-06 08:52:04.332558
			Acc.	8.66%	8.97%

	3	0.14580	LgLs	2.68	2.69	2019-01-06 08:53:50.692497
			Acc.	11.05%	11.74%

	4	0.13122	LgLs	2.71	2.71	2019-01-06 08:55:44.677859
			Acc.	10.70%	10.69%

	5	0.11810	LgLs	2.58	2.60	2019-01-06 08:57:32.769609
			Acc.	17.56%	17.18%

	6	0.10629	LgLs	2.51	2.55	2019-01-06 08:59:22.770067
			Acc.	18.50%	16.51%



In [None]:
print_and_log("Final accuracy (on validation data): {:.4f}%".format(accuracy))


def _plot_history(title, ylabel, train_values, valid_values, out_name):
    """Plot training vs. validation values per period, save the figure, then show it.

    The figure is written to ``paths["Logs"] + out_name`` BEFORE plt.show().
    Saving after show() writes an empty image, because the figure has already
    been displayed and discarded by then.
    """
    fig, ax = plt.subplots()
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Periods")
    ax.set_title(title)
    ax.plot(train_values, label="training")
    ax.plot(valid_values, label="validation")
    ax.legend()
    fig.savefig(paths["Logs"] + out_name)
    plt.show()


if DISPLAY:
    # Output a graph of loss metrics over periods.
    _plot_history("LogLoss vs. Periods", "LogLoss", t_lls, v_lls, "loss.png")

    # Output a graph of accuracy over periods.
    _plot_history("Accuracy vs. Periods", "Accuracy", t_accs, v_accs, "accuracy.png")
    


In [None]:
# Evaluate the trained classifier on the held-out test set and log its loss
# and accuracy (accuracy is converted to a percentage).
test_stats = classifier.evaluate(input_fn=test_eval_input_fn)
t_ll = test_stats["loss"]
t_acc = 100 * test_stats["accuracy"]
print_and_log_header("TESTING")
print_and_log("\tLog Loss: {:.2f}".format(t_ll))
print_and_log("\tAccuracy: {:.2f}%".format(t_acc))

In [None]:
# Materialise every prediction for the test set, then keep the predicted class ids
results = list(classifier.predict(input_fn=test_eval_input_fn))
classes = [prediction["classes"] for prediction in results]

In [None]:
# Output confusion matrix
test_labels_list = l_test_labels.tolist()

# Pin the class order so the matrix is always num_labels x num_labels, even
# when a class appears neither in the test labels nor in the predictions.
# Without `labels=` such classes are dropped and rows/columns no longer line
# up with the label names.
class_ids = list(range(num_labels))
class_names = [reverse_lookup[class_id] for class_id in class_ids]
cm = metrics.confusion_matrix(test_labels_list, classes, labels=class_ids)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class). A class with no test samples has an all-zero row; dividing
# by at least 1 keeps that row at 0 instead of NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype("float") / np.maximum(row_totals, 1)

# Passing the names to heatmap draws one tick per class; setting tick labels
# afterwards would mislabel the axes whenever seaborn thins out the ticks.
ax = sns.heatmap(cm_normalized, cmap="bone_r",
                 xticklabels=class_names, yticklabels=class_names)
ax.set_aspect(1)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
plt.title("Confusion matrix")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()

In [None]:
# Print the raw (un-normalised) confusion-matrix counts.
print(cm)

In [None]:
# Collect [predicted, target, count] for every cell above the diagonal of the
# confusion matrix (row = true class i, column = predicted class j, j > i).
# The size comes from the matrix itself rather than a hard-coded 16.
# NOTE(review): cells below the diagonal (predicted id < true id) are not
# included; confirm whether that is intended.
n_classes = cm.shape[0]
indices_list = [
    [j, i, cm[i][j]]
    for i in range(n_classes)
    for j in range(i + 1, n_classes)
]

In [None]:
# The ten most frequent confusions, largest count first. Slicing the unsorted
# list only returned the last ten pairs in index order, regardless of count.
max_list = sorted(indices_list, key=lambda entry: entry[2], reverse=True)[:10]

In [None]:
# Display the selected [predicted, target, count] entries.
max_list

In [None]:
# Print each selected confusion as [predicted label name, target label name].
# The header string was missing the closing quote after 'predict'.
print("['predict', 'target']")
for lst in max_list:
    print([reverse_lookup.get(class_id) for class_id in lst[:2]])