In [1]:
import os
from pathlib import Path
import IPython.display as ipd
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
from IPython import display
import time
from datetime import datetime, timedelta
import tensorflow as tf
import seaborn as sns
import subprocess
%matplotlib inline

# Only TensorFlow messages at ERROR level are emitted.
tf.logging.set_verbosity(tf.logging.ERROR)

# NOTES
NOTES = "28x28"

# VARS -- column names
target_label, id_label = "Label", "fname"

# VARS -- run switches
VERBOSE, DISPLAY = True, True
TEST, TPU = False, False
RESIZE = True

# VARS -- image geometry: TARGET_* is 28 when RESIZE is on, otherwise the input size
INPUT_WIDTH, INPUT_HEIGHT = 128, 128
TARGET_WIDTH, TARGET_HEIGHT = (28, 28) if RESIZE else (INPUT_WIDTH, INPUT_HEIGHT)
IMG_CHANNELS = 3

# VARS -- model / training
TYPE = "CNN"
DROPOUT = 0.4
DECAY_RATE = 0.9
DEFAULT_BS = 128 # default batch size
UNK_DROP_RATE = 1.0 # drop 100% of unknown categories
OUTLIER_PERCENTAGE = 0.1 # a fraction (0.1 == 10%), despite the name

# VARS -- which rows / channels of the CSV to use
CATEGORY = ["no_voice"]
LABELS = ["yes", "no"]
CHANNELS = [1, 2, 3, 5, 6, 7, 8]
NUMS = ''.join(map(str, CHANNELS)) # e.g. "1235678", used in file and directory names
MONTHS = [8]
DAYS = [11]

# VARS -- schedule; TEST selects a short smoke-test configuration
VERBOSITY = 1000 # same value in both configurations
if TEST:
    LEARNING_STEPS, SPP, LEARNING_RATE = 100, 4, .05
    BATCH_SIZE, SHUFFLE_SIZE = 32, 64
    TEST_SIZE = 1000 # only defined in TEST mode
else:
    LEARNING_STEPS, SPP, LEARNING_RATE = 5000, 200, .025
    BATCH_SIZE, SHUFFLE_SIZE = 64, 256
def curr_time():
    """Return the current wall-clock time at a fixed UTC-7 offset as a naive datetime.

    The offset is applied to the current UTC time rather than to the host's
    local time, so the result does not depend on the machine's timezone
    setting. UTC-7 corresponds to Pacific Daylight Time; the offset is fixed
    and does not follow daylight-saving changes.
    """
    from datetime import timezone  # local import: only this helper needs it

    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    return utc_now - timedelta(hours=7)

# Everything for this run lives under the current working directory.
ROOT = os.getcwd() + "/"
if CATEGORY[0] == "no_voice":
    RUN_ROOT = ROOT+"NONVOCAL_RUNS_YN_{:02}_{:02}/".format(MONTHS[0], DAYS[0])
else:
    RUN_ROOT = ROOT+"VOCAL_RUNS_YN_{:02}_{:02}/".format(MONTHS[0], DAYS[0])
RUN_ROOT_LOG = RUN_ROOT+"logs/"

# PATHS
paths = {
    "Training":ROOT+"paths_scaled_combined.csv",
    "Model":RUN_ROOT+"model_dir_{}/".format(NUMS),
    # One log directory per run, named after the channel set and the start time.
    "Logs":RUN_ROOT_LOG+"{}_{}/".format(NUMS, datetime.strftime(curr_time(), "%b%d%Y_%H%M%S"))
}
paths["Log"] = paths["Logs"] + "log.txt"

# paths["Logs"] is nested inside RUN_ROOT_LOG, which is nested inside RUN_ROOT,
# so a single makedirs call creates whichever of the three are missing.
# exist_ok avoids the check-then-create race of isdir() + mkdir().
os.makedirs(paths["Logs"], exist_ok=True)

In [2]:
def make_header(s):
    """Return a three-line banner: `s` centred in 42 columns between two rules of 42 '#'."""
    rule = "#" * 42
    centred = "\n{:^42}\n".format(s)
    return rule + centred + rule
    
def print_and_log(s):
    """Append `str(s)` and a newline to the run log at paths["Log"], then print `s`."""
    with open(paths["Log"], 'a') as log_file:
        log_file.write(str(s) + "\n")
    print(s)
        
def print_and_log_header(s):
    """Format `s` as a banner with make_header, then log and print it."""
    # print_and_log appends the text plus a newline to the log and echoes it,
    # which is exactly what a banner needs.
    print_and_log(make_header(str(s)))

In [3]:
def sec_to_str(secs):
    """Format a duration in seconds as "DD:HH:MM:SS.mmm".

    Args:
        secs: non-negative duration in seconds (int or float).

    Returns:
        A string with zero-padded days, hours, minutes and seconds, followed
        by exactly three millisecond digits.

    The duration is rounded to whole milliseconds first and then split with
    integer arithmetic. This keeps the fractional field at three digits for
    every input (including ints and very small fractions) and lets a value
    such as 59.9996 carry over to the next minute.
    """
    total_ms = int(round(secs * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_secs, 60)
    total_hours, minutes = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)
    return "{:02}:{:02}:{:02}:{:02}.{:03}".format(days, hours, minutes, seconds, millis)

def timer(f, *args):
    """Call `f(*args)`, logging start time, end time and elapsed duration.

    Returns whatever `f` returns. The elapsed time is measured with
    time.time() and formatted with sec_to_str.
    """
    print_and_log("Start: {}".format(curr_time()))
    t0 = time.time()
    outcome = f(*args)
    elapsed = time.time() - t0
    print_and_log("End: {}".format(curr_time()))
    print_and_log("Finished in {}".format(sec_to_str(elapsed)))
    return outcome

def preprocess(samples, sample_rate):
    """Return `samples` truncated or zero-padded to exactly `sample_rate` entries.

    `samples` must support slicing and expose `.shape` (e.g. a NumPy array).
    The result is a new array created by np.zeros.
    """
    clipped = samples[:sample_rate]
    out = np.zeros(sample_rate)
    out[:clipped.shape[0]] = clipped
    return out

def select_labels(df, allowed):
    """Return the rows of `df` whose 'Label' value is in `allowed`."""
    keep = df['Label'].isin(allowed)
    return df[keep]
    
def select_categories(df, allowed):
    """Return the rows of `df` whose 'Category' value is in `allowed`."""
    keep = df['Category'].isin(allowed)
    return df[keep]

def select_channels(df, allowed):
    """Return `df` without the "Path<i>" columns (i in 1..8) whose i is not in `allowed`."""
    unwanted = ["Path{}".format(ch) for ch in range(1, 9) if ch not in allowed]
    return df.drop(unwanted, axis=1)

def select_days(df, allowed):
    """Return the rows of `df` whose 'Day' value is in `allowed`."""
    keep = df['Day'].isin(allowed)
    return df[keep]

def select_months(df, allowed):
    """Return the rows of `df` whose 'Month' value is in `allowed`."""
    keep = df['Month'].isin(allowed)
    return df[keep]

def select_sets(df, allowed):
    """Return the rows of `df` whose 'Set' value is in `allowed`."""
    keep = df['Set'].isin(allowed)
    return df[keep]

def remove_voice(df):
    """Return `df` without its "Path4" column (raises KeyError if it is absent)."""
    voice_columns = ["Path4"]
    return df.drop(voice_columns, axis=1)

def str_to_l(x):
    """Return the ints for every item of `x` that compares between '0' and '9'.

    For a plain string this extracts each ASCII digit character, in order.
    """
    digits = []
    for ch in x:
        if '0' <= ch <= '9':
            digits.append(int(ch))
    return digits

In [4]:
count = 0
def _parse_function(label, *filenames):
    """Load one image per filename and stack them along the last axis.

    Each file is read, decoded with IMG_CHANNELS channels, converted to
    float32, reshaped to [1, INPUT_HEIGHT, INPUT_WIDTH, IMG_CHANNELS],
    reduced to grayscale and, when RESIZE is set, bicubically resized to
    [TARGET_HEIGHT, TARGET_WIDTH]. The per-file tensors are concatenated on
    axis 3 in argument order.

    Returns:
        (image, label); `image` is None when no filenames are given.
    """
    # NOTE(review): when used with Dataset.map this Python body is presumably
    # run once to build the graph rather than once per element, so `count`
    # likely tracks calls to this function, not images -- confirm.
    global count
    count += 1
    if count % VERBOSITY == 0:
        print_and_log("\tProcessed {}th image".format(count))
    expected_shape = tf.constant([1, INPUT_HEIGHT, INPUT_WIDTH, IMG_CHANNELS])

    def _load_plane(filename):
        # Read -> decode -> float32 -> fixed shape -> grayscale -> optional resize.
        raw = tf.read_file(filename)
        pixels = tf.image.decode_image(raw, channels=IMG_CHANNELS)
        pixels = tf.image.convert_image_dtype(pixels, tf.float32)
        pixels = tf.reshape(pixels, expected_shape)
        plane = tf.image.rgb_to_grayscale(pixels)
        if RESIZE:
            plane = tf.image.resize_bicubic(plane, [TARGET_HEIGHT, TARGET_WIDTH])
        return plane

    image = None
    for filename in filenames:
        plane = _load_plane(filename)
        image = plane if image is None else tf.concat([image, plane], 3)
    return image, label

In [5]:
def model_fn(features, labels, mode):
    """Estimator model function: a two-stage CNN classifier.

    Args:
        features: input tensor; reshaped here to
            [-1, TARGET_HEIGHT, TARGET_WIDTH, len(CHANNELS)].
        labels: class-index tensor passed to the sparse softmax
            cross-entropy loss (unused in PREDICT mode).
        mode: a tf.estimator.ModeKeys value.

    Returns:
        A tf.estimator.EstimatorSpec:
          * PREDICT -- predictions ("classes", "probabilities") only;
          * TRAIN   -- loss and a gradient-descent train op;
          * EVAL    -- loss and an "accuracy" metric.

    Relies on module-level names that are not defined in this function:
    TARGET_HEIGHT, TARGET_WIDTH, CHANNELS, DROPOUT, TPU, LEARNING_RATE, SPP,
    DECAY_RATE and `num_labels` (the number of logit units).
    """
    input_layer = tf.reshape(features, [-1, TARGET_HEIGHT, TARGET_WIDTH, len(CHANNELS)])
    pool = input_layer

    # Two conv(5x5, same padding, ReLU) + max-pool(2x2, stride 2) stages with
    # 32 and then 64 filters; each stage consumes the previous stage's output.
    for num_filters in [32, 64]:
        conv = tf.layers.conv2d(
            inputs=pool,
            filters=num_filters,
            kernel_size=[5, 5],
            padding="same",
            activation=tf.nn.relu)
        pool = tf.layers.max_pooling2d(inputs=conv, pool_size=[2, 2], strides=2)

    # Dense Layer
    pool = tf.layers.flatten(pool)
    dense = tf.layers.dense(inputs=pool, units=1024, activation=tf.nn.relu)
    # Dropout is only active while training.
    dropout = tf.layers.dropout(inputs=dense, rate=DROPOUT, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=num_labels)
    
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    # Histogram summaries are skipped when TPU is set.
    if not TPU:
        tf.summary.histogram("predictions", predictions["probabilities"])
        tf.summary.histogram("classes", predictions["classes"])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Staircase decay: the rate is multiplied by DECAY_RATE every SPP global steps.
    learning_rate = tf.train.exponential_decay(LEARNING_RATE, tf.train.get_global_step(), SPP, DECAY_RATE, staircase=True)
    
    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    # Added after the PREDICT return above, and `predictions` is not passed to
    # the TRAIN/EVAL specs below, so this entry is not part of any returned spec.
    predictions["loss"] = loss
    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        if TPU:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [6]:
def create_training_input_fn(dataset, batch_size, num_epochs=None):
    """Build an input function that batches, repeats and optionally shuffles `dataset`.

    Args:
        dataset: object exposing the tf.data.Dataset methods used below
            (batch, repeat, shuffle, make_one_shot_iterator).
        batch_size: number of elements per batch.
        num_epochs: how many times to repeat the data; None repeats
            indefinitely. Used as the default for the returned function.

    Returns:
        `_input_fn(num_epochs=<value given here>, shuffle=True)` returning a
        (feature_batch, label_batch) pair from a one-shot iterator.
    """
    # The inner default is bound to the outer argument. Previously the inner
    # parameter defaulted to None and shadowed the outer one, so the value
    # passed to create_training_input_fn was silently ignored.
    def _input_fn(num_epochs=num_epochs, shuffle=True):
        ds = dataset.batch(batch_size).repeat(num_epochs)
        if shuffle:
            # NOTE(review): shuffling after batch() reorders whole batches,
            # not individual examples -- confirm this is intended.
            ds = ds.shuffle(SHUFFLE_SIZE)
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch
    return _input_fn

def create_predict_input_fn(dataset, batch_size):
    """Build an input function yielding one pass over `dataset` in batches.

    The returned zero-argument function batches the dataset (no repeat, no
    shuffle) and returns the (features, labels) pair from a one-shot iterator.
    """
    def _input_fn():
        batched = dataset.batch(batch_size)
        iterator = batched.make_one_shot_iterator()
        feats, targets = iterator.get_next()
        return feats, targets
    return _input_fn

In [7]:
# Start a fresh log for this run ('w' truncates any existing file).
# The title is passed without a newline so it is centred correctly, and the
# banner is followed by a newline because make_header does not end with one
# and later entries are appended directly after it.
with open(paths["Log"], 'w') as log:
    log.write(make_header("Starting Script") + "\n")

In [8]:
# CSV that lists the training examples
train_csv = paths["Training"]

# Label vocabulary. `labels` ends up as name -> index and `reverse_lookup`
# as index -> name; `num_labels` is one less than the vocabulary size.
all_labels = LABELS
labels = ["yes", "no", "stop", "unknown"]
num_labels = len(labels) - 1
labels = dict(zip(labels, range(len(labels))))
reverse_lookup = {index: name for name, index in labels.items()}

In [9]:
# Make the training data
print_and_log_header("MAKING TRAINING DATA")
train_data = pd.read_csv(train_csv)

# Filter the training data
train_data = select_categories(train_data, CATEGORY)
train_data = select_channels(train_data, CHANNELS)
train_data = select_labels(train_data, LABELS)
train_data = select_months(train_data, MONTHS)
train_data = select_days(train_data, DAYS)
# train_data = remove_voice(train_data)

# Shuffle the rows and renumber them 0..n-1, then encode labels as integers.
train_data = train_data.sample(frac=1).reset_index(drop=True)
train_data["Label"] = train_data["Label"].map(labels)

# Keep an independent snapshot of the filtered, label-encoded rows for the
# outlier report later on, which converts integer labels back to names.
# pd.DataFrame(train_data) does not copy the data, so whether the snapshot
# saw the integer labels (or later in-place edits to train_data) depended on
# pandas internals; an explicit copy taken after encoding is deterministic.
tdcopy = train_data.copy()

if VERBOSE:
    print_and_log_header("TRAIN DATA")
    print_and_log(train_data.describe())
    print_and_log(train_data.head(10))

##########################################
           MAKING TRAINING DATA           
##########################################
##########################################
                TRAIN DATA                
##########################################
         Day      Label  Month  SequenceNumber
count  862.0  862.00000  862.0       862.00000
mean    11.0    0.50000    8.0       215.00000
std      0.0    0.50029    0.0       124.49088
min     11.0    0.00000    8.0         0.00000
25%     11.0    0.00000    8.0       107.25000
50%     11.0    0.50000    8.0       215.00000
75%     11.0    1.00000    8.0       322.75000
max     11.0    1.00000    8.0       430.00000
   Category  Day  Label  Month  \
0  no_voice   11      1      8   
1  no_voice   11      1      8   
2  no_voice   11      1      8   
3  no_voice   11      0      8   
4  no_voice   11      1      8   
5  no_voice   11      0      8   
6  no_voice   11      1      8   
7  no_voice   11      0      8   
8  no_voice  

In [10]:
# Separate the labels from the per-channel image-path columns.
# The label column is selected rather than pop()-ed: pop deletes it from the
# frame in place, which can also remove it from any frame sharing the same
# underlying data (tdcopy was built from train_data). The column is excluded
# from train_data by the selection below either way.
train_labels = train_data[target_label]
img_paths = ["Path{}".format(channel) for channel in CHANNELS]
train_data = train_data[img_paths]

In [11]:
# One string tensor of file paths per selected channel.
# (v_f and s_f are initialised alongside t_f but are not filled in this cell.)
t_f, v_f, s_f = [], [], []
for channel in CHANNELS:
    t_f.append(tf.constant(train_data["Path{}".format(channel)]))

# `train_labels[i]` is the label for the images at position i of every path vector
train_labels = tf.constant(train_labels)

# Dataset of (label, path_1, ..., path_k) tuples, parsed into image tensors
train_data = tf.data.Dataset.from_tensor_slices((train_labels,) + tuple(t_f))
print_and_log_header("Parsing Training Data")
train_data = timer(train_data.map, _parse_function)
print_and_log("\nDone!")

##########################################
          Parsing Training Data           
##########################################
Start: 2018-08-13 15:22:55.036593
End: 2018-08-13 15:22:55.855547
Finished in 00:00:00:00.818

Done!


In [12]:
# Log the parsed dataset's repr followed by its type.
print_and_log_header("TRAINING")
for item in (train_data, type(train_data)):
    print_and_log(item)

##########################################
                 TRAINING                 
##########################################
<MapDataset shapes: ((1, 28, 28, 7), ()), types: (tf.float32, tf.int64)>
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [13]:
# Estimator wrapping model_fn, with paths["Model"] as its model directory
classifier = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=paths["Model"],
)

In [14]:
# Input function for scoring: one pass over the parsed data in DEFAULT_BS-sized batches
training_eval_input_fn = create_predict_input_fn(
    train_data,
    DEFAULT_BS,
)

In [15]:
# Create predictions and split off the OUTLIER_PERCENTAGE lowest-confidence rows
results = list(classifier.predict(input_fn=training_eval_input_fn))
classes = [x["classes"] for x in results]
probs = pd.Series([x["probabilities"] for x in results])
probs = probs.apply(lambda x: max(x))  # confidence = largest class probability
tdf = pd.DataFrame({"Prediction":classes, "Probability":probs})
num_items = tdf.shape[0]

# Attach the original row metadata; tdcopy and tdf are both indexed 0..n-1.
for k in tdcopy:
    tdf[k] = tdcopy[k]

outliers = tdf.nsmallest(int(num_items * OUTLIER_PERCENTAGE), "Probability")
# Keepers are every row that is not an outlier. Dropping by index label is
# exact; the previous append + drop_duplicates(keep=False) on four key
# columns also discarded any keepers whose key happened to repeat, and
# DataFrame.append is no longer available in pandas 2.x.
keepers = tdf.drop(outliers.index).reset_index(drop=True)
outliers = outliers.reset_index(drop=True)

# Convert integer labels back to their names.
keepers["Label"] = keepers["Label"].apply(lambda x: reverse_lookup[x])
outliers["Label"] = outliers["Label"].apply(lambda x: reverse_lookup[x])
if DISPLAY:
    print_and_log_header("OUTLIERS")
    display.display(outliers.describe())
    print_and_log_header("KEEPERS")
    display.display(keepers.describe())

##########################################
                 OUTLIERS                 
##########################################


Unnamed: 0,Prediction,Probability,Day,Month,SequenceNumber
count,86.0,86.0,86.0,86.0,86.0
mean,0.593023,0.504382,11.0,8.0,205.709302
std,0.494152,0.002553,0.0,0.0,121.286037
min,0.0,0.500165,11.0,8.0,14.0
25%,0.0,0.501995,11.0,8.0,106.5
50%,1.0,0.504661,11.0,8.0,200.5
75%,1.0,0.506516,11.0,8.0,305.25
max,1.0,0.508932,11.0,8.0,424.0


##########################################
                 KEEPERS                  
##########################################


Unnamed: 0,Prediction,Probability,Day,Month,SequenceNumber
count,776.0,776.0,776.0,776.0,776.0
mean,0.634021,0.574046,11.0,8.0,216.029639
std,0.482014,0.051658,0.0,0.0,124.874845
min,0.0,0.508962,11.0,8.0,0.0
25%,0.0,0.533736,11.0,8.0,107.75
50%,1.0,0.56347,11.0,8.0,218.0
75%,1.0,0.60221,11.0,8.0,323.25
max,1.0,0.882841,11.0,8.0,430.0


In [16]:
# Remove the model-output columns from both frames (in place)
for frame in (keepers, outliers):
    frame.drop(["Prediction", "Probability"], axis=1, inplace=True)

In [17]:
# Count how many outliers fall in each data split
s = outliers["Set"]
tra = (s == "Training").sum()
val = (s == "Validation").sum()
tst = (s == "Testing").sum()
print("OUTLIERS\n\tTraining:\t{}\n\tValidation:\t{}\n\tTesting:\t{}".format(tra, val, tst))

OUTLIERS
	Training:	62
	Validation:	15
	Testing:	9


In [18]:
# File names encode month, day and the channel set, e.g. "keepers_08_11_1235678.csv"
_name_fields = (MONTHS[0], DAYS[0], NUMS)
KEEPERS_NAME = "keepers_{:02}_{:02}_{}.csv".format(*_name_fields)
OUTLIERS_NAME = "outliers_{:02}_{:02}_{}.csv".format(*_name_fields)

# Write both frames to the current working directory
keepers.to_csv(KEEPERS_NAME)
print("Saved KEEPERS to '{}'".format(KEEPERS_NAME))

outliers.to_csv(OUTLIERS_NAME)
print("Saved OUTLIERS to '{}'".format(OUTLIERS_NAME))

Saved KEEPERS to 'keepers_08_11_1235678.csv'
Saved OUTLIERS to 'outliers_08_11_1235678.csv'
