In [1]:
# import packages
import pandas as pd
import boto3
import numpy as np

# tf
import tensorflow as tf
import tensorflow_hub as hub

# set working directory
import os

# Project root. Defaults to the original location, but can be overridden with
# the QUORA_PROJECT_DIR environment variable so the notebook is not tied to a
# single machine's absolute path.
# NOTE(review): the `src` package imported after this presumably resolves
# relative to this directory -- confirm before relocating the project.
path = os.environ.get("QUORA_PROJECT_DIR", "/home/mriveralanas/projects/quora/")
os.chdir(path)

# user modules
import src.data.process_data as process_data


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

# !pip install -q tensorflow-hub
# !pip install -q tensorflow-datasets

# Report the runtime environment: library versions, eager mode, GPU visibility.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

Version:  2.0.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


In [3]:
# Load train.csv and partition it into training / validation / test sets.
# The call prints an example-count summary (total and positive share).
train_data, validation_data, test_data = process_data.train_split(
    process_data.retrieve_training()
)

Examples:
    Total: 1306122
    Positive: 80810 (6.19% of total)



In [4]:

# Peek at the first batch of 10 (example, label) pairs from the training set.
preview_batches = train_data.batch(10)
train_examples_batch, train_labels_batch = next(iter(preview_batches))

# Inputs: flatten the example strings into a 1-D tensor of length 10.
train_examples_batch = tf.reshape(train_examples_batch, [10])
train_examples_batch

<tf.Tensor: id=21, shape=(10,), dtype=string, numpy=
array([b"Am I alone in feeling like Quora doesn't provide enough space for context in personal and/or introspective questions anymore?",
       b'Do elderly people feel or know when there getting ready to pass on?',
       b'What skills should I learn as a physiotherapist?',
       b'What are some hoaxes people still belive are true?',
       b'What percentage of IPOs have negative earnings?',
       b'What can I do to help my boyfriend deal with childhood rape?',
       b'Why is "The Black Cat" considered a goth themed story?',
       b'How can I check Skillselect Mailbox?',
       b'Why are women more able to get pregnant easier the younger they are?',
       b'Why are some Americans obsessed with their guns?'], dtype=object)>

In [5]:
# train examples --> labels
# Store the reshaped labels under their own name. The original assigned them
# to `train_examples_batch`, which overwrote the example strings from the
# previous cell and left a variable whose name did not match its contents.
train_labels_batch = tf.reshape(train_labels_batch, [10,])
train_labels_batch

<tf.Tensor: id=23, shape=(10,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

In [7]:
# Metrics handed to model.compile: the four confusion-matrix counts first,
# then the aggregate scores. The order determines the reporting order.
METRICS = [
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.FalseNegatives(name="fn"),
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
    tf.keras.metrics.AUC(name="auc"),
]

In [8]:
# TF Hub embeddings
# gnews: handle passed to hub.KerasLayer when building hub_layer below
# (the handle name indicates a 20-dimensional swivel embedding).
gnews = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# wiki: alternative embedding handle; not referenced by the visible cells.
wiki = "https://tfhub.dev/google/Wiki-words-500-with-normalization/2"


In [9]:
# Wrap the gnews TF Hub module as a Keras layer that takes scalar strings
# (input_shape=[]); trainable=True keeps its weights trainable.
hub_layer = hub.KerasLayer(
    gnews,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

In [10]:
# Echo the layer object to confirm it was constructed.
hub_layer

<tensorflow_hub.keras_layer.KerasLayer at 0x7fad82c08550>

In [None]:
# TODO: output bias

In [None]:
# TODO: dropout layer

In [11]:
# Assemble the classifier as a stack:
#   hub embedding -> 16-unit ReLU dense layer -> single sigmoid output unit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Configure training: Adam optimizer, binary cross-entropy loss, and the
# metric objects collected in METRICS.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=METRICS,
)

In [13]:
# Train for 20 epochs on shuffled (buffer of 10000) batches of 512 examples,
# passing the validation set in batches of 512 as validation data.
train_batches = train_data.shuffle(10000).batch(512)
validation_batches = validation_data.batch(512)

history = model.fit(
    train_batches,
    epochs=20,
    validation_data=validation_batches,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
# Echo the History object returned by model.fit.
history

<tensorflow.python.keras.callbacks.History at 0x7fad7b080668>

In [15]:
# Score the model on the test set in batches of 512, then print each metric
# alongside its name, rounded to three decimals.
test_batches = test_data.batch(512)
results = model.evaluate(test_batches, verbose=2)

for name, value in zip(model.metrics_names, results):
    print("{}: {:.3f}".format(name, value))

511/511 - 3s - loss: 0.1419 - tp: 7715.0000 - fp: 4242.0000 - tn: 240821.0000 - fn: 8447.0000 - accuracy: 0.9514 - precision: 0.6452 - recall: 0.4774 - auc: 0.9242
loss: 0.142
tp: 7715.000
fp: 4242.000
tn: 240821.000
fn: 8447.000
accuracy: 0.951
precision: 0.645
recall: 0.477
auc: 0.924
