In [34]:
# import packages
import pandas as pd
import boto3
import numpy as np

# tf
import tensorflow as tf
import tensorflow_hub as hub

# set working directory
import os
path = "/home/mriveralanas/projects/quora/"
os.chdir(path)

# user modules
import src.data.process_data as process_data


In [35]:
from __future__ import absolute_import, division, print_function, unicode_literals

# !pip install -q tensorflow-hub
# !pip install -q tensorflow-datasets

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.0.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


In [36]:
data = process_data.retrieve_training()

In [37]:
# Measure data imbalance 
    neg, pos = np.bincount(data['target'].values)
    total = neg + pos
    print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total))

Examples:
    Total: 1306122
    Positive: 80810 (6.19% of total)



In [4]:
# Use a utility from sklearn to split and shuffle our dataset
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)


In [6]:
# Form np arrays of labels
train_labels = np.array(train_df.pop('target'))
val_labels = np.array(val_df.pop('target'))
test_labels = np.array(test_df.pop('target'))


In [7]:
# Form np arrays of features.
train_features = np.array(train_df).reshape((len(train_df,)))
val_features = np.array(val_df).reshape((len(val_df,)))
test_features = np.array(test_df).reshape((len(test_df,)))


In [8]:
# load to tf.data --> train
train_data = tf.data.Dataset.from_tensor_slices((train_features,train_labels))

# load to tf.data --> validation
validation_data = tf.data.Dataset.from_tensor_slices((val_features, val_labels))

# load to tf.data --> test
test_data = tf.data.Dataset.from_tensor_slices((test_features, test_labels))


In [10]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))

# reshape 
train_examples_batch = tf.reshape(train_examples_batch, [10,])

# batches --> note the shape
train_examples_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'How do I heal from being used by my ex for sex?',
       b'How do I stop giving a fuck about stuff?',
       b"Is it normal if I don't want to see my girlfriend sometimes?",
       b'What should I keep in mind when planning to move over to Mazatl\xc3\xa1n, Sinaloa?',
       b'Where is the highest age of consent for sexual relations?',
       b'Where can I get my cock sucked?',
       b'How do I prove: sinA/ (1+cosA) =cosecA-cotA?',
       b"Has parenting really gotten so bad, that parents turn a blind eye to the child's negative capabilities? Is it possible that some of the blame (at times) can be shifted to the parents?",
       b'Are oxidation reactions exothermic or endothermic? What are some examples of endothermic oxidation reactions?',
       b'Pain is a subjective sensation produced due to nerve damage. So, does eating spicy food cause possible or impending nerve damage?'],
      dtype=object)>

Build the model

three main architecture decisions:
1) how to represent the data (the text)
2) how many layers to use in the model 
3) how many **hidden** units to use for each layer 

Transfer Learning 
One way to represent the text is to convert sentences into embeddings vectors. We can use a pre-trained text embedding as the first layer, which will have three advantages:

1) we don't have to worry about text preprocessing,
2) we can benefit from transfer learning,
3) the embedding has a fixed size, so it's simpler to process.

https://blog.fastforwardlabs.com/2019/09/05/transfer-learning-from-the-ground-up.html






For this example we will use a pre-trained text embedding model from TensorFlow Hub called google/tf2-preview/gnews-swivel-20dim/1.

There are three other pre-trained models to test for the sake of this tutorial:

1) google/tf2-preview/gnews-swivel-20dim-with-oov/1 - same as google/tf2-preview/gnews-swivel-20dim/1, but with 2.5% vocabulary converted to OOV buckets. This can help if vocabulary of the task and vocabulary of the model don't fully overlap.

2) google/tf2-preview/nnlm-en-dim50/1 - A much larger model with ~1M vocabulary size and 50 dimensions.

3) google/tf2-preview/nnlm-en-dim128/1 - Even larger model with ~1M vocabulary size and 128 dimensions.

Let's first create a Keras layer that uses a TensorFlow Hub model to embed the sentences, and try it out on a couple of input examples. 

Note that no matter the length of the input text, the output shape of the embeddings is: (num_examples, embedding_dimension)

In [11]:
METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

In [18]:
# TF Hub embeddings
gnews = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
wiki = "https://tfhub.dev/google/Wiki-words-500-with-normalization/2"


In [21]:
hub_layer = hub.KerasLayer(google_news, input_shape=[], 
                           dtype=tf.string, trainable=True)

OSError: https://code.google.com/archive/p/word2vec/ does not appear to be a valid module.

In [15]:
hub_layer

<tensorflow_hub.keras_layer.KerasLayer at 0x7f2a5ac006a0>

In [None]:
model = KeyedVectors.load_word2vec_format('~/projects/quora/data/external/wiki-news-300d-1M/wiki-news-300d-1M.vec')


In [None]:
hub_layer = hub.KerasLayer(model, input_shape=[],                            dtype=tf.string, trainable=True)

In [None]:
# output bias

In [None]:
# drop out layer

In [None]:
# building full model 

model = tf.keras.Sequential()
# initialize output bias 

model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()


In [None]:
# model compile
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
# train model
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=validation_data.batch(512),
                    verbose=1)

In [None]:
# evaluate 
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

In [None]:
data.head()

In [None]:
data.head(1)

In [None]:
data.columns

In [None]:
train_validation_split = tfds.Split.TRAIN.subsplit([6, 4])

In [None]:
tf.reshape(train_data,((), ()))