In [1]:
# import packages
import pandas as pd
import boto3
import numpy as np

# sklearn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
import tensorflow_hub as hub

# import tensorflow_datasets as tfds

# using this tutorial 
# https://www.tensorflow.org/tutorials/keras/text_classification_with_hub
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

# uses tf.keras, a high-level API to build and train models in TensorFlow, and TensorFlow Hub, a library and platform for transfer learning

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Dependencies, if they are missing from the environment:
# !pip install -q tensorflow-hub
# !pip install -q tensorflow-datasets

# Probe for GPUs once, then report the runtime environment so the saved
# outputs can be matched to the library versions that produced them.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

Version:  2.0.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


In [9]:
# Location of the training CSV inside S3.
bucket = "quora-questions"
file_name = "data/train.csv"

# 's3' names the service; the client is built from boto3's default configuration.
s3 = boto3.client('s3')

# Fetch the object stored under key `file_name` in `bucket`, then parse the
# response body as CSV.
obj = s3.get_object(Bucket=bucket, Key=file_name)
data = pd.read_csv(obj['Body'])


In [10]:

# Keep only the label column and the question text.
# Local alternative to the S3 load:
#   data = pd.read_csv("~/projects/quora_data/train.csv")
# 'target' deliberately stays in the frame here; it is popped from each split
# after train_test_split.
data = data.loc[:, ['target', 'question_text']]


In [11]:
# Class balance of the labels.
# The labels are read straight from data['target']: no standalone `target`
# variable is defined anywhere earlier in the notebook, so referencing one
# raises NameError on a fresh kernel.
# np.bincount returns one count per integer label; unpacking into two names
# relies on the labels being exactly 0 and 1.
neg, pos = np.bincount(data['target'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 1306122
    Positive: 80810 (6.19% of total)



In [28]:
# Use a utility from sklearn to split and shuffle our dataset:
# 80% train+val / 20% test, then 80% train / 20% val of the remainder.
# A fixed random_state keeps the split reproducible across runs.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Form np arrays of labels (pop removes 'target' from each split frame).
train_labels = np.array(train_df.pop('target'))
val_labels = np.array(val_df.pop('target'))
test_labels = np.array(test_df.pop('target'))

# Form 1-D np arrays of features, one question string per element.
# Selecting the column by name yields shape (n,) directly. The previous
# `np.array(df).reshape((len(train_features,)))` read each variable inside its
# own definition, which raises NameError on a fresh kernel.
train_features = np.array(train_df['question_text'])
val_features = np.array(val_df['question_text'])
test_features = np.array(test_df['question_text'])


In [29]:
# Wrap each (features, labels) pair in a tf.data.Dataset, in the order
# train, validation, test.
train_data, validation_data, test_data = (
    tf.data.Dataset.from_tensor_slices(split_arrays)
    for split_arrays in (
        (train_features, train_labels),
        (val_features, val_labels),
        (test_features, test_labels),
    )
)


In [44]:
# Quick look at the training labels (numpy abbreviates the repr of a long array).
train_labels

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
# Pull the first batch of 10 (text, label) pairs out of the training dataset.
preview_batches = train_data.batch(10)
train_examples_batch, train_labels_batch = next(iter(preview_batches))

# Force the text batch to a flat vector of 10 strings.
train_examples_batch = tf.reshape(train_examples_batch, [10,])

# Display the batch; note the shape in the output.
train_examples_batch

<tf.Tensor: id=45, shape=(10,), dtype=string, numpy=
array([b'How do I heal from being used by my ex for sex?',
       b'How do I stop giving a fuck about stuff?',
       b"Is it normal if I don't want to see my girlfriend sometimes?",
       b'What should I keep in mind when planning to move over to Mazatl\xc3\xa1n, Sinaloa?',
       b'Where is the highest age of consent for sexual relations?',
       b'Where can I get my cock sucked?',
       b'How do I prove: sinA/ (1+cosA) =cosecA-cotA?',
       b"Has parenting really gotten so bad, that parents turn a blind eye to the child's negative capabilities? Is it possible that some of the blame (at times) can be shifted to the parents?",
       b'Are oxidation reactions exothermic or endothermic? What are some examples of endothermic oxidation reactions?',
       b'Pain is a subjective sensation produced due to nerve damage. So, does eating spicy food cause possible or impending nerve damage?'],
      dtype=object)>

Build the model

Three main architecture decisions:
1) How to represent the data (the text)
2) How many layers to use in the model
3) How many **hidden** units to use for each layer

Transfer Learning 
One way to represent the text is to convert sentences into embedding vectors. We can use a pre-trained text embedding as the first layer, which has three advantages:

1) we don't have to worry about text preprocessing,
2) we can benefit from transfer learning,
3) the embedding has a fixed size, so it's simpler to process.

https://blog.fastforwardlabs.com/2019/09/05/transfer-learning-from-the-ground-up.html






For this example we will use a pre-trained text embedding model from TensorFlow Hub called google/tf2-preview/gnews-swivel-20dim/1.

There are three other pre-trained models to test for the sake of this tutorial:

1) google/tf2-preview/gnews-swivel-20dim-with-oov/1 - same as google/tf2-preview/gnews-swivel-20dim/1, but with 2.5% of the vocabulary converted to OOV buckets. This can help if the vocabulary of the task and the vocabulary of the model don't fully overlap.

2) google/tf2-preview/nnlm-en-dim50/1 - A much larger model with ~1M vocabulary size and 50 dimensions.

3) google/tf2-preview/nnlm-en-dim128/1 - Even larger model with ~1M vocabulary size and 128 dimensions.

Let's first create a Keras layer that uses a TensorFlow Hub model to embed the sentences, and try it out on a couple of input examples. 

Note that no matter the length of the input text, the output shape of the embeddings is: (num_examples, embedding_dimension)

In [36]:
# Metrics reported during training and evaluation. The raw confusion-matrix
# counts and precision/recall/AUC sit alongside accuracy because only about 6%
# of the examples are positive, so accuracy alone says little here.
keras_metrics = tf.keras.metrics

METRICS = [
    keras_metrics.TruePositives(name='tp'),
    keras_metrics.FalsePositives(name='fp'),
    keras_metrics.TrueNegatives(name='tn'),
    keras_metrics.FalseNegatives(name='fn'),
    keras_metrics.BinaryAccuracy(name='accuracy'),
    keras_metrics.Precision(name='precision'),
    keras_metrics.Recall(name='recall'),
    keras_metrics.AUC(name='auc'),
]

In [37]:
# Pre-trained 20-dimensional gnews-swivel sentence embedding from TensorFlow Hub.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

# input_shape=[] with dtype=tf.string: each example is a single scalar string.
# trainable=True lets the embedding weights be updated along with the rest of
# the model.
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

In [None]:
# TODO: set an initial output-layer bias to account for the class imbalance (not implemented yet)

In [None]:
# TODO: add a dropout layer (not implemented yet)

In [38]:
# Build the full model: hub embedding -> 16-unit ReLU hidden layer -> single
# sigmoid output for the binary target.
# (Initializing the output bias is still to be done; the default is used.)
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Compile with Adam and binary cross-entropy, tracking every metric in METRICS.
# NOTE(review): the saved evaluation output lists only loss and accuracy, which
# suggests this cell was last run after training. Re-run fit and evaluate after
# compiling to get the full METRICS report.
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)
model.compile(**compile_options)

In [40]:
# Train for 20 epochs on shuffled mini-batches of 512, validating after each
# epoch on the (unshuffled) validation set.
train_batches = train_data.shuffle(10000).batch(512)
validation_batches = validation_data.batch(512)

history = model.fit(
    train_batches,
    epochs=20,
    validation_data=validation_batches,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
# Score the model on the held-out test set, then print one line per metric.
test_batches = test_data.batch(512)
results = model.evaluate(test_batches, verbose=2)

for metric_name, metric_value in zip(model.metrics_names, results):
    print("%s: %.3f" % (metric_name, metric_value))

511/511 - 2s - loss: 0.1436 - accuracy: 0.9510
loss: 0.144
accuracy: 0.951


In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Show only the first row of `data`.
data.head(1)

In [None]:
# List the column labels of `data`.
data.columns

In [None]:
# Disabled: `tfds` is never imported in this notebook (the tensorflow_datasets
# import in the first cell is commented out), so this line raised NameError.
# Its result was also unused, since the train/validation/test split is done
# with sklearn's train_test_split.
# train_validation_split = tfds.Split.TRAIN.subsplit([6, 4])

In [None]:
# Disabled: tf.reshape operates on tensors, but `train_data` is a
# tf.data.Dataset, so this call cannot succeed and would stop a Run All.
# To reshape the elements, map over the dataset instead, e.g.
#   train_data.map(lambda text, label: (tf.reshape(text, []), label))
# tf.reshape(train_data,((), ()))