In [1]:
# import packages
import pandas as pd
import boto3
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_datasets as tfds

# using this tutorial 
# https://www.tensorflow.org/tutorials/keras/text_classification_with_hub

# uses tf.keras, a high-level API to build and train models in TensorFlow, and TensorFlow Hub, a library and platform for transfer learning

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

# !pip install -q tensorflow-hub
# !pip install -q tensorflow-datasets

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.0.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


In [None]:
# bucket = "quora-questions"
# file_name = "data/train.csv"

# s3 = boto3.client('s3') 
# # 's3' is a key word. create connection to S3 using default config and all buckets within S3

# obj = s3.get_object(Bucket= bucket, Key= file_name) 
# # get object and file (key) from bucket


In [3]:
# load data
# load data 
data = pd.read_csv("~/projects/quora_data/train.csv")
data = data[['target', 'question_text']]
target = data.pop("target")


In [26]:

# train test split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=42)

# load to tf.data --> train
train_data = tf.data.Dataset.from_tensor_slices((X_train.values.reshape((875101, )),y_train.values))


# load to tf.data --> validation
validation_data = tf.data.Dataset.from_tensor_slices((X_test.values.reshape((431021, )), y_test.values))

In [27]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))


# reshape 
train_examples_batch = tf.reshape(train_examples_batch, [10,])

# batches --> note the shape
train_examples_batch

<tf.Tensor: id=11417, shape=(10,), dtype=string, numpy=
array([b'What are the differences between the Indian work culture and the UK work culture?',
       b'Is this English sentence correct and not weird?',
       b'How much of a "leg up" would the Cornell human ecology be for a student? Worth the amount of money it would cost?',
       b'Will Trump get away with all of his lies and blatantly corrupt behavior?',
       b'What is the meaning of freedom of speech for RSS & BJP person?',
       b'What is the altitude of Genting Highlands?',
       b'Is GPA useful for getting better universities through GRE?',
       b'Can you compare Pakistani with pigs?',
       b'How pashmina shawl is formed?', b'How is murder morally wrong?'],
      dtype=object)>

Build the model

three main architecture decisions:
1) how to represent the data (the text)
2) how many layers to use in the model 
3) how many **hidden** units to use for each layer 

Transfer Learning 
One way to represent the text is to convert sentences into embeddings vectors. We can use a pre-trained text embedding as the first layer, which will have three advantages:

1) we don't have to worry about text preprocessing,
2) we can benefit from transfer learning,
3) the embedding has a fixed size, so it's simpler to process.

https://blog.fastforwardlabs.com/2019/09/05/transfer-learning-from-the-ground-up.html






For this example we will use a pre-trained text embedding model from TensorFlow Hub called google/tf2-preview/gnews-swivel-20dim/1.

There are three other pre-trained models to test for the sake of this tutorial:

1) google/tf2-preview/gnews-swivel-20dim-with-oov/1 - same as google/tf2-preview/gnews-swivel-20dim/1, but with 2.5% vocabulary converted to OOV buckets. This can help if vocabulary of the task and vocabulary of the model don't fully overlap.

2) google/tf2-preview/nnlm-en-dim50/1 - A much larger model with ~1M vocabulary size and 50 dimensions.

3) google/tf2-preview/nnlm-en-dim128/1 - Even larger model with ~1M vocabulary size and 128 dimensions.

Let's first create a Keras layer that uses a TensorFlow Hub model to embed the sentences, and try it out on a couple of input examples. 

Note that no matter the length of the input text, the output shape of the embeddings is: (num_examples, embedding_dimension)

In [28]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

In [29]:
# building full model 

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [30]:
# model compile
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [31]:
# train model
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=validation_data.batch(512),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [32]:
# evaluate 
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

NameError: name 'test_data' is not defined

In [None]:
data.head()

In [None]:
data.head(1)

In [None]:
data.columns

In [None]:
train_validation_split = tfds.Split.TRAIN.subsplit([6, 4])

In [None]:
tf.reshape(train_data,((), ()))