In [1]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting matplotlib==3.0.3
  Downloading matplotlib-3.0.3-cp37-cp37m-manylinux1_x86_64.whl (13.0 MB)
[K     |████████████████████████████████| 13.0 MB 6.6 MB/s 
Collecting tensorflow==1.15
  Downloading tensorflow-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl (412.3 MB)
[K     |████████████████████████████████| 412.3 MB 23 kB/s 
Collecting bert-tensorflow==1.0.1
  Downloading bert_tensorflow-1.0.1-py2.py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.9 MB/s 
Collecting bentoml
  Downloading bentoml-1.0.2-py3-none-any.whl (774 kB)
[K     |████████████████████████████████| 774 kB 75.5 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Collecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 65.2 MB/s 
Collecting tensorflow-estimator==1.15.1
  Downloading t

In [2]:
# Run the project's config script in a subprocess; per the captured output
# it downloads NLTK corpora (stopwords, wordnet).
!python config.py

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
pip install tensorflow-gpu==1.15

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu==1.15
  Downloading tensorflow_gpu-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl (411.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.5/411.5 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-1.15.0
[0m

In [4]:
# import packages

import os
import re
import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [5]:
import tensorflow as tf
from tensorflow.python.client import device_lib
# Sanity check: enumerate the devices TensorFlow can see. A "GPU" entry in
# the printed list means an accelerator is available to the session.
visible_devices = device_lib.list_local_devices()
print(visible_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3160872522097742575
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 2097244281006896592
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 9276447438144457617
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15964005991
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3522835478767124150
physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"
]


In [6]:
import bert
# import python modules defined by BERT
from bert import optimization
from bert import run_classifier
from bert import tokenization




In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
import logging
import sys

# Switch off all logging output: the threshold sys.maxsize is higher than
# any logging level, so every record is suppressed.
logging.disable(level=sys.maxsize)

In [9]:
## Load the Dreaddit train / test splits

train, test = (
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in ('dreaddit-train.csv', 'dreaddit-test.csv')
)

# Names of the column holding the input text and the column holding the class id.
DATA_COLUMN, LABEL_COLUMN = 'text', 'label'
# label_list enumerates the possible class ids (in other datasets this could
# be True/False or 'dog'/'cat'); here the labels are 0 and 1.
label_list = [0, 1]

In [10]:
# Wrap every row in the InputExample container from BERT's run_classifier
# module so the library's feature-conversion code can consume the data.
def _row_to_input_example(row):
    """Build a single-sentence InputExample from one DataFrame row."""
    return bert.run_classifier.InputExample(
        guid=None,  # globally unique id for bookkeeping; unused in this notebook
        text_a=row[DATA_COLUMN],
        text_b=None,  # no second text segment is supplied
        label=row[LABEL_COLUMN],
    )


train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)

In [11]:
# Build the tokenizer from the vocabulary file and casing flag that ship with
# the BERT TF-Hub module.

# TF-Hub handle of an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Return a FullTokenizer configured from the Hub module's own metadata.

    The module's "tokenization_info" signature is evaluated inside a
    throwaway graph and session to obtain the vocabulary file and the
    lower-casing flag, which are then passed to the tokenizer.
    """
    scratch_graph = tf.Graph()
    with scratch_graph.as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

    return bert.tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [12]:
# Set the maximum sequence length. 
def get_max_len(text):
    """Return the length of the longest element in ``text``.

    Args:
        text: iterable of sized items, e.g. a pandas Series whose values are
            the word lists produced by ``Series.str.split``.

    Returns:
        The largest ``len(item)`` found, or 0 when ``text`` is empty.
    """
    # Iterate over the argument itself. Looping over ``range(len(train))``
    # tied the function to the global training frame: a shorter series raised
    # IndexError and a longer one had its trailing rows ignored.
    return max((len(tokens) for tokens in text), default=0)

# Length, in space-separated words, of the longest training text.
# NOTE(review): this counts words split on single spaces, not the tokens the
# BERT tokenizer produces, so it may under-estimate the tokenized length --
# confirm this is intended.
temp = train['text'].str.split(' ')
max_len = get_max_len(temp)

In [13]:
# The pre-trained BERT models support sequences of at most 512 tokens, so cap
# the data-derived length instead of passing an unbounded value downstream.
MAX_SEQ_LENGTH = min(max_len, 512)
# Convert our train and test examples to the InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples,
                                                                  label_list,
                                                                  MAX_SEQ_LENGTH,
                                                                  tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples,
                                                                 label_list,
                                                                 MAX_SEQ_LENGTH,
                                                                 tokenizer)

In [14]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates a classification model: the BERT Hub module plus a softmax layer.

    Args:
        is_predicting: when true, the graph is built for prediction only:
            dropout is skipped and no loss is computed.
        input_ids: tensor passed to the module's "tokens" signature.
        input_mask: tensor passed to the module's "tokens" signature.
        segment_ids: tensor passed to the module's "tokens" signature.
        labels: integer class ids; one-hot encoded to compute the loss.
        num_labels: number of output classes.

    Returns:
        ``(predicted_labels, log_probs)`` when ``is_predicting`` is true,
        otherwise ``(loss, predicted_labels, log_probs)``. ``log_probs`` are
        log-softmax outputs, i.e. log-probabilities, not probabilities.
    """

    bert_module = hub.Module(BERT_MODEL_HUB, trainable=True)
    bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
    bert_outputs = bert_module(inputs=bert_inputs, signature="tokens", as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Task-specific classification layer that is tuned on top of BERT.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias",
                                  [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting while fitting the model. It must
        # not run at prediction time, where it would randomly perturb outputs.
        # NOTE: callers that pass is_predicting=False for evaluation as well
        # still get dropout there; a training-only switch would need an
        # additional flag from the caller.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # argmax over the class axis already yields one label per example.
        # No tf.squeeze here: it would collapse a batch holding a single
        # example to a scalar and lose the batch dimension that per-example
        # prediction output relies on.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # If we're train/eval, compute the cross-entropy between the
        # predicted distribution and the actual label.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

In [15]:
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    """Returns a `model_fn` closure for `tf.estimator.Estimator`.

    Args:
        num_labels: number of output classes, forwarded to `create_model`.
        learning_rate: forwarded to `bert.optimization.create_optimizer`.
        num_train_steps: total number of training steps, forwarded to the
            optimizer.
        num_warmup_steps: number of warmup steps, forwarded to the optimizer.

    Returns:
        A function ``model_fn(features, labels, mode, params)`` that builds
        the EstimatorSpec for the TRAIN, EVAL or PREDICT mode.
    """
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the graph for one Estimator mode and returns its EstimatorSpec."""
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        # PREDICT: only the forward pass is needed.
        if mode == tf.estimator.ModeKeys.PREDICT:
            (predicted_labels, log_probs) = create_model(True,
                                                         input_ids,
                                                         input_mask,
                                                         segment_ids,
                                                         label_ids,
                                                         num_labels)

            # NOTE: the 'probabilities' entry carries the log-probabilities
            # returned by create_model; the key is kept for existing callers.
            predictions = {'probabilities': log_probs, 'labels': predicted_labels}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL share the forward pass and the loss.
        (loss, predicted_labels, log_probs) = create_model(False,
                                                           input_ids,
                                                           input_mask,
                                                           segment_ids,
                                                           label_ids,
                                                           num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Build the optimizer only when training; evaluation has no use
            # for the update op or the optimizer's extra variables.
            train_op = bert.optimization.create_optimizer(loss,
                                                          learning_rate,
                                                          num_train_steps,
                                                          num_warmup_steps,
                                                          use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # EVAL: build the metric ops only here, where they are consumed.
        # NOTE(review): auc is computed from hard 0/1 predictions rather than
        # scores -- confirm this is the intended definition.
        eval_metrics = {
            "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
            "f1_score": tf.contrib.metrics.f1_score(label_ids, predicted_labels),
            "auc": tf.metrics.auc(label_ids, predicted_labels),
            "precision": tf.metrics.precision(label_ids, predicted_labels),
            "recall": tf.metrics.recall(label_ids, predicted_labels),
            "true_positives": tf.metrics.true_positives(label_ids, predicted_labels),
            "true_negatives": tf.metrics.true_negatives(label_ids, predicted_labels),
            "false_positives": tf.metrics.false_positives(label_ids, predicted_labels),
            "false_negatives": tf.metrics.false_negatives(label_ids, predicted_labels)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn

In [28]:
# Training hyperparameters
BATCH_SIZE = 24
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
# Fraction of the total training steps that are used as warmup steps
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) checkpoints and summaries are saved
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [29]:
# Compute the number of training steps (examples / batch size * epochs,
# truncated to an int) and the number of warmup steps derived from it.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [30]:
# Specify the output directory and how often (in steps) summaries and checkpoints are saved
OUTPUT_DIR = 'output'

run_config = tf.estimator.RunConfig(model_dir=OUTPUT_DIR,
                                    save_summary_steps=SAVE_SUMMARY_STEPS,
                                    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [31]:
# Build the model function from the label count and the optimisation
# schedule, then wrap it in an Estimator that uses the run configuration.
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
)

# NOTE(review): `params` presumably supplies the batch size to the input
# functions created with input_fn_builder -- confirm against the library.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params=dict(batch_size=BATCH_SIZE),
)


In [32]:
# Input function over the training features.
# drop_remainder only needs to be True when running on TPUs, so it stays False here.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False,
)


In [33]:
print('Beginning Training!')
current_time = datetime.now()

# Train the model, passing num_train_steps as the max_steps limit.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

# Report the wall-clock duration of the training call.
elapsed = datetime.now() - current_time
print("Training took time ", elapsed)

Beginning Training!
Training took time  0:05:57.412266


In [34]:
# Input function over the held-out test features, built with
# is_training=False for evaluation.
test_input_fn = bert.run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
)

In [35]:
# Evaluate on the test set; steps=None evaluates until the input function is
# exhausted. The resulting metrics dict is displayed as the cell output.
estimator.evaluate(test_input_fn, steps=None)

{'auc': 0.80218756,
 'eval_accuracy': 0.8027972,
 'f1_score': 0.81124485,
 'false_negatives': 66.0,
 'false_positives': 75.0,
 'global_step': 354,
 'loss': 0.74519736,
 'precision': 0.8015873,
 'recall': 0.8211382,
 'true_negatives': 271.0,
 'true_positives': 303.0}

In [36]:
def predict(in_sentences):
    """Classify raw sentences as "stress" or "non-stress".

    Args:
        in_sentences: iterable of strings to classify. It is materialised
            once, so one-shot iterables such as generators are accepted.

    Returns:
        One dict per input sentence, in input order, with the keys
        ``"text"`` (the sentence), ``"confidence"`` (the model's per-class
        log-probabilities as a list; index 0 is "non-stress", index 1 is
        "stress") and ``"labels"`` (the predicted class name).
        An empty input returns ``[]`` without invoking the estimator.
    """
    sentences = list(in_sentences)
    if not sentences:
        return []

    labels = ["non-stress", "stress"]
    labels_idx = [0, 1]
    # InputExample needs a label; 0 is a placeholder, since no loss is
    # computed when predicting. The empty guid is likewise a dummy value.
    input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
                      for x in sentences]

    input_features = run_classifier.convert_examples_to_features(input_examples,
                                                                 labels_idx,
                                                                 MAX_SEQ_LENGTH,
                                                                 tokenizer)

    predict_input_fn = run_classifier.input_fn_builder(features=input_features,
                                                       seq_length=MAX_SEQ_LENGTH,
                                                       is_training=False,
                                                       drop_remainder=False)

    predictions = estimator.predict(predict_input_fn)
    return [{"text": sentence, "confidence": list(prediction['probabilities']), "labels": labels[prediction['labels']]}
            for sentence, prediction in zip(sentences, predictions)]

In [37]:
# Example sentences to run through the fine-tuned classifier
pred_sentences = ["It's Friday! We wish you a nice start into the weekend!",
"Deep breathing exercises are very relaxing. It can also relieve the symptoms of stress and anxiety.",
"Do you like fruits? I like so much! Be Happy, Keep Smiling!"
]

In [39]:
# Classify the example sentences and display the per-sentence results
predictions = predict(pred_sentences)
predictions

[{'confidence': [-0.0051890453, -5.2637887],
  'labels': 'non-stress',
  'text': "It's Friday! We wish you a nice start into the weekend!"},
 {'confidence': [-0.0040529976, -5.510336],
  'labels': 'non-stress',
  'text': 'Deep breathing exercises are very relaxing. It can also relieve the symptoms of stress and anxiety.'},
 {'confidence': [-0.0058140685, -5.150384],
  'labels': 'non-stress',
  'text': 'Do you like fruits? I like so much! Be Happy, Keep Smiling!'}]