## OutdoorSeating BERT

To adapt to a different attribute of interest, look out for `#change attribute` and replace with attribute name (ie. "OutdoorSeating") -- spelling must be correct! 

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import BertTokenizer
import tensorflow as tf

In [3]:
reviews = pd.read_csv("OutdoorSeating_>49.csv")

In [4]:
reviews.head(3)

Unnamed: 0,text,OutdoorSeating
0,"['Very relax friendly environment, the sandwic...",1
1,['Fun visit. Pizza crust was firm and crunchy....,1
2,['Great new E Milton Square spot - love sittin...,1


In [5]:
reviews.OutdoorSeating.value_counts()

1    2957
0    2957
Name: OutdoorSeating, dtype: int64

In [6]:
texts = reviews["text"]
labels = reviews["OutdoorSeating"] #change attribute

In [7]:
texts[0:6]

0    ['Very relax friendly environment, the sandwic...
1    ['Fun visit. Pizza crust was firm and crunchy....
2    ['Great new E Milton Square spot - love sittin...
3    ["Customer service was very good; the employee...
4    ["Delicious food and friendly service.  We had...
5    ["Cedar Hills has been ready for more elevated...
Name: text, dtype: object

In [8]:
labels[0:6]

0    1
1    1
2    1
3    1
4    1
5    1
Name: OutdoorSeating, dtype: int64

In [9]:
max_length = 512 #max token length BERT will take in according to paper https://arxiv.org/pdf/1905.05583.pdf
batch_size = 6

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def convert_to_feature(review):
    """one-stop step to tokenize, WordPiece vector map, add special tokens,
    and truncate reviews longer than max length"""
    return tokenizer.encode_plus(review,
                                add_special_tokens = True, #add [CLS] and [SEP] tokens
                                truncation = True, 
                                padding = "max_length", #add [PAD] tokens for reviews shorter than max_length
                                return_attention_mask = True, #add attention mask to not focus on [PAD]
                                )

In [11]:
convert_to_feature("This is a test")

{'input_ids': [101, 2023, 2003, 1037, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Training Dataset
### x_train

In [12]:
def encode_texts(texts):
    """create lists to build final TensorFlow dataset"""
    
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    i = 0
    
    for review in texts:

        bert_input = convert_to_feature(review)

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        i += 1
        
    bert_inputs = np.array([input_ids_list, attention_mask_list, token_type_ids_list])
        
    return bert_inputs

In [13]:
x_train_1 = encode_texts(texts)

In [14]:
x_train_1.shape

(3, 5914, 512)

In [15]:
x_train_1[0].shape

(5914, 512)

In [16]:
x_train_1[0][0]

array([  101,  1031,  1005,  2200,  9483,  5379,  4044,  1010,  1996,
       22094,  2024,  2307,  2003,  2066,  2042,  2067,  2188,  1012,
        1996,  2047,  3295,  2003,  2126,  7046,  2061,  2003,  2428,
       22445,  2061,  2111,  2064,  2835,  1998,  5959,  2037,  7954,
        1012,  2023,  2003,  2026,  3962,  2320,  1037,  2733,  2005,
        6350,  1005,  1010,  1005,  2028,  1997,  2026,  5440,  3182,
        2000,  2175,  2005,  6350,   999,  2027,  4423,  1998,  2027,
        7597,  2205,  1012,  2021,  3452,  2204,  2173,  1010,  2200,
        2714,  2000,  5984, 14735,  2078,  2806, 16673,  1012,  1005,
        1010,  1000,  2023,  2003,  2026,  5440,  2173,  2000,  4521,
        2043,  1045,  1005,  1049,  2006,  2023,  2217,  1997,  2237,
         999,  3251,  2009,  1005,  1055,  2005,  6350,  1010,  6265,
        2030,  2130, 18064,  1010,  2673, 23651,  2026, 10908,  1012,
        1996,  7861,  9739,  8447,  2015,  2024,  2026,  5440,  1012,
        1996,  4825,

### y_train

In [17]:
labels.shape[0]

5914

In [18]:
y_train_1 = np.array(labels)
y_train_1.shape

(5914,)

## Validation Dataset

In [29]:
training_examples

array([1, 1, 1, ..., 0, 1, 1])

In [46]:
numSentences = len(x_train_1[0])
np.random.seed(0)
training_examples = np.random.binomial(1, 0.8, numSentences)

trainSentence_ids = []
trainMasks = []
trainSequence_ids = []

testSentence_ids = []
testMasks = []
testSequence_ids = []

Labels_train =[]
Labels_test = []


for example in range(numSentences):
    if training_examples[example] == 1:
        trainSentence_ids.append(x_train_1[0][example])
        trainMasks.append(x_train_1[1][example])
        trainSequence_ids.append(x_train_1[2][example])
        Labels_train.append(y_train_1[example])
    else:
        testSentence_ids.append(x_train_1[0][example])
        testMasks.append(x_train_1[1][example])
        testSequence_ids.append(x_train_1[2][example])
        Labels_test.append(y_train_1[example])
        
X_train = np.array([trainSentence_ids,trainMasks,trainSequence_ids])
X_test = np.array([testSentence_ids,testMasks,testSequence_ids])

Y_train = np.array(Labels_train).astype('float32').reshape((-1,1))
Y_test = np.array(Labels_test).astype('float32').reshape((-1,1))

X_train = [X_train[0], X_train[1], X_train[2]]
X_test = [X_test[0], X_test[1], X_test[2]]

## BERT Modeling

In [21]:
from transformers import TFBertModel

### BERT base

In [26]:
def BERT_model(max_input_length, train_layers, optimizer):
    """
    variables:
        max_input_length: number of tokens (max_length + 1)
        train_layers: number of layers to be retrained
        optimizer: optimizer to be used
    
    returns: model
    """
    
    in_id = tf.keras.layers.Input(shape=(max_input_length,), dtype='int32', name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_input_length,), dtype='int32', name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_input_length,), dtype='int32', name="segment_ids")
    
    
    bert_inputs = [in_id, in_mask, in_segment]
       
    # Note: Bert layer from Hugging Face returns two values: sequence ouput, and pooled output. Here, we only want
    # the former. (See https://huggingface.co/transformers/model_doc/bert.html#tfbertmodel) 

    bert_layer = TFBertModel.from_pretrained('bert-base-uncased')
    bert_sequence = bert_layer(bert_inputs)[0]
    
    print('Let us check the shape of the BERT layer output:', bert_sequence)
    
    dense = tf.keras.layers.Dense(256, activation='relu', name='dense')(bert_sequence)
    dense = tf.keras.layers.Dropout(rate=0.1)(dense)

    dense1 = tf.keras.layers.Dense(64, activation='relu', name='dense1')(dense)
    dense1 = tf.keras.layers.Dropout(rate=0.1)(dense1)
    
    pred = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(dense1) # binary activation output
  
    print('pred: ', pred)    
      
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()
    
    return model

In [47]:
model = BERT_model(
    max_length, 
    train_layers = 0, 
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005))

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Let us check the shape of the BERT layer output: KerasTensor(type_spec=TensorSpec(shape=(None, 512, 768), dtype=tf.float32, name=None), name='tf_bert_model_2/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model_2'")
pred:  KerasTensor(type_spec=TensorSpec(shape=(None, 512, 1), dtype=tf.float32, name=None), name='classifier/Sigmoid:0', description="created by layer 'classifier'")
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 512)]        0           []                               
                                                     

In [48]:
X_train[0].shape

(4739, 512)

In [49]:
Y_train.shape

(4739, 1)

In [50]:
X_test[0].shape

(1175, 512)

In [51]:
Y_test.shape

(1175, 1)

In [52]:
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
