This experiment was conducted on Google Colab.

In [1]:
# Colab magic that selects TensorFlow 1.x; the rest of this notebook relies on
# TF1 graph-mode APIs (tf.Session, tf.placeholder, hub.Module).
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
!pip install bert-tensorflow



### Import Libraries

In [0]:
import logging
logging.disable(logging.CRITICAL)

import numpy as np
import pandas as pd

from bert import tokenization
import tensorflow as tf
import tensorflow_hub as hub

import warnings
warnings.filterwarnings("ignore")

### Load Data

In [0]:
# Location of the labelled review CSV. The path expects Google Drive to be
# available under /content/drive (no mount step is visible in this notebook).
REVIEWS_CSV_PATH = "/content/drive/My Drive/Colab Notebooks/Feature Extraction and Sentiment Analysis BERT/Womens-Clothing-E-Commerce-Reviews.csv"

# Read the reviews into a DataFrame, decoding the file as UTF-8.
data = pd.read_csv(REVIEWS_CSV_PATH, encoding="utf-8")

In [5]:
# Preview the first rows to check that the `reviews` and `sentiment` columns loaded as expected.
data.head()

Unnamed: 0,reviews,sentiment
0,I had such high hopes for this dress and reall...,0
1,"I love tracy reese dresses, but this one is no...",0
2,Dress runs small esp where the zipper area run...,0
3,This dress is perfection! so pretty and flatte...,1
4,More and more i find myself reliant on the rev...,1


In [6]:
# Class balance check: number of reviews per sentiment label.
data.sentiment.value_counts()

1    5000
0    4000
Name: sentiment, dtype: int64

In [0]:
# Model inputs are the raw review texts; the target is the sentiment label.
X, y = data["reviews"], data["sentiment"]

### Data Partitioning

In [8]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the data, keeping the class ratio of `y`.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# Step 2: split the hold-out pool in half -> validation and test (10% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0, stratify=y_holdout
)

split_report = """Training shape: {}
Validation shape: {}
Testing shape: {}"""
print(split_report.format(X_train.shape, X_val.shape, X_test.shape))

Training shape: (7200,)
Validation shape: (900,)
Testing shape: (900,)


### Load pretrained BERT module from tensorflow hub

In [0]:
# TF-Hub handle of the uncased (all lowercase) BERT module
# (L-12 / H-768 / A-12 variant, per the handle name).
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Module object shared by the tokenizer setup and the embedding extraction below.
bert_module = hub.Module(spec=BERT_MODEL_HUB)

### Create Tokenizer

In [0]:
def create_tokenizer_from_hub_module():
    """Build a `tokenization.FullTokenizer` configured from the BERT hub module.

    The module-level `bert_module` exposes a "tokenization_info" signature;
    its `vocab_file` and `do_lower_case` outputs are evaluated in a short-lived
    session and passed straight to the tokenizer constructor.

    Returns:
        A `tokenization.FullTokenizer` using the module's vocabulary file and
        lower-casing setting.
    """
    info = bert_module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]

    # The two values are graph tensors, so they must be evaluated in a session.
    with tf.Session() as session:
        vocab_path, lower_case = session.run(fetches)

    return tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)


tokenizer = create_tokenizer_from_hub_module()

### Create BERT Inputs

In [0]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Turn one sentence into fixed-length BERT inputs.

    The sentence is tokenized, wrapped as `[CLS] ... [SEP]`, truncated so the
    wrapped sequence never exceeds `max_seq_len`, converted to ids and then
    right-padded with zeros up to `max_seq_len`.

    Args:
        sentence: Text to encode.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_seq_len: Length of every returned list; must be at least 2 so that
            both `[CLS]` and `[SEP]` fit.

    Returns:
        A tuple `(input_ids, input_mask, segment_ids)` of lists:
        `input_ids` holds the token ids followed by zero padding,
        `input_mask` is 1 for real tokens and 0 for padding, and
        `segment_ids` is all zeros (single-segment input).

    Raises:
        ValueError: If `max_seq_len` is smaller than 2. Previously such values
            silently produced malformed output (missing `[CLS]`, or lists
            longer than `max_seq_len`).
    """
    if max_seq_len < 2:
        raise ValueError(
            "max_seq_len must be at least 2 to hold [CLS] and [SEP], got {}".format(max_seq_len))

    # Reserve two positions for the special tokens and truncate the rest.
    word_pieces = tokenizer.tokenize(sentence)[:max_seq_len - 2]
    tokens = ['[CLS]'] + word_pieces + ['[SEP]']

    # Copy into a fresh list so padding never mutates an object owned by the tokenizer.
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    n_real = len(input_ids)
    zero_padding = [0] * (max_seq_len - n_real)

    input_mask = [1] * n_real + zero_padding
    segment_ids = [0] * n_real + zero_padding
    input_ids = input_ids + zero_padding

    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len):
    """Encode many sentences into parallel lists of BERT inputs.

    Each sentence is passed to `convert_sentence_to_features`; the three
    per-sentence results are collected into three lists that share the order
    of `sentences`.

    Args:
        sentences: Iterable of texts to encode.
        tokenizer: Tokenizer forwarded to `convert_sentence_to_features`.
        max_seq_len: Sequence length forwarded to `convert_sentence_to_features`.

    Returns:
        A tuple `(all_input_ids, all_input_mask, all_segment_ids)`, each a list
        with one entry per sentence.
    """
    all_input_ids, all_input_mask, all_segment_ids = [], [], []
    collectors = (all_input_ids, all_input_mask, all_segment_ids)

    for text in sentences:
        features = convert_sentence_to_features(text, tokenizer, max_seq_len)
        # features is (input_ids, input_mask, segment_ids): route each to its list.
        for collector, feature in zip(collectors, features):
            collector.append(feature)

    return all_input_ids, all_input_mask, all_segment_ids

### Feature Extraction

In [0]:
def creat_BERT_embeddings(sentences, tokenizer, max_seq_len, batch_size=512):
    """Compute BERT pooled-output embeddings for a collection of sentences.

    Every sentence is converted to BERT inputs with
    `convert_sentences_to_features`; the module-level `bert_module` is then
    run batch by batch through its "tokens" signature and the `pooled_output`
    vector of each sentence is collected.

    Args:
        sentences: Sentences to embed (e.g. a list or a pandas Series of str).
        tokenizer: Tokenizer forwarded to `convert_sentences_to_features`.
        max_seq_len: Fixed length every sentence is truncated / padded to.
        batch_size: Number of sentences evaluated per `sess.run` call.
            Defaults to 512, the value that used to be hard-coded here.

    Returns:
        A numpy array with one row of pooled-output values per sentence, in
        input order. With no sentences the result is an empty array.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

    all_input_ids, all_input_mask, all_segment_ids = convert_sentences_to_features(
        sentences, tokenizer, max_seq_len)

    ### SIGNATURE
    # NOTE: each call adds fresh placeholders and a fresh application of
    # `bert_module` to the default graph, exactly as before.
    input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
    input_mask = tf.placeholder(dtype=tf.int32, shape=[None, None])
    segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])

    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)

    bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)
    # Only the pooled vector is returned, so only that tensor is fetched;
    # fetching the whole dict would also copy out `sequence_output` per batch.
    pooled_output = bert_outputs['pooled_output']

    pooled_embeddings = []

    ### BATCH PROCESS
    # The inputs are already in-memory Python lists, so they are batched with
    # plain slicing. Routing them through Dataset.from_tensor_slices would embed
    # the full arrays in the graph as constants on every call.
    with tf.Session() as sess:
        # Variables must be initialised in this new session before running the module.
        # NOTE(review): presumably this is what loads the pretrained hub weights — confirm.
        sess.run(tf.global_variables_initializer())

        for start in range(0, len(all_input_ids), batch_size):
            stop = start + batch_size
            batch_pooled = sess.run(
                pooled_output,
                feed_dict={input_ids: all_input_ids[start:stop],
                           input_mask: all_input_mask[start:stop],
                           segment_ids: all_segment_ids[start:stop]})
            pooled_embeddings.extend(batch_pooled.tolist())

    return np.array(pooled_embeddings)

In [13]:
%%time
# Embed the training and validation reviews (sequence length 128) and wrap each
# result in a DataFrame whose columns are named Col_0, Col_1, ...
train_pooled = creat_BERT_embeddings(sentences=X_train, tokenizer=tokenizer, max_seq_len=128)
X_train_embeddings = pd.DataFrame(train_pooled).add_prefix('Col_')

val_pooled = creat_BERT_embeddings(sentences=X_val, tokenizer=tokenizer, max_seq_len=128)
X_val_embeddings = pd.DataFrame(val_pooled).add_prefix('Col_')

CPU times: user 43.2 s, sys: 17.2 s, total: 1min
Wall time: 56.2 s


In [14]:
# Inspect the first few embedding rows (one row per training review, one column per embedding dimension).
X_train_embeddings.head()

Unnamed: 0,Col_0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,Col_11,Col_12,Col_13,Col_14,Col_15,Col_16,Col_17,Col_18,Col_19,Col_20,Col_21,Col_22,Col_23,Col_24,Col_25,Col_26,Col_27,Col_28,Col_29,Col_30,Col_31,Col_32,Col_33,Col_34,Col_35,Col_36,Col_37,Col_38,Col_39,...,Col_728,Col_729,Col_730,Col_731,Col_732,Col_733,Col_734,Col_735,Col_736,Col_737,Col_738,Col_739,Col_740,Col_741,Col_742,Col_743,Col_744,Col_745,Col_746,Col_747,Col_748,Col_749,Col_750,Col_751,Col_752,Col_753,Col_754,Col_755,Col_756,Col_757,Col_758,Col_759,Col_760,Col_761,Col_762,Col_763,Col_764,Col_765,Col_766,Col_767
0,-0.659489,-0.46473,-0.957151,0.670922,0.945277,-0.298615,0.140124,0.212573,-0.861705,-0.9997,-0.605654,0.984324,0.948046,0.377068,0.884211,-0.497775,0.245334,-0.585299,0.073094,0.828752,0.664243,0.999999,-0.093136,0.314149,0.334059,0.993542,-0.779223,0.915568,0.843669,0.615497,-0.278391,0.118742,-0.990054,-0.158134,-0.984163,-0.97716,0.491682,-0.479324,0.058912,0.34722,...,0.180769,-0.245119,-0.294652,-0.367644,0.68581,-0.846489,-0.579735,-0.348997,0.609319,0.204738,0.999999,-0.857483,-0.876669,-0.571752,-0.386892,0.418586,-0.258973,-1.0,0.03063,-0.837558,0.699402,-0.771091,0.95684,-0.691685,-0.888869,-0.170056,0.751165,0.885666,-0.440122,-0.417689,0.632071,-0.130147,0.991678,0.745128,-0.032809,0.352095,0.702057,-0.953417,-0.630775,0.868778
1,-0.567957,-0.450326,-0.780838,0.408273,0.579154,-0.090489,0.404886,0.285569,-0.289359,-0.999791,-0.32513,0.806307,0.966448,0.221185,0.85777,-0.258594,0.619497,-0.625398,0.350537,0.707224,0.623491,0.999912,0.238879,0.364654,0.388217,0.784246,-0.573318,0.908454,0.889738,0.708217,-0.25357,0.229095,-0.988088,-0.245612,-0.871597,-0.989617,0.396455,-0.499493,-0.011006,0.184007,...,0.189193,-0.274724,-0.354285,-0.469935,0.679869,-0.551901,-0.607671,-0.321371,0.512774,0.241593,0.999916,-0.68818,-0.585825,-0.206715,-0.336971,0.532936,-0.00577,-1.0,0.172926,-0.3818,0.659361,-0.413921,0.699115,-0.083048,-0.899891,-0.401575,0.639334,0.531187,-0.429557,-0.039638,0.566836,0.550733,0.766385,0.746976,0.336601,0.401434,0.655184,-0.717869,-0.712284,0.847127
2,-0.778795,-0.34661,-0.902236,0.531263,0.782789,-0.197437,0.377878,0.10408,-0.851841,-0.999705,-0.538332,0.974979,0.949982,0.351784,0.865813,-0.371862,0.156277,-0.538143,0.218601,0.621527,0.62011,0.999985,0.027218,0.234407,0.413431,0.991349,-0.763888,0.890116,0.863652,0.659295,-0.198865,0.10773,-0.987395,-0.086717,-0.9516,-0.979777,0.386455,-0.403794,0.294576,0.305212,...,0.117183,-0.310559,-0.343416,-0.494404,0.74704,-0.752267,-0.510656,-0.408162,0.636763,0.078393,0.999989,-0.810404,-0.909096,-0.581506,-0.362381,0.397921,-0.180872,-1.0,0.206304,-0.714427,0.704334,-0.80921,0.85282,-0.577445,-0.87741,-0.120543,0.718491,0.840978,-0.46072,-0.380659,0.517358,-0.317557,0.988715,0.731808,0.293118,0.324444,0.60528,-0.817181,-0.652367,0.846241
3,-0.724027,-0.349603,-0.934234,0.500797,0.820521,-0.162249,0.585874,0.100884,-0.8077,-0.999935,-0.212136,0.953452,0.936226,0.366095,0.83588,-0.501214,-0.002248,-0.53931,0.040113,0.471419,0.549526,0.999991,0.092836,0.117571,0.143752,0.98458,-0.592104,0.819739,0.889904,0.67803,-0.298469,-0.01576,-0.97788,-0.044367,-0.958167,-0.980254,0.26136,-0.599832,0.103838,0.169739,...,0.058251,-0.189173,-0.25944,-0.349394,0.798764,-0.77973,-0.398215,-0.379223,0.706953,0.130084,0.999987,-0.835167,-0.816031,-0.365787,-0.243654,0.213036,-0.33172,-1.0,0.036632,-0.532156,0.711096,-0.792048,0.906653,-0.559458,-0.919063,0.079833,0.594124,0.763141,-0.452002,-0.573433,0.602281,0.379347,0.974111,0.61126,0.600111,0.261441,0.68641,-0.902852,-0.473035,0.827135
4,-0.908149,-0.56947,-0.996126,0.915152,0.963255,-0.250603,0.860438,0.349019,-0.976684,-0.999993,-0.932177,0.991449,0.935429,0.855076,0.907463,-0.854554,-0.213438,-0.669507,0.324293,-0.002271,0.755913,1.0,-0.582461,0.392748,0.607291,0.999589,-0.922741,0.927927,0.92503,0.699714,-0.657172,0.297713,-0.984647,-0.420417,-0.995937,-0.989566,0.702664,-0.730054,0.015209,0.021079,...,0.662394,-0.561816,-0.540575,-0.710651,0.8649,-0.723948,-0.786676,-0.689634,0.822122,0.373334,1.0,-0.968438,-0.993598,-0.719559,-0.580187,0.548467,-0.672335,-1.0,0.234993,-0.940505,0.933753,-0.96516,0.986422,-0.9136,-0.977074,-0.352261,0.725661,0.959849,-0.587544,-0.86789,0.786355,-0.857022,0.998797,0.826229,-0.763918,-0.327413,0.773089,-0.973204,-0.716175,0.904148


### Create Sentiment Analysis Model

In [0]:
# Feed-forward classifier on top of the 768-dimensional BERT embedding:
# three tanh hidden layers (768, 512, 512 units) followed by a 2-way softmax.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=768, input_dim=768, activation=tf.nn.tanh))
for hidden_units in (512, 512):
    model.add(tf.keras.layers.Dense(units=hidden_units, activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(units=2, activation=tf.nn.softmax))

In [0]:
# The network ends in a 2-unit softmax and is trained on one-hot targets
# (built with to_categorical), so the matching loss is categorical
# cross-entropy. With 'binary_crossentropy' Keras treats the two outputs as
# independent binary problems and resolves 'accuracy' to binary accuracy,
# which only coincides with the intended metric because there are two classes.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [56]:
# One-hot encode the labels to match the model's 2-unit softmax output.
y_train_k = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_val_k = tf.keras.utils.to_categorical(y_val, num_classes=2)

# Explicit History callback so the per-epoch metrics can be read back later.
history = tf.keras.callbacks.History()

fit_options = dict(
    validation_data=(X_val_embeddings, y_val_k),
    epochs=3,
    batch_size=2000,
    callbacks=[history],
    shuffle=True,
    verbose=True,
)
model.fit(X_train_embeddings, y_train_k, **fit_options)

Train on 7200 samples, validate on 900 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f28ed7df470>

In [33]:
%%time
# Embed the held-out test reviews with the same tokenizer and sequence length
# used for the training and validation sets.
test_pooled = creat_BERT_embeddings(sentences=X_test, tokenizer=tokenizer, max_seq_len=128)
X_test_embeddings = pd.DataFrame(test_pooled).add_prefix('Col_')

CPU times: user 7.08 s, sys: 2.01 s, total: 9.09 s
Wall time: 8.57 s


In [0]:
# One-hot encode the test labels, then score the trained model on the test
# embeddings; evaluate() yields the loss followed by the compiled metric.
y_test_k = tf.keras.utils.to_categorical(y_test, num_classes=2)
test_scores = model.evaluate(X_test_embeddings, y_test_k, verbose=0)
loss, test_accuracy = test_scores

### Prediction

In [58]:
from sklearn.metrics import accuracy_score

# Per-class scores for every test review.
y_pred = model.predict(np.array(X_test_embeddings))

# Collapse one-hot targets and predicted scores to class indices before comparing.
true_labels = y_test_k.argmax(axis=1)
predicted_labels = y_pred.argmax(axis=1)

acc = accuracy_score(true_labels, predicted_labels)
print("Prediction Accuracy:", round(acc, 3))

Prediction Accuracy: 0.861


### Result & Analysis

In [0]:
# for visualization
# Per-epoch curves recorded by the History callback during fit().
metrics_log = history.history
his_train_acc, his_val_acc = metrics_log['acc'], metrics_log['val_acc']
his_train_loss, his_val_loss = metrics_log['loss'], metrics_log['val_loss']

# Final-epoch accuracies and the test accuracy, rounded for the results table.
train_accuracy = round(his_train_acc[-1], 3)
val_accuracy = round(his_val_acc[-1], 3)
test_accuracy = round(test_accuracy, 3)

In [60]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ---------------------- Sub-Plots --------------------
# 2x2 dashboard: results table and class pie on top, accuracy and loss curves below.
panel_titles = ("Results", "Unique Categories Distribution", "Accuracy Vs Epochs", "Loss Vs Epochs")
panel_specs = [
    [{"type": "table"}, {"type": "pie"}],
    [{"type": "scatter"}, {"type": "scatter"}],
]

fig = make_subplots(
    rows=2,
    cols=2,
    shared_xaxes=False,
    vertical_spacing=0.08,
    horizontal_spacing=0.07,
    subplot_titles=panel_titles,
    specs=panel_specs,
)

# ---------------------- Table -----------------------
# Top-left panel: final training / validation / test accuracy as a small table.
table_header = dict(
    values=["", "Accuracy"],
    font=dict(size=15, color='white'),
    align="left",
    height=30,
    fill={'color':'rgb(55, 83, 109)'},
)
table_cells = dict(
    values=[["Training", "Validation", "Test"],
            [str(train_accuracy), str(val_accuracy), str(test_accuracy)]],
    font=dict(size=15),
    align="left",
    height=25,
)

fig.add_trace(go.Table(header=table_header, cells=table_cells), row=1, col=1)

# ---------------------- Pie -------------------------
# Top-right panel: class distribution of the dataset. The counts are read from
# the loaded data instead of being hard-coded, so the chart stays correct if
# the CSV changes. Label 1 is shown as "Positive", label 0 as "Negative".
sentiment_counts = data.sentiment.value_counts()
fig.add_trace(
      go.Pie(labels=['Positive', 'Negative'],
             values=[int(sentiment_counts.get(1, 0)), int(sentiment_counts.get(0, 0))],
             name="Sentiment",
             hole=0.5,
             automargin=True),
    row=1, col=2)

# ---------------------- Line Plots ------------------
# Bottom row: accuracy (left) and loss (right) per epoch. Both panels draw a
# training and a validation curve; only the accuracy traces keep their legend
# entries, so each colour is listed once.
curve_panels = (
    # (column, y-axis title, training curve, validation curve, extra trace options)
    (1, "Accuracy", his_train_acc, his_val_acc, {}),
    (2, "Loss", his_train_loss, his_val_loss, {"showlegend": False}),
)
curve_styles = (("Training", "darkorange"), ("Validation", "seagreen"))

for panel_col, y_title, train_curve, val_curve, trace_options in curve_panels:
    for (trace_name, trace_color), curve in zip(curve_styles, (train_curve, val_curve)):
        # Epochs are numbered from 1 on the x-axis.
        fig.add_trace(
            go.Scatter(x=list(range(1, len(curve) + 1)),
                       y=curve,
                       name=trace_name,
                       line=dict(color=trace_color),
                       **trace_options),
            row=2, col=panel_col)

    fig.update_xaxes(title_text="Epochs", showgrid=True, row=2, col=panel_col)
    fig.update_yaxes(title_text=y_title, showgrid=True, row=2, col=panel_col)

# Set the overall figure title, then render the dashboard.
fig.update_layout(title_text="Sentiment Analysis Result")

fig.show()