In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the train/test files are read
# from paths under this mount point further down.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


### **Package Setup**

In [None]:
!pip install keras-bert

Collecting keras-bert
  Downloading https://files.pythonhosted.org/packages/e2/7f/95fabd29f4502924fa3f09ff6538c5a7d290dfef2c2fe076d3d1a16e08f0/keras-bert-0.86.0.tar.gz
Collecting keras-transformer>=0.38.0
  Downloading https://files.pythonhosted.org/packages/89/6c/d6f0c164f4cc16fbc0d0fea85f5526e87a7d2df7b077809e422a7e626150/keras-transformer-0.38.0.tar.gz
Collecting keras-pos-embd>=0.11.0
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.27.0
  Downloading https://files.pythonhosted.org/packages/e6/32/45adf2549450aca7867deccfa04af80a0ab1ca139af44b16bc669e0e09cd/keras-multi-head-0.27.0.tar.gz
Collecting keras-layer-normalization>=0.14.0
  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Collecting keras-position-wise-feed-forward>=0.6.0
  Downloading

### **TensorFlow Configuration**

We set an environment variable so that keras-bert uses tensorflow.python.keras.

In [None]:
import os

# Required to use tensorflow.python.keras with keras-bert.
os.environ.update({'TF_KERAS': '1'})

In [None]:
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

### **Setting up GPU**

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU's name means no usable GPU was found;
# stop the notebook here in that case.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))
print('We will use the GPU:', device_name)

Found GPU at: /device:GPU:0
We will use the GPU: /device:GPU:0


### **Load Train and Test Data**

In [None]:
# Both files use the same multi-character column separator, which is read
# with pandas' python parsing engine.
FIELD_SEPARATOR = '{}{}{}'

train = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/train_file.txt',
    sep=FIELD_SEPARATOR,
    engine='python',
)
test = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/test_file.txt",
    sep=FIELD_SEPARATOR,
    engine='python',
)

# No separate split is made: the test file doubles as the validation set.
val = test

### **Download Pretrained BERT Model**
Here I have downloaded the Base Cased pretrained model (12 layers, hidden size 768, 12 attention heads).

In [None]:
# Give -nc (--no-clobber) argument so that the file isn't downloaded multiple times 
!wget -nc https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip

--2020-12-01 18:16:15--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.212.128, 172.217.214.128, 108.177.111.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.212.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 404261442 (386M) [application/zip]
Saving to: ‘cased_L-12_H-768_A-12.zip’


2020-12-01 18:16:21 (67.2 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]



In [None]:
# Extract the downloaded BERT archive into the working directory.
# -n: never overwrite files that already exist, so re-running is harmless.
!unzip -n cased_L-12_H-768_A-12.zip

Archive:  cased_L-12_H-768_A-12.zip
   creating: cased_L-12_H-768_A-12/
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: cased_L-12_H-768_A-12/vocab.txt  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: cased_L-12_H-768_A-12/bert_config.json  


- vocab.txt is a plain file listing vocabulary items 
- bert_config.json consists of model configuration in JSON format
- bert_model.ckpt.* consists of model checkpoint data with pretrained weights

In [None]:
# Directory created by unzipping the pretrained model archive.
bert_model_dir = 'cased_L-12_H-768_A-12'

bert_vocab_path = f'{bert_model_dir}/vocab.txt'
bert_config_path = f'{bert_model_dir}/bert_config.json'
# Checkpoint prefix only; the file suffixes are not required.
bert_checkpoint_path = f'{bert_model_dir}/bert_model.ckpt'


Record whether the model we downloaded is case-sensitive (cased) or not.

In [None]:
# True because the downloaded checkpoint is the cased model
# (cased_L-12_H-768_A-12). Passed as `cased=` to the keras-bert Tokenizer.
model_is_cased = True

Shuffle the data to avoid bias

In [None]:
from sklearn.utils import shuffle

# Shuffle the training rows so they are not used in file order.
# A fixed random_state makes the resulting order identical on every run;
# without it, anything that indexes the shuffled frame by position (and the
# preview below) changes each time the notebook is executed.
train = shuffle(train, random_state=42)
train.head()

Unnamed: 0,Label,Text
16937,__label__sl,I Think You Know What I Mean Lyrics Lyle Love...
932,__label__ob,When should we commemorate the centenary of cy...
8086,__label__pb,I was in Starbucks the other day catching up w...
3724,__label__ne,A water bear (Paramacrobiotus craterlaki). Sci...
6178,__label__ne,Wind Could Supply 1/10th of the World's Power ...


### **Load BERT Vocabulary**
A plain text file with one vocabulary item per line

In [None]:
# Read the vocabulary file: one item per line, kept in file order because the
# position of an item in this list is later used as its token index.
# The encoding is stated explicitly since the file contains non-ASCII entries
# and the platform default encoding is not guaranteed to be UTF-8.
with open(bert_vocab_path, encoding='utf-8') as f:
    vocabulary = [line.rstrip('\n') for line in f]    # rstrip to remove newline characters


# Print a list with every 500th vocabulary item
print(vocabulary[0::500])
print(len(vocabulary))

['[PAD]', 'щ', '吉', 'told', 'space', 'operations', 'proposed', 'Oxford', 'showing', 'domestic', 'mountains', 'commission', 'voices', 'associate', 'hills', 'Guide', 'relaxed', 'Page', 'Heights', 'singers', 'Interior', 'considers', 'facilitate', 'shouting', '1826', 'constitute', 'alter', 'clip', 'Into', 'Memory', 'ballad', 'Owens', 'Langdon', 'aquatic', 'stereo', 'Cass', 'Shock', '195', '##tec', '##sonic', 'attested', '##rdes', '1840s', '##90', 'Guys', '##rien', 'Munro', 'Ursula', 'mesh', 'diplomacy', 'Newmarket', '##oughs', 'synthesizers', 'Drugs', 'monstrous', '##ynamic', 'troll', '##ٹ']
28996


### **Load BERT Configuration File**
The configuration is just a JSON file, so we use json.load from Python's json library. We won't actually need to use these configuration details directly (keras-bert takes care of them for us). Let's see what information is contained in the config file.

In [None]:
from pprint import pprint    # pretty-printer for output
import json

# Parse the model configuration. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform default.
with open(bert_config_path, encoding='utf-8') as f:
    config = json.load(f)


# Print configuration contents
pprint(config)

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 28996}


We can see dropout probability, hidden size, number of hidden layers, vocabulary size and many other parameters above

### **Create BERT Tokenizer**

To create the tokenizer, we'll need a mapping from vocabulary items to their integer indices.

In [None]:
import random
# Create mapping from vocabulary items to their indices in the vocabulary
token_dict = { value: i for i, value in enumerate(vocabulary) }


# Print some random examples of the mapping.
# random.sample draws without replacement, so exactly 10 distinct entries are
# shown; random.choices could repeat an entry, which dict() would collapse.
pprint(dict(random.sample(list(token_dict.items()), k=10)))

{'##ald': 18728,
 '##rts': 13245,
 '##tating': 24558,
 'Opening': 13902,
 'alley': 10959,
 'dedication': 13314,
 'forcibly': 23129,
 'gene': 5565,
 'shape': 3571,
 'werewolf': 14665}


We'll use the keras-bert Tokenizer for BERT tokenization. The implementation supports

- (Optional) lowercasing: Hello → hello
- Basic tokenization: Hello! → Hello !, multi-part → multi - part
- Wordpiece tokenization: comprehensively → comprehensive ##ly
- Adding special tokens: Sentence → [CLS] Sentence [SEP]
- Mapping to integer indices
- Generating segment sequence
- (Optional) padding and truncation to length

In the following example, notice how words not in the dictionary are broken up into subwords (with continuation parts starting with ##) and how unknown characters are mapped to a special unknown word token [UNK].



In [None]:
from keras_bert import Tokenizer


# Build the keras-bert tokenizer from the vocabulary-to-index mapping.
tokenizer = Tokenizer(token_dict, cased=model_is_cased)


# Try the tokenizer on a sample that ends with an out-of-vocabulary character.
sample_texts = ['I am doing NLP thesis :) 汉']
for sample_text in sample_texts:
    wordpieces = tokenizer.tokenize(sample_text)
    # max_len for padding and truncation
    indices, segments = tokenizer.encode(sample_text, max_len=20)
    decoded_text = ' '.join(tokenizer.decode(indices))

    print('Original string:', sample_text)
    print('Tokenized:', wordpieces)
    print('Encoded:', indices)
    print('Segments:', segments)
    print('Decoded:', decoded_text)
    print()

Original string: I am doing NLP thesis :) 汉
Tokenized: ['[CLS]', 'I', 'am', 'doing', 'NL', '##P', 'thesis', ':', ')', '汉', '[SEP]']
Encoded: [101, 146, 1821, 1833, 21239, 2101, 9593, 131, 114, 100, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Segments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded: I am doing NL ##P thesis : ) [UNK]



In [None]:
# max_le = 0
# i = 0
# for i, document in enumerate (train['Text'].values):
#   tokenized = tokenizer.tokenize(document)
#   document_length = len(tokenized)

#   if document_length > max_le:

#     max_le = document_length
#     i = i

# print(max)
# print(i)

In [None]:
# Number of tokens the tokenizer produces for one training document,
# selected by its position in the training frame.
#print(train['Text'].values[17587])
sample_tokens = tokenizer.tokenize(train['Text'].values[17587])
len(sample_tokens)

218

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so the
# train.head() preview on the next line produces no visible output.
train.head()
# Class labels of the training rows (strings such as '__label__sl').
train['Label'].values

### **Vectorize data**
Use a LabelEncoder to turn the class labels into integers.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels and convert each class name to an
# integer id in one step.
label_encoder = LabelEncoder()
labels = train['Label'].values
Y = label_encoder.fit_transform(labels)

# The fitted encoder holds one entry per distinct training label, which is
# the number of classes the model has to predict.
num_labels = len(label_encoder.classes_)


# Show the class count and the first few labels before and after encoding
print('Number of unique labels:', num_labels)
print(type(labels), labels[:10])
print(type(Y), Y[:10])

Number of unique labels: 26
<class 'numpy.ndarray'> ['__label__sl' '__label__ob' '__label__pb' '__label__ne' '__label__ne'
 '__label__ne' '__label__ne' '__label__ne' '__label__ib' '__label__ib']
<class 'numpy.ndarray'> [21 13 14 12 12 12 12 12 10 10]


In [None]:
# Encode the test labels with the mapping learned from the training labels.
# transform() (not fit_transform()) is used so that a class keeps the same
# integer id in both sets; re-fitting on the test labels would silently
# renumber the classes whenever the two label sets differ.
# A label that never occurred in training raises ValueError here.
y_val = label_encoder.transform(test['Label'].values)

Keep token indices and segment ids in separate numpy arrays. 

In [None]:
import numpy as np


def _encode_texts(texts, max_len=256):
    """Encode texts with the BERT tokenizer.

    Args:
        texts: iterable of strings to encode.
        max_len: length passed to ``tokenizer.encode`` for padding and
            truncation of every sequence.

    Returns:
        A ``(token_indices, segment_ids)`` pair of lists, each holding one
        entry per input text, in input order.
    """
    token_indices, segment_ids = [], []
    for text in texts:
        # tokenizer.encode() returns a sequence of token indices
        # and a sequence of segment IDs. BERT expects both as input,
        # even if the segment IDs are just all zeros (like here).
        indices, segments = tokenizer.encode(text, max_len=max_len)
        token_indices.append(indices)
        segment_ids.append(segments)
    return token_indices, segment_ids


# BERT tokenizer indices and their segment ids (to separate sequences)
train_token_indices, train_segment_ids = _encode_texts(train['Text'].values)
val_token_indices, val_segment_ids = _encode_texts(test['Text'].values)

# Format input as list of two numpy arrays
train_X = [np.array(train_token_indices), np.array(train_segment_ids)]
val_X = [np.array(val_token_indices), np.array(val_segment_ids)]


# Print some examples
# print('Token indices:')
# print(val_X[0][:2])
# print('Decoded:')
# for i in val_X[0][:2]:
#     print(tokenizer.decode(list(i)))
# print('Segment ids:')
# print(val_X[1][:2])
# print()
# print()

# print('Token indices:')
# print(train_X[0][:2])
# print('Decoded:')
# for i in train_X[0][:2]:
#     print(tokenizer.decode(list(i)))
# print('Segment ids:')
# print(train_X[1][:2])


In [None]:
# label_encode = {}
# for i, v  in enumerate(train['Label'].unique()):
#   label_encode[v] = i
# label_encode

In [None]:
# train['Label_enc'] = train['Label'].map(label_encode)
# val['Label_enc'] = val['Label'].map(label_encode)
# val.head()

In [None]:
from keras_bert import load_trained_model_from_checkpoint


# Arguments for building the Keras model from the downloaded checkpoint.
checkpoint_kwargs = dict(
    config_file=bert_config_path,
    checkpoint_file=bert_checkpoint_path,
    training=False,
    trainable=True,
    seq_len=256,    # same length the texts are encoded to (max_len=256)
)
pretrained_model = load_trained_model_from_checkpoint(**checkpoint_kwargs)

In [None]:
# pretrained_model is a Keras model, so the input tensors it expects can be
# inspected directly through its `inputs` attribute:
pretrained_model.inputs

In [None]:
# Likewise, the `outputs` attribute lists the tensors the model produces:
pretrained_model.outputs

In [None]:
#@title Print Model Summary

# NOTE(review): the saved output of this cell is a NameError, i.e. it was
# last run before pretrained_model was defined — re-run after the loading cell.
pretrained_model.summary()

NameError: ignored

In [None]:
# model.outputs is a list, here with a single item, so the output tensor is
# taken out of it first.
sequence_output = pretrained_model.outputs[0]

# Indexing that tensor with [:, 0] keeps every element of the batch (the `:`)
# and selects the first position in the sequence.
bert_out = sequence_output[:, 0]

print(bert_out)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
#num_labels = 26

# Classification head on top of the per-example BERT vector (bert_out):
# dropout, then a softmax over the num_labels classes.
# No input_shape is given to Dropout: in the functional API a layer takes its
# shape from the tensor it is called on, so the head is not tied to a
# hard-coded hidden size of 768.
dropout_layer = Dropout(.5)(bert_out)
out = Dense(num_labels, activation='softmax')(dropout_layer)
model = Model(
    inputs=pretrained_model.inputs,
    outputs=[out]
)

In [None]:
from keras_bert import calc_train_steps, AdamWarmup


# Schedule settings; the same batch size and epoch count are passed to
# model.fit() later in the notebook.
BATCH_SIZE = 8
EPOCHS = 3

# Calculate the total number of training steps and the number of warmup steps
total_steps, warmup_steps = calc_train_steps(
    num_example=len(train),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    warmup_proportion=0.1,
)

# Weight decay is applied to parameters whose names match these patterns.
decayed_weight_patterns = ['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo']

optimizer = AdamWarmup(
    total_steps,
    warmup_steps,
    lr=2e-5,
    epsilon=1e-6,
    weight_decay=0.01,
    weight_decay_pattern=decayed_weight_patterns,
)

In [None]:
# Import from tensorflow.keras rather than the standalone keras package: the
# notebook sets TF_KERAS=1 and builds the model with tensorflow.keras, so the
# two Keras implementations should not be mixed.
from tensorflow.keras.metrics import sparse_categorical_accuracy

# Labels are integer class ids, hence the sparse loss and accuracy.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)

In [None]:
# from tensorflow import keras
# model = keras.models.load_model('/content/drive/My Drive/Colab Notebooks/assets')
# Fine-tune the model on the encoded training inputs and integer labels,
# validating on the encoded test file after each epoch. epochs and batch_size
# repeat the values given to calc_train_steps. The returned history object is
# used by plot_history() further down.
history = model.fit(
    train_X,
    Y,
    epochs=3,
    batch_size=8,
    validation_data= (val_X,y_val)
    
)

In [None]:
# Display the training input: a list of [token index array, segment id array].
train_X

In [None]:
# os.chdir('/content/drive/My Drive/Colab Notebooks/')
# os.getcwd()
# !ls

In [None]:
# #!pip install numba 
# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [None]:

%matplotlib inline
import matplotlib.pyplot as plt



def plot_history(history):
    """Plot training and validation accuracy per epoch.

    Args:
        history: object returned by ``model.fit``; its ``history`` dict must
            contain the keys ``'sparse_categorical_accuracy'`` and
            ``'val_sparse_categorical_accuracy'``.
    """
    # Explicit figure/axes so the plot carries its own title and axis labels
    # and can be understood without reading the surrounding cells.
    fig, ax = plt.subplots()
    ax.plot(history.history['sparse_categorical_accuracy'], label="Training set accuracy")
    ax.plot(history.history['val_sparse_categorical_accuracy'], label="Validation set accuracy")
    ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
    ax.legend()
    plt.show()


plot_history(history)

In [None]:
# Evaluate the trained model on the validation inputs and labels
# (both built from the test file).
model.evaluate(val_X,y_val)


In [None]:
# val_X is a two-element list [token indices, segment ids], so this slice
# returns both arrays in full rather than the first two examples.
val_X[:2]

In [None]:
# Run the model on the first five validation examples; each output row is
# the softmax distribution over the classes for one example.
a = model.predict([val_X[0][:5], val_X[1][:5]])
a

In [None]:
# Index of the highest-scoring class in each prediction row.
a.argmax(axis = 1)

In [None]:
# Encoded true labels of the first five validation examples, for comparison.
y_val[:5]

In [None]:

# Token indices and segment ids of the validation example at position 1.
[val_X[0][1], val_X[1][1]]

In [None]:
from sklearn import metrics

# confusion_matrix requires both the true and the predicted labels.
# The model outputs one probability row per example, so the predicted class
# is the index of the largest value in each row.
val_pred = model.predict(val_X).argmax(axis=1)
matrix = metrics.confusion_matrix(y_val, val_pred)