In [1]:
# Install the nightly TensorFlow build; -q keeps pip's own log output quiet.
# NOTE(review): no version is pinned, so a later re-run may install a different build.
!pip install -q tf-nightly

[K     |████████████████████████████████| 390.3MB 39kB/s 
[K     |████████████████████████████████| 460kB 47.6MB/s 
[K     |████████████████████████████████| 10.2MB 38.4MB/s 
[?25h

In [2]:
# Install the nightly TF Model Garden package, which provides the `official.*` modules imported below.
# NOTE(review): no version is pinned, so a later re-run may install a different build.
!pip install -q tf-models-nightly

[K     |████████████████████████████████| 1.0MB 2.7MB/s 
[K     |████████████████████████████████| 1.1MB 18.5MB/s 
[K     |████████████████████████████████| 36.6MB 1.3MB/s 
[K     |████████████████████████████████| 358kB 47.8MB/s 
[K     |████████████████████████████████| 276kB 45.4MB/s 
[K     |████████████████████████████████| 174kB 48.0MB/s 
[K     |████████████████████████████████| 102kB 10.7MB/s 
[?25h  Building wheel for pyyaml (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


In [3]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [4]:
# configuration files for BERT model
# Cloud Storage folder with the pre-trained BERT files; listing it shows the
# config, checkpoint and vocabulary files that later cells read.
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [6]:
# TF Hub URL for an uncased L-12/H-768/A-12 BERT model.
# NOTE(review): no other cell visible here references this name - confirm it is still needed.
hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"

In [7]:
# Tokenizer setup: build the tokenizer from the vocabulary file stored next
# to the pre-trained checkpoint, lower-casing input text.
vocab_path = os.path.join(gs_folder_bert, "vocab.txt")
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=vocab_path, do_lower_case=True)

print("Vocab size:", len(tokenizer.vocab))

def encode_sentence(s, tokenizer):
  """Tokenize one sentence and convert it to ids, terminated by '[SEP]'.

  Args:
    s: the sentence, passed unchanged to `tokenizer.tokenize`.
    tokenizer: object exposing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    The result of `tokenizer.convert_tokens_to_ids` for the sentence's
    tokens followed by the '[SEP]' token.
  """
  word_pieces = [*tokenizer.tokenize(s), '[SEP]']
  return tokenizer.convert_tokens_to_ids(word_pieces)

def bert_encode(glue_dict, tokenizer):
  """Encode sentence pairs into the three dense inputs built for the classifier.

  Each example is laid out as `[CLS] sentence1 [SEP] sentence2 [SEP]`; the
  trailing '[SEP]' of each sentence is appended by `encode_sentence`.

  Args:
    glue_dict: mapping with "sentence1" and "sentence2" entries, each a
      sequence of strings. NOTE(review): the row-wise concat presumably
      requires both entries to have the same number of rows - confirm.
    tokenizer: tokenizer passed through to `encode_sentence`; must also
      provide `convert_tokens_to_ids`.

  Returns:
    A dict of dense tensors (ragged rows converted with `.to_tensor()`):
      'input_word_ids': token ids of the concatenated pair.
      'input_mask': ones at every real token position.
      'input_type_ids': zeros for [CLS] and sentence1, ones for sentence2.
  """
  sentence1 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence1"])])
  sentence2 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence2"])])

  # One [CLS] id per example, shaped as a column so it can be concatenated
  # in front of every ragged row.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

  # The mask is built while the ids are still ragged, so only real token
  # positions receive a one before the rows are made dense.
  input_mask = tf.ones_like(input_word_ids).to_tensor()

  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)
  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs


# Build the model
# The first step is to download the configuration for the pre-trained model.
import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
# Read the config inside a context manager so the file handle is closed as
# soon as the JSON has been parsed.
with tf.io.gfile.GFile(bert_config_file) as config_file:
  config_dict = json.loads(config_file.read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

# Create the classifier together with the encoder it is built on.
# NOTE(review): num_labels=6 is hard-coded; presumably it has to equal the
# number of distinct intents in the training data - confirm.
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=6)

# Load the pre-trained weights into the encoder; assert_consumed() fails
# loudly if the checkpoint and the encoder do not match up completely.
checkpoint = tf.train.Checkpoint(model=bert_encoder)
checkpoint.restore(
    os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()

Vocab size: 30522


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5e78ffe898>

In [8]:
# Import the data and bring it into the required format
import random
import pandas as pd

df = pd.read_csv('drive/My Drive/Training Data/data.csv', usecols=['Utterance', 'Intent'], engine='python')
# Pair every utterance with its intent: one (utterance, intent) tuple per row.
pairs = list(zip(df["Utterance"].tolist(), df["Intent"].tolist()))
# Shuffle with a dedicated, seeded generator so that Restart & Run All yields
# the same row order every time without touching the global random state.
random.Random(1).shuffle(pairs)
# Converting into numpy array
from numpy import array
data = array(pairs)

In [9]:
# splitting data into k-folds
from sklearn.model_selection import KFold, StratifiedKFold
import operator

# Keyword arguments are used because recent scikit-learn releases no longer
# accept n_splits/shuffle/random_state positionally.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Map every intent to a class id once, from the whole data set. Building the
# map per fold from the training rows alone gave an intent with a single
# training row a colliding (or negative) id and raised KeyError for an intent
# that only occurred in the test split.
labels_map = {
    intent: class_id
    for class_id, intent in enumerate(
        sorted({intent for _, intent in data.tolist()}))}

# Set up epochs and batch size (identical for every fold)
epochs = 3
batch_size = 8

# Keras training history of each fold, in fold order.
fold_histories = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(data), start=1):
    print("Fold: "+str(fold))

    # Rows are [utterance, intent]; training rows are sorted intent wise.
    train_rows = sorted(data[train_idx].tolist(), key=operator.itemgetter(1))
    test_rows = data[test_idx].tolist()

    # To BERT format: single-sentence task, so sentence2 is always empty.
    training_data = {"train": {
        "label": [labels_map[intent] for _, intent in train_rows],
        "sentence1": [utterance for utterance, _ in train_rows],
        "sentence2": [""] * len(train_rows)}}
    testing_data = {"test": {
        "label": [labels_map[intent] for _, intent in test_rows],
        "sentence1": [utterance for utterance, _ in test_rows],
        "sentence2": [""] * len(test_rows)}}

    print("Training")
    # Getting training data along with labels
    train_data = bert_encode(training_data['train'], tokenizer)
    train_labels = training_data['train']['label']
    test_data = bert_encode(testing_data['test'], tokenizer)
    test_labels = testing_data['test']['label']

    # Start every fold from the pre-trained checkpoint. Re-using one model
    # across folds lets it train on rows that a later fold validates on, which
    # inflates the validation accuracy. After the loop `bert_classifier` is
    # the model trained on the last fold's training split.
    tf.keras.backend.clear_session()  # release state held by the previous fold's model
    bert_classifier, bert_encoder = bert.bert_models.classifier_model(
        bert_config, num_labels=len(labels_map))
    tf.train.Checkpoint(model=bert_encoder).restore(
        os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()

    train_data_size = len(train_labels)
    steps_per_epoch = train_data_size // batch_size
    num_train_steps = steps_per_epoch * epochs
    # Warm up the learning rate over the first 10% of the training steps.
    warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

    # creates an optimizer with learning rate schedule
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

    # Train the model; the classifier outputs logits, hence from_logits=True.
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    bert_classifier.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics)
    history = bert_classifier.fit(
        x=train_data, y=np.array(train_labels),
        validation_data=(test_data, np.array(test_labels)),
        batch_size=batch_size,
        epochs=epochs)
    fold_histories.append(history.history)
    

Fold: 1
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold: 2
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold: 3
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold: 4
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold: 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
# Save the trained classifier to a single .h5 file.
# NOTE(review): the 'drive/My Drive/...' path presumably points at a mounted
# Google Drive folder - confirm it is mounted before running this cell.
bert_classifier.save('drive/My Drive/TrainedModel/bert_classifier.h5') 

In [14]:
# Download the saved model
# from google.colab import files
# files.download("/content/bert_classifier.h5")


In [15]:
# Load the classifier back from the path the save cell writes to; no active
# save call in this notebook writes a bare 'bert_classifier.h5' in the working directory.
# compile=False skips restoring the optimizer/loss, which are not needed
# to run predictions with the loaded model.
# NOTE(review): if loading still raises ValueError, the file likely contains
# custom objects that must be supplied via `custom_objects` - confirm.
new_model = tf.keras.models.load_model(
    'drive/My Drive/TrainedModel/bert_classifier.h5', compile=False)

ValueError: ignored

In [19]:
# Scratch cell: imports TensorFlow and binds the `load_model` function
# itself (it is not called here) to the name `a`.
import tensorflow as tf
a = tf.keras.models.load_model

In [26]:
from official.nlp.bert.bert_models import 