In [1]:
from transformers import DistilBertTokenizerFast

In [2]:
from ast import literal_eval

In [3]:
import pandas as pd

In [4]:
import json
import pickle as pkl
import requests
import urllib.request
import tensorflow as tf

In [19]:
# Create the tokenizer that matches the pre-trained DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Upstream location of the training set.
# NOTE(review): nothing in this cell reads `url`, and the link is a GitHub
# "blob" (HTML) page rather than a raw-file URL. The pickle is read from a
# local copy named train.pkl, which must already exist in the working directory.
url = 'https://github.com/Robert-MacWha/Project-Aurras/blob/27e95578b610699ece51fc8d2f183c03aa09ca15/dataset/train.pkl'

# SECURITY: unpickling can execute arbitrary code - only load a train.pkl that
# comes from a source you trust.
with open("train.pkl", "rb") as f:
    # Named `train_records` (not `object`) so the builtin `object` is not shadowed.
    train_records = pkl.load(f)

# Write the data out as CSV and read it back in as the training frame.
# NOTE(review): read_csv does not restore Python objects, so any list-valued
# column comes back as its string form - confirm this round-trip is intended.
df = pd.DataFrame(train_records)
df.to_csv(r'train.csv')

df_train = pd.read_csv('train.csv')

# Download the intent label mapping (JSON) and count the intents.
urljson = 'https://raw.githubusercontent.com/Robert-MacWha/Project-Aurras/main/dataset/intent_labels.json'
resp = requests.get(urljson, timeout=30)   # do not hang forever on a stalled connection
resp.raise_for_status()                    # fail loudly instead of parsing an HTTP error page as JSON
intent_labels = json.loads(resp.text)
intent_count = len(intent_labels)

In [6]:
# Use the tokenizer to convert the training text into fixed-length numeric
# input IDs plus the matching attention mask.
# NOTE(review): this tokenizes the 'word_entities' column, whose sample value
# printed later in this notebook is a bracketed list of numbers rather than a
# sentence. Confirm whether the prompt-text column was intended here.

inputs = tokenizer(
    list(df_train['word_entities']),      # specify the string[] to tokenize
    max_length=128,               # fixed sequence length fed to the model
    padding='max_length',         # pad shorter rows up to max_length
    truncation=True,              # cut longer rows down to max_length so every row has 128 entries
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='np'           # flag to return numpy array
)
x_train_ids = inputs['input_ids']
x_train_attention = inputs['attention_mask']

In [7]:
# Show the tokenizer output for the training set: first the token-ID matrix,
# then the attention mask.
# In the ID matrix the leading entries are token IDs and the trailing zeros are
# padding; in the mask, ones mark real tokens and zeroes mark padded positions.
for encoded in (x_train_ids, x_train_attention):
    print(encoded)

[[ 101 1031 2321 ...    0    0    0]
 [ 101 1031 2321 ...    0    0    0]
 [ 101 1031 2321 ...    0    0    0]
 ...
 [ 101 1031 2321 ...    0    0    0]
 [ 101 1031 2321 ...    0    0    0]
 [ 101 1031 2321 ...    0    0    0]]
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]


In [8]:
# Turn the 'prompt_intent' label column into one-hot rows with one slot per
# known intent, using TensorFlow's built-in helper.
intent_ids = df_train['prompt_intent'].values
y_train_intents = tf.one_hot(intent_ids, depth=intent_count)

In [9]:
# Show the first training example as four lines: the raw 'word_entities'
# value, the first 12 token IDs, the first 12 attention-mask entries and the
# one-hot intent label.
print(
    f'Prompt:         {df_train["word_entities"][0]}',
    f'Token IDs:      {x_train_ids[0][:12]}...',
    f'Attention mask: {x_train_attention[0][:12]}...',
    f'One-hot Label:  {y_train_intents[0]}',
    sep='\n',
)

Prompt:         [15, 0, 0, 0, 18, 0, 0, 14, 0]
Token IDs:      [ 101 1031 2321 1010 1014 1010 1014 1010 1014 1010 2324 1010]...
Attention mask: [1 1 1 1 1 1 1 1 1 1 1 1]...
One-hot Label:  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [10]:
from transformers import TFDistilBertModel

In [11]:
# Define the two int32 Keras inputs: token IDs and attention mask.
# Both use shape (128,), matching the max_length used when tokenizing.
input_ids_layer, input_attention_layer = (
    tf.keras.layers.Input(shape=(128,), name=layer_name, dtype='int32')
    for layer_name in ('input_ids', 'input_attention')
)

In [12]:
# Load the pre-trained DistilBERT encoder.
transfomer = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Feed both input layers through the encoder and keep the first element of its
# output, the last hidden state. The model summary reports its shape as
# (None, 128, 768).
transformer_outputs = transfomer([input_ids_layer, input_attention_layer])
last_hidden_state = transformer_outputs[0]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [13]:
# Take the vector at sequence position 0 (the CLS token) of the last hidden
# state; it is used as the condensed representation of the whole sequence.
cls_token = last_hidden_state[:, 0, :]

weight_initializer = tf.keras.initializers.GlorotUniform()

# Classification head: a softmax Dense layer with one unit per intent.
intent_head = tf.keras.layers.Dense(
    units=intent_count,
    activation='softmax',
    name='intent_output',
    kernel_initializer=weight_initializer,
    bias_initializer='zeros',
    kernel_constraint=None,
)
intent_output = intent_head(cls_token)

In [14]:
# Assemble the end-to-end classifier: (token IDs, attention mask) -> intent
# probabilities.
model = tf.keras.Model(
        inputs=[input_ids_layer, input_attention_layer],
        outputs=[intent_output])

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() - that would emit a stray "None" line after the table.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 input_attention (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 BertModel)                     ast_hidden_state=(N               'input_attention[0][0]']        
                                one, 128, 768),                                                   
                                 hidden_states=None                                           

In [16]:
# Training configuration: Adam at learning rate 5e-5, categorical
# cross-entropy loss, and categorical accuracy as the reported metric.
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
cross_entropy = tf.keras.losses.CategoricalCrossentropy()
accuracy = tf.keras.metrics.CategoricalAccuracy(name='categorical_accuracy')

model.compile(optimizer=adam, loss=cross_entropy, metrics=[accuracy])

In [17]:
# Train the model for 2 epochs in batches of 16, keeping the returned history.
train_inputs = [x_train_ids, x_train_attention]
train_targets = [y_train_intents]

history = model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=16,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


In [None]:
# Interactive demo: read sentences from the user and print the predicted intent.
# The loop ends on a blank line, on end-of-input, or when the kernel is interrupted.
# NOTE(review): the model above is trained on the 'word_entities' column while
# this loop feeds it raw sentences - confirm the training input column, since a
# mismatch would explain implausible predictions.
while True:
    # Get user input; treat EOF / interrupt as a request to stop instead of crashing.
    try:
        user_input = input("Please enter a sentence: ").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input:
        break

    # Convert user input to token IDs and attention mask
    user_input_tokens = tokenizer(
        user_input,
        max_length=128,
        padding='max_length',
        truncation=True,          # keep long sentences within the model's fixed 128-token input
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf'
    )
    user_input_ids = user_input_tokens['input_ids']
    user_input_attention = user_input_tokens['attention_mask']

    # Make prediction (verbose=0 keeps progress bars out of the conversation)
    prediction = model.predict([user_input_ids, user_input_attention], verbose=0)
    # intent_labels is keyed by the class index as a string
    predicted_index = int(tf.argmax(prediction, axis=1).numpy()[0])
    predicted_label = intent_labels[str(predicted_index)]

    # Print result
    print(f"Predicted intent: {predicted_label}\n")

Please enter a sentence: what is the time
Predicted intent: stop

Please enter a sentence: what is the weather
Predicted intent: rewind_music

Please enter a sentence: what's two times four
Predicted intent: fast_forward_music

Please enter a sentence: pause
Predicted intent: stop

Please enter a sentence: play
Predicted intent: fast_forward_music

Please enter a sentence: skip
Predicted intent: fast_forward_music

Please enter a sentence: volume down
Predicted intent: stop

