<a href="https://colab.research.google.com/github/sergiumr/intelligent-systems/blob/master/ModelA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**## Model A**

**Setup**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

In [None]:
import os
import json
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers

from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Flatten, Dropout, MaxPooling1D, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/SI/ambiguous_knights_knaves.json'

physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)

AssertionError: ignored

In [None]:
def read_json_file(path):
  with open(path, 'r') as json_file:
    result = json.load(json_file)

  return result['puzzles']

**Load the data**

In [None]:

df = pd.DataFrame(read_json_file(DATA_PATH))[['pid', 'puzzle_text', 'QA']]

starting_pid = 0
data_frame = pd.DataFrame(eval(json.dumps(df['QA'][starting_pid])))
data_frame['pid'] = df['pid'][starting_pid]
data_frame['puzzle_text'] = df['puzzle_text'][starting_pid]

for pid in range(1, len(df)):
  qa = pd.DataFrame(eval(json.dumps(df['QA'][pid])))
  qa['pid'] = df['pid'][pid]
  qa['puzzle_text'] = df['puzzle_text'][pid]
  data_frame = data_frame.append(qa)

print(data_frame)

    qid               question                    answer  pid  \
0     1    Is Rex the knight ?  NOT ENTAILMENT - Unknown    1   
1     2     Is Rex the knave ?  NOT ENTAILMENT - Unknown    1   
2     3    Is Bob the knight ?  NOT ENTAILMENT - Unknown    1   
3     4     Is Bob the knave ?  NOT ENTAILMENT - Unknown    1   
4     5  Is Alice the knight ?  NOT ENTAILMENT - Unknown    1   
..  ...                    ...                       ...  ...   
3     4     Is Bob the knave ?  NOT ENTAILMENT - Unknown  150   
4     5   Is Dave the knight ?  NOT ENTAILMENT - Unknown  150   
5     6    Is Dave the knave ?  NOT ENTAILMENT - Unknown  150   
6     7  Is Sally the knight ?  NOT ENTAILMENT - Unknown  150   
7     8   Is Sally the knave ?  NOT ENTAILMENT - Unknown  150   

                                          puzzle_text  
0   On the island where each inhabitant is either ...  
1   On the island where each inhabitant is either ...  
2   On the island where each inhabitant is either .

**Configuration**

In [None]:
print("Unique labels: ")
print(data_frame['answer'].unique())

Unique labels: 
['NOT ENTAILMENT - Unknown' 'Entailment' 'NOT ENTAILMENT - Contradiction']


In [None]:
print("Number of missing values")
print(data_frame.isnull().sum())

Number of missing values
qid            0
question       0
answer         0
pid            0
puzzle_text    0
dtype: int64


In [None]:
print("Data values distribution")
print(data_frame['answer'].value_counts())

Data values distribution
NOT ENTAILMENT - Unknown          730
Entailment                        105
NOT ENTAILMENT - Contradiction    105
Name: answer, dtype: int64


In [None]:
print(data_frame.columns)

train_df = data_frame.sample(frac = 0.6)
print(train_df)

valid_df = data_frame.sample(frac = 0.25)
print(valid_df)

test_df = data_frame.sample(frac = 0.15)
print(test_df)

Index(['qid', 'question', 'answer', 'pid', 'puzzle_text'], dtype='object')
    qid               question                    answer  pid  \
5     6     Is Joe the knave ?  NOT ENTAILMENT - Unknown  120   
4     5  Is Alice the knight ?  NOT ENTAILMENT - Unknown    1   
1     2   Is Peggy the knave ?  NOT ENTAILMENT - Unknown  145   
0     1  Is Peggy the knight ?  NOT ENTAILMENT - Unknown  106   
2     3    Is Mel the knight ?  NOT ENTAILMENT - Unknown   81   
..  ...                    ...                       ...  ...   
3     4     Is Sue the knave ?  NOT ENTAILMENT - Unknown  142   
1     2   Is Alice the knave ?  NOT ENTAILMENT - Unknown  126   
2     3    Is Ted the knight ?  NOT ENTAILMENT - Unknown   16   
3     4    Is Dave the knave ?  NOT ENTAILMENT - Unknown   57   
2     3    Is Mel the knight ?  NOT ENTAILMENT - Unknown   69   

                                          puzzle_text  
5   On the island where each inhabitant is either ...  
4   On the island where each inh

**Distribution of our training targets.**

In [None]:
print("Train Target Distribution")
print(train_df.answer.value_counts())

Train Target Distribution
NOT ENTAILMENT - Unknown          431
Entailment                         69
NOT ENTAILMENT - Contradiction     64
Name: answer, dtype: int64


In [None]:
print("Validation Target Distribution")
print(valid_df.answer.value_counts())

Validation Target Distribution
NOT ENTAILMENT - Unknown          183
NOT ENTAILMENT - Contradiction     30
Entailment                         22
Name: answer, dtype: int64


In [None]:
print("Test Target Distribution")
print(test_df.answer.value_counts())

Test Target Distribution
NOT ENTAILMENT - Unknown          110
Entailment                         20
NOT ENTAILMENT - Contradiction     11
Name: answer, dtype: int64


**Configuration**

In [None]:
max_length = len(data_frame)  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 2

# Labels in our dataset.
labels = ["NOT ENAILMENT - Unknown", "NOT ENAILMENT - Contradiction", "Entailment"]

NameError: ignored

**One-hot encoding**

In [None]:
train_df["label"] = train_df["answer"].apply(
    lambda x: 0 if x == "NOT ENAILMENT - Unknown" else 1 if x == "NOT ENAILMENT - Contradiction" else 2
)
y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=3)

valid_df["label"] = valid_df["answer"].apply(
    lambda x: 0 if x == "NOT ENAILMENT - Unknown" else 1 if x == "NOT ENAILMENT - Contradiction" else 2
)
y_val = tf.keras.utils.to_categorical(valid_df.label, num_classes=3)

test_df["label"] = test_df["answer"].apply(
    lambda x: 0 if x == "NOT ENAILMENT - Unknown" else 1 if x == "NOT ENAILMENT - Contradiction" else 2
)
y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=3)

**Keras Custom Data Generator**

In [None]:

class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        #Shuffling the order in which examples are fed to the classifier 
        #is helpful so that batches between epochs do not look alike. Doing so will eventually make our model more robust.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)


**Let's build our model**

In [None]:
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(3, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )


print(f"Strategy: {strategy}")
model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7fb0f498a810>
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 940)]        0           []                               
                         

In [None]:
train_data = BertSemanticDataGenerator(
    train_df[["puzzle_text", "answer"]].values.astype("str"),
    y_train,
    batch_size=batch_size,
    shuffle=True,
)
valid_data = BertSemanticDataGenerator(
    valid_df[["puzzle_text", "answer"]].values.astype("str"),
    y_val,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
model = Sequential()
model.add(Embedding(len(data_frame), len(data_frame)))
model.add(LSTM(100))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Change the number of epochs and the batch size depending on the RAM Size

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,verbose = 1,validation_data=(X_cv,y_cv))

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, None, 940)         883600    
                                                                 
 lstm_24 (LSTM)              (None, 100)               416400    
                                                                 
 dense_24 (Dense)            (None, 1)                 101       
                                                                 
Total params: 1,300,101
Trainable params: 1,300,101
Non-trainable params: 0
_________________________________________________________________
None


ValueError: ignored