In [None]:
import pandas as pd

In [None]:
# Read the train and test splits from the Colab working directory.
train, test = map(pd.read_csv, ("/content/train.csv", "/content/test.csv"))

train

Unnamed: 0,hypotheis,premise,choice
0,Receiving Party shall not reverse engineer any...,2 3 1,Neutral
1,Receiving Party shall not reverse engineer any...,WHEREAS in connection with RFP 2014 620 Reques...,Neutral
2,Receiving Party shall not reverse engineer any...,4 Nothing in this Agreement is to be construed...,Neutral
3,Receiving Party shall not reverse engineer any...,5 All Confidential Information in any form and...,Neutral
4,Receiving Party shall not reverse engineer any...,8 This Agreement shall enter into force on the...,Neutral
...,...,...,...
76112,Receiving Party shall not use any Confidential...,Each Party acknowledges that money damages wou...,Neutral
76113,Receiving Party shall not use any Confidential...,Each Party in its capacity as a provider of in...,Neutral
76114,Receiving Party shall not use any Confidential...,Accordingly each Party will also be entitled t...,Neutral
76115,Receiving Party shall not use any Confidential...,12 Confidential Information,Neutral


In [None]:
# Number of non-null entries in each column of the training frame.
train.count()

hypotheis    76117
premise      76117
choice       76117
dtype: int64

In [None]:
def change_label(x):
  """Encode a label string as an integer class id.

  Returns 0 for "Entailment", 1 for "Contradiction", and 2 for any other
  value.
  """
  if x == "Entailment":
    return 0
  return 1 if x == "Contradiction" else 2

# Replace the string labels in "choice" with integer class ids via change_label.
# The dtype guard makes this cell safe to re-run: feeding already-encoded
# integers back through change_label would map every label to 2.
if not pd.api.types.is_integer_dtype(train["choice"]):
  train["choice"] = train["choice"].apply(change_label)
if not pd.api.types.is_integer_dtype(test["choice"]):
  test["choice"] = test["choice"].apply(change_label)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

## Create a vocabulary index

In [None]:
import os

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Build the vocabulary (at most 20000 tokens) from both text columns; every
# vectorized sequence is padded or truncated to 200 token ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# Join the two columns with a space. Plain concatenation glues the last word of
# the hypothesis to the first word of the premise, adding fused junk tokens to
# the vocabulary instead of the two real words.
text_ds = tf.data.Dataset.from_tensor_slices(
    train["hypotheis"] + " " + train["premise"]
).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first five vocabulary entries; as the output shows, index 0 is the
# empty padding token and index 1 is the out-of-vocabulary token "[UNK]".
vectorizer.get_vocabulary()[:5]


['', '[UNK]', 'the', 'party', 'of']

In [None]:
# Vectorize one sample hypothesis to inspect the result: the word indices are
# followed by zeros padding the sequence out to output_sequence_length (200).
output = vectorizer([["Receiving Party shall not reverse engineer any objects which embody Disclosing Party s Confidential Information"]])
output.numpy()

array([[  9,   3,  10,  17,  99, 100,  12, 101,  37, 103,  20,   3,  25,
          6,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [None]:
# Map every vocabulary word to its integer index in the vectorizer.
voc = vectorizer.get_vocabulary()
word_index = {word: idx for idx, word in enumerate(voc)}

In [None]:
# Sanity check: look up a few words directly in the word -> index mapping.
# A dedicated name is used on purpose: binding this list to `test` would
# overwrite the test DataFrame that the training cells still index by column.
sample_words = ["Receiving", "Party", "shall", "not", "reverse", "engineer"]
[word_index[w.lower()] for w in sample_words]

[9, 3, 10, 17, 99, 100]

## Load pre-trained word embeddings

#### 1. Download the zip file
#### 2. Unzip it

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2023-01-18 17:05:27--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-01-18 17:05:27--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-01-18 17:05:27--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

#### 3. Get the exact path where the embedding vectors were extracted, using `!ls` and `!pwd`
#### 4. Index the vectors

In [None]:
# List the working directory and print its path to locate the extracted GloVe files.
!ls
!pwd

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip  test.csv
glove.6B.200d.txt  glove.6B.50d.txt   sample_data   train.csv
/content


In [None]:
# Location of the 100-dimensional GloVe vectors. Written as a plain absolute
# path: joining it onto the home directory had no effect, because os.path.join
# discards all earlier components once a later component is absolute.
path_to_glove_file = "/content/glove.6B.100d.txt"

# Maps each word to its embedding vector (a float32 NumPy array).
embeddings_index = {}
# Read the file as UTF-8 explicitly so parsing does not depend on the
# platform's default locale encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <coef_1> <coef_2> ..."; split off the word only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


### Next, let's prepare an embedding matrix in which row `i` holds the pre-trained vector for the word with index `i` in our vocabulary. We then load this pre-trained matrix into a Keras `Embedding` layer.

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Row i of the matrix holds the pre-trained vector for the vocabulary word with
# index i. Rows start out as zeros and stay that way for every word that has no
# pre-trained vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: leave its row all-zeros.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 8626 words (7198 misses)


In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from the pre-trained matrix built above.
# trainable=False keeps the pre-trained vectors fixed during training.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

## Build the model

In [None]:
from tensorflow.keras import layers

# 1D convolutional text classifier over sequences of integer token ids.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
# Look up the (frozen) pre-trained vector for every token id.
embedded_sequences = embedding_layer(int_sequences_input)
# Three Conv1D layers (128 filters, kernel size 5); the first two are each
# followed by max pooling with pool size 5.
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
# Collapse the remaining time dimension into one 128-dimensional vector.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# Softmax over 3 outputs, one per integer label id (0, 1, 2) produced by change_label.
preds = layers.Dense(3, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1582600   
                                                                 
 conv1d_3 (Conv1D)           (None, None, 128)         64128     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 conv1d_4 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, None, 128)        0         
 1D)                                                         

## Train the model

In [None]:
# Each example is a (hypothesis, premise) pair and the label describes the
# relation between the two, so the model input must contain both texts: the
# same hypothesis appears in many rows paired with different premises. The
# texts are joined with a space so the boundary words stay separate tokens.
# NOTE(review): assumes test.csv has the same "hypotheis"/"premise" columns as
# train.csv - confirm.
train_text = train["hypotheis"] + " " + train["premise"]
val_text = test["hypotheis"] + " " + test["premise"]

# One string per row (shape (n, 1)) -> padded integer token ids.
x_train = vectorizer(np.array([[s] for s in train_text])).numpy()
x_val = vectorizer(np.array([[s] for s in val_text])).numpy()

# Integer class ids produced by change_label.
y_train = np.array(train["choice"])
y_val = np.array(test["choice"])

In [None]:
# Reference on categorical focal loss in Keras (not used below; presumably kept
# as a pointer to an alternative loss - confirm):
# https://stackoverflow.com/questions/56293964/categorical-focal-loss-on-keras
# The "sparse" cross-entropy variant is used because y_train / y_val hold
# integer class ids rather than one-hot vectors.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)
# Train for 20 epochs, evaluating on the test split after each epoch.
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4084b93f10>

## Next steps: https://keras.io/examples/nlp/pretrained_word_embeddings/