### Read training, dev and unlabeled test data

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


The following provides a starting code (Python 3) of how to read the labeled training and dev sentence pairs, and unlabeled test sentence pairs, into lists.

In [2]:
import csv

In [3]:
train, dev, test = [], [], []

In [4]:
with open('/content/gdrive/My Drive/HW4_upload/data/pnli_train.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        train.append(x)
print (len(train))
print (train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [5]:
with open('/content/gdrive/My Drive/HW4_upload/data/pnli_dev.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        dev.append(x)
print (len(dev))
print (dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [6]:
with open('/content/gdrive/My Drive/HW4_upload/data/pnli_test_unlabeled.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[0] and x[1] will be the sentence pairs.
        test.append(x)
print (len(test))
print (test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [None]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []
!pip install transformers

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Reshape
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np

In [9]:
from keras.callbacks import EarlyStopping
from keras.models import load_model #, Sequential
from keras.layers import Dense, Dropout

In [10]:
SEQ_LEN = 512

In [11]:
#Assemble training data
x_train = []
y_train = []

for x in train:
  new_x = '<s>' + x[0] + '</s></s>' + x[1] + '</s>'
  x_train.append(new_x)
  y_train.append(x[2])

y_train = [int(y) for y in y_train]
#y_train = tf.cast(y_train, dtype=tf.int32)

In [12]:
#Assemble development data
x_dev = []
y_dev = []

for x in dev:
  new_x = '<s>' + x[0] + '</s></s>' + x[1] + '</s>'
  x_dev.append(new_x)
  y_dev.append(x[2])

y_dev = [int(y) for y in y_dev]
#y_dev = tf.cast(y_dev, dtype=tf.int32)

In [13]:
#Assemble test data
x_test = []

for x in test:
  new_x = '<s>' + x[0] + '</s></s>' + x[1] + '</s>'
  x_test.append(new_x)

In [14]:
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

In [15]:
x_train_ids = np.zeros((len(train), SEQ_LEN))
x_train_mask = np.zeros((len(train), SEQ_LEN))

x_dev_ids = np.zeros((len(dev), SEQ_LEN))
x_dev_mask = np.zeros((len(dev), SEQ_LEN))

x_test_ids = np.zeros((len(test), SEQ_LEN))
x_test_mask = np.zeros((len(test), SEQ_LEN))

for i, sequence in enumerate(x_train):
  tokens = tokenizer.encode_plus(sequence, max_length=SEQ_LEN, truncation=True, padding='max_length',
                                 add_special_tokens=True, return_token_type_ids=False,
                                 return_attention_mask=True, return_tensors='tf')
  x_train_ids[i, :], x_train_mask[i, :] = tokens['input_ids'], tokens['attention_mask']


for i, sequence in enumerate(x_dev):
  tokens = tokenizer.encode_plus(sequence, max_length=SEQ_LEN, truncation=True, padding='max_length',
                                 add_special_tokens=True, return_token_type_ids=False,
                                 return_attention_mask=True, return_tensors='tf')
  x_dev_ids[i, :] = tokens['input_ids']
  x_dev_mask[i, :] = tokens['attention_mask']
  #x_dev_ids.append(tokens['input_ids'])
  #x_dev_mask.append(tokens['attention_mask'])


for i, sequence in enumerate(x_test):
  tokens = tokenizer.encode_plus(sequence, max_length=SEQ_LEN, truncation=True, padding='max_length',
                                 add_special_tokens=True, return_token_type_ids=False,
                                 return_attention_mask=True, return_tensors='tf')
  x_test_ids[i, :] = tokens['input_ids']
  x_test_mask[i, :] = tokens['attention_mask']
  #x_test_ids.append(tokens['input_ids'])
  #x_test_mask.append(tokens['attention_mask'])


In [40]:
def map_func(input_ids, masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': masks
    }, labels
  
def test_map_func(input_ids, masks):
  return {
      'input_ids': input_ids,
      'attention_mask': masks
  }

In [41]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_ids, x_train_mask, y_train))
dev_dataset = tf.data.Dataset.from_tensor_slices((x_dev_ids, x_dev_mask, y_dev))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test_ids, x_test_mask))

In [42]:
train_dataset = train_dataset.map(map_func)
dev_dataset = dev_dataset.map(map_func)
test_dataset = test_dataset.map(test_map_func)

In [43]:
#train_dataset = train_dataset.shuffle(100000).batch(32, drop_remainder=True)
train_dataset = train_dataset.batch(32, drop_remainder=False)
#dev_dataset = dev_dataset.shuffle(100000).batch(32, drop_remainder=True)
dev_dataset = dev_dataset.batch(32, drop_remainder=False)
test_dataset = test_dataset.batch(32, drop_remainder=False)

TRAIN_DS_LEN = len(list(train_dataset))
print('train ds is', TRAIN_DS_LEN)
DEV_DS_LEN = len(list(dev_dataset))
print('dev ds is', DEV_DS_LEN)
TEST_DS_LEN = len(list(test_dataset))
print('test ds is', TEST_DS_LEN)

train ds is 187
dev ds is 33
test ds is 152


In [None]:
roberta = TFAutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")

In [21]:
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

embeddings = roberta(input_ids, attention_mask=mask)[0]

In [None]:
x = tf.keras.layers.BatchNormalization()(embeddings)

x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(32, activation='relu')(x)

y = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.layers[2].trainable = False

model.summary()

In [23]:
optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.BinaryCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

callback = EarlyStopping(monitor='loss', patience=2, mode='min')

In [36]:
#Train on training data
history = model.fit(
    train_dataset,
    epochs=10,
    callbacks=callback,
    validation_data=(dev_dataset)
)

Epoch 1/10
Epoch 2/10
  1/187 [..............................] - ETA: 1:14:15 - loss: 0.4974 - accuracy: 1.0000

KeyboardInterrupt: ignored

In [None]:
#Save Model
my_model_path = '/content/gdrive/My Drive/HW4_upload/saved_models/batch_32'
model.save(my_model_path)

In [None]:
#Load Model
#model = tf.keras.models.load_model('/content/gdrive/My Drive/HW4_upload/saved_models/batch_32')

In [38]:
#Evaluate on development data
loss, accuracy = model.evaluate(dev_dataset)
print('loss: ', loss)
print('accuracy: ', accuracy)

loss:  0.5341851711273193
accuracy:  1.0


In [44]:
#Predict on test data
y_hat = model.predict(test_dataset)
y_hat_modified = [0 if val <0.5 else 1 for val in y_hat]
results = y_hat_modified

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [46]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 4850)

In [47]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [49]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('/content/gdrive/My Drive/HW4_upload/upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')