# Template code for Homework 6

As you work through each subsection, you may notice that some lines of code have been intentionally left incomplete. Fill out the necessary code to progress through the homework.

In [None]:
# We'll start with our library imports...
from __future__ import print_function

import numpy as np                 # to use numpy arrays
import tensorflow as tf            # to specify and run computation graphs
import tensorflow_datasets as tfds # to load training data
import matplotlib.pyplot as plt    # to visualize data and draw plots
from tqdm import tqdm              # to track progress of loops
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset
import seaborn as sns

DATA_DIR = './tensorflow-datasets/'

In [None]:
# Download the model and tokenizer from an available task-specific checkpoint.
# Judging by its name, the checkpoint is an uncased DistilBERT already fine-tuned on MNLI.
model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"

# TODO(student): complete the two assignments below (intentionally left incomplete; the hint follows the '#').
tokenizer = # AutoTokenizer.from_pretrained(model_name)
model = # TFAutoModelForSequenceClassification.from_pretrained(model_name)

### Prepare train, validation, and test datasets

At the end of this subsection, you will have 
- x_train, y_train
- x_valid, y_valid
- x_test, y_test

Run the cells of this subsection as-is for the homework

In [None]:
# Fetch the SNLI (Stanford Natural Language Inference) dataset from HuggingFace,
# then shuffle it with a fixed seed so that the subsets taken later are reproducible.
snli = load_dataset("stanfordnlp/snli")
snli = snli.shuffle(seed=42)

In [None]:
snli.keys() # The dataset is downloaded as a dict

In [None]:
# We will use a subset to avoid longer training time. Feel free to change the number of samples.
# Each subset is drawn from its own split: taking the test samples from "train" would make them
# identical to the first training samples, so evaluation would be done on data seen during training.
snli_train = snli["train"].take(1000)
snli_valid = snli["validation"].take(50)
snli_test = snli["test"].take(10)

In [None]:
# Build the (premise, hypothesis) pair that is later tokenized into the model's input features.
def preprocess_fn(sample):
    """Add a lower-cased ``(premise, hypothesis)`` tuple to *sample* as ``"sentence_pair"``.

    The mapping is updated in place and the same object is returned.
    """
    premise = sample["premise"].lower()
    hypothesis = sample["hypothesis"].lower()
    sample["sentence_pair"] = (premise, hypothesis)
    return sample

# Apply preprocess_fn to each split. The original "premise" and "hypothesis" columns are
# removed afterwards, since their content now lives in the combined "sentence_pair" column.
snli_train = snli_train.map(preprocess_fn, remove_columns=["premise", "hypothesis"])
snli_valid = snli_valid.map(preprocess_fn, remove_columns=["premise", "hypothesis"])
snli_test = snli_test.map(preprocess_fn, remove_columns=["premise", "hypothesis"])

In [None]:
# We will use Pandas DataFrames to make the data processing easier
# (row filtering on "label" and conversion of columns to Python lists in the cells below).
snli_train_df = pd.DataFrame(snli_train)
snli_valid_df = pd.DataFrame(snli_valid)
snli_test_df = pd.DataFrame(snli_test)

In [None]:
# Keep only rows with a usable label: rows whose label is -1 are dropped
# (presumably examples without a gold label -- confirm against the SNLI dataset card).
snli_train_df = snli_train_df.loc[snli_train_df["label"].ne(-1)]
snli_valid_df = snli_valid_df.loc[snli_valid_df["label"].ne(-1)]
snli_test_df = snli_test_df.loc[snli_test_df["label"].ne(-1)]

In [None]:
snli_train_df.head(2)

In [None]:
# Pass the sentence pairs through the tokenizer, one call per split.
# Padding is applied within each call, so the padded sequence length may differ between splits.
# `.data` is taken from the tokenizer output (presumably the plain dict of TF tensors -- confirm);
# the x_* values are later passed to model.fit() and unpacked with ** when calling the model.
x_train = tokenizer(snli_train_df["sentence_pair"].to_list(), padding=True, return_tensors="tf").data
x_valid = tokenizer(snli_valid_df["sentence_pair"].to_list(), padding=True, return_tensors="tf").data
x_test = tokenizer(snli_test_df["sentence_pair"].to_list(), padding=True, return_tensors="tf").data

In [None]:
# Processing labels: build one tensor per split from the "label" column of the
# filtered DataFrames, so the labels line up with the tokenized rows in x_*.
y_train = tf.constant(snli_train_df["label"].to_list())
y_valid = tf.constant(snli_valid_df["label"].to_list())
y_test = tf.constant(snli_test_df["label"].to_list())

### Visualize attention by running a test sample through the model before fine-tuning

- Some lines of code have been intentionally left incomplete. Fill out the necessary code to progress through Homework 6

In [None]:
# Pass the test data through the model before it is fine-tuned in this notebook.
# NOTE: the model is loaded from a pre-trained checkpoint, so it is not strictly "untrained".
# TODO(student): complete the assignment below (intentionally left incomplete; the hint follows the '#').
outputs_before_training = # model(**x_test, return_dict=True, output_attentions=True, output_hidden_states=True)
outputs_before_training.keys()

In [None]:
# Use logits for generating predictions: turn them into class probabilities,
# then pick the most probable class index per sample.
# TODO(student): complete the two assignments below (intentionally left incomplete).
Y_probas_before_training = # tf.keras.activations.softmax(outputs_before_training.logits)
Y_pred_before_training = # tf.argmax(Y_probas_before_training, axis=1)

# Print predictions next to the reference labels for a quick visual comparison.
print(Y_pred_before_training)
print(y_test)

In [None]:
len(outputs_before_training["attentions"]) # layers

In [None]:
outputs_before_training["attentions"][5].shape # batch, heads, seq, seq

In [None]:
# Plot every attention head of one encoder layer for one input sample.
# By default: first sample, all attention heads of layer 0.

def visualize_attentions(attentions, layer=0, sample=0, input_ids=None):
    """Draw a grid of heatmaps, one per attention head, for one layer and one sample.

    Args:
        attentions: Per-layer attention tensors, e.g. ``outputs["attentions"]``
            from a model call with ``output_attentions=True``. Each entry is
            indexed as (batch, heads, seq, seq).
        layer: Index of the layer to visualize.
        sample: Index of the sample within the batch.
        input_ids: Token ids of the batch that produced ``attentions``; used to
            label both axes. Defaults to the notebook-level ``x_test["input_ids"]``.

    Relies on the notebook-level ``tokenizer`` to convert ids to token strings.
    """
    if input_ids is None:
        input_ids = x_test["input_ids"]

    # Use the tensors passed in by the caller rather than a fixed global, so the
    # before- and after-training outputs each show their own attention maps.
    head_maps = attentions[layer][sample]  # (heads, seq, seq)
    num_heads = head_maps.shape[0]

    # NOTE: rcParams are global matplotlib settings and persist for later figures.
    plt.rcParams["figure.figsize"] = [12, 12]
    plt.rcParams["figure.autolayout"] = True

    # Size the grid from the actual head count instead of assuming 12 heads.
    ncols = 3
    nrows = (num_heads + ncols - 1) // ncols  # ceiling division; 12 heads -> 4x3
    fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, squeeze=False)

    tick_labels = tokenizer.convert_ids_to_tokens(input_ids[sample])

    # ax.flat walks the grid row by row, so head i lands at (i // ncols, i % ncols).
    for head_idx, axis in enumerate(ax.flat):
        if head_idx >= num_heads:
            # Hide unused cells when the head count is not a multiple of ncols.
            axis.axis("off")
            continue

        sns.heatmap(
            head_maps[head_idx],
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            cmap="plasma",
            ax=axis,
            cbar=(head_idx % ncols == ncols - 1),  # colour bar only on the last column
        )

    fig.subplots_adjust(wspace=0.001)
    fig.tight_layout()
    plt.show()

In [None]:
visualize_attentions(attentions=outputs_before_training["attentions"], layer=0, sample=0)

### Train the model, then evaluate

- Some lines of code have been intentionally left incomplete. Fill out the necessary code to progress through Homework 6

In [None]:
# Compile model
# TODO(student): complete the loss assignment below (intentionally left incomplete; the hint follows the '#').
loss = #tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer="nadam", metrics=["accuracy"])
# Early stopping halts training once the monitored quantity has stopped improving for
# `patience` epochs (Keras monitors "val_loss" unless told otherwise).
# Feel free to experiment with different values of patience.
callback = tf.keras.callbacks.EarlyStopping(patience=3)

In [None]:
# Fine-tune on the training subset, validating on the validation subset after each epoch.
# Feel free to experiment with a different number of epochs.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=20,
    callbacks=[callback],
    verbose=1,
)

### Evaluate and visualize attention after finetuning

- Some lines of code have been intentionally left incomplete. Fill out the necessary code to progress through Homework 6

In [None]:
# Pass the test set through the finetuned model, requesting attentions and hidden states
# so they can be compared with the outputs collected before training.
# TODO(student): complete the assignment below (intentionally left incomplete; the hint follows the '#').
outputs_after_training = # model(**x_test, return_dict=True, output_attentions=True, output_hidden_states=True)
outputs_after_training.keys()

In [None]:
# Use logits for generating predictions, same procedure as before training.
# TODO(student): complete the two assignments below (intentionally left incomplete).
Y_probas_after_training = #tf.keras.activations.softmax(outputs_after_training.logits)
Y_pred_after_training = #tf.argmax(Y_probas_after_training, axis=1)
# Print predictions next to the reference labels for a quick visual comparison.
print(Y_pred_after_training)
print(y_test)

In [None]:
len(outputs_after_training["attentions"]) # layers

In [None]:
outputs_after_training["attentions"][5].shape # batch, heads, seq, seq

In [None]:
visualize_attentions(attentions=outputs_after_training["attentions"], layer=0, sample=0)