# HW09: Transformers

Remember that these homework work as a completion grade. **You can skip one section of this homework.**

In [1]:
#!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
#!pip install transformers
#!pip install scikit-learn

In [2]:
import pandas as pd
import nltk
df = pd.read_csv('train.csv')

N_SAMPLES = 10000 # only use N_SAMPLES datapoints
SEQLEN = 256 			# use masking for padded sequences of length 256
N_EPOCHS = 2			# train for N_EPOCHS
BATCHSIZE = 16		# use batchsize

df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
	return label_map[x]
df["label"] = df["label"].apply(replace_label) 
df["text"] = df["title"] + " " + df["lead"]
df = df.sample(n=N_SAMPLES) 
print(f'N_SAMPLES: {df.shape[0]}')
df.head()

N_SAMPLES: 10000


Unnamed: 0,label,title,lead,text
18373,sport,Decision appealing for Kapler and Nixon,"Beware, titans of the legal world. There may b...",Decision appealing for Kapler and Nixon Beware...
84421,world,Strong Earthquake Jolts Northern Japan,TOKYO (Reuters) - A strong earthquake with a ...,Strong Earthquake Jolts Northern Japan TOKYO ...
25686,sci/tech,Will Expand Beyond's Patents Lead to New Partn...,Ziff Davis - Wireless software provider Expand...,Will Expand Beyond's Patents Lead to New Partn...
42984,sport,American Express Cship: Woods eyes fourth titl...,LONDON: Three-times winner Tiger Woods bids fo...,American Express Cship: Woods eyes fourth titl...
93063,world,Congress Ready to Update Special Ed Law (AP),AP - Congressional negotiators have reached ag...,Congress Ready to Update Special Ed Law (AP) A...


## Hugginface Transformers

In [3]:
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig
import tensorflow as tf

config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
config.num_labels = 4
transformer_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

tf.config.list_physical_devices('GPU')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
transformer_model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,956,548
Trainable params: 66,956,548
Non-trainable params: 0
_________________________________________________________________


In [5]:
##TODO build a transformer model to do sequence classification with the goal to predict the label from the text

input_ids = tf.keras.layers.Input(shape=(SEQLEN,), name='input_ids', dtype='int32')
input_masks_ids = tf.keras.layers.Input(shape=(SEQLEN,), name='attention_mask', dtype='int32')

# freeze pretrained weights to tune only classification head
transformer_model.distilbert.trainable = False 

# build model to handle input tokens and masks
X = transformer_model(input_ids, input_masks_ids)
model = tf.keras.Model(inputs=[input_ids, input_masks_ids], outputs = X)

In [6]:
##TODO print the summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_distil_bert_for_sequence_cl  TFSequenceClassifie  66956548   ['input_ids[0][0]',              
 assification (TFDistilBertForS  rOutput(loss=None,               'attention_mask[0][0]']         
 equenceClassification)         logits=(None, 4),                                                 
                                 hidden_states=None                                           

In [7]:
##TODO compile the model
from tensorflow.keras import optimizers, losses

optimizer = optimizers.Adam(learning_rate=3e-5) # use adam as the optimizer
loss = losses.SparseCategoricalCrossentropy(from_logits=True) # cost function
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # compute accuracy, for scoring

model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']) 

**Hint:** All the vectorized pieces of text must have the same length (which will be equal to the input size). You have two options to ensure this:

1. Set the maximum length equal to the length of the shortest vectorized text
2. Choose the maximum length and then exclude all the data points that have vectors shorter than that length

**Hint:** Tensorflow requires your labels to be integers, not strings

In [8]:
##TODO split the sample into a training and a test set 
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X_train, X_test, y_train, y_test = train_test_split(df['text'].tolist(), df['label'].tolist(), test_size=.2)
print(len(X_train), len(X_test), len(y_train), len(y_test))

labelEncoder = LabelEncoder().fit(df['label'])
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

##TODO prepare the dataset for tensorflow.
def prepare_ds(X, y):
    # tokenize dataset and get integer labels
    X_tf = tokenizer(X, return_tensors="tf", padding=True, truncation=True, max_length=SEQLEN)
    y_tf = labelEncoder.transform(y)

    # build batched tensorflow dataset
    return tf.data.Dataset.from_tensor_slices((dict(X_tf), y_tf)).batch(BATCHSIZE)

train_ds = prepare_ds(X_train, y_train)

8000 2000 8000 2000


In [9]:
##TODO fit the model and print the obtained accuracy
hist = model.fit(train_ds, epochs=N_EPOCHS)

Epoch 1/2
Epoch 2/2


In [15]:
# evaluate model on test split
test_ds = prepare_ds(X_test, y_test)
loss, acc = model.evaluate(test_ds)

print(f'accuracy: {acc}')

accuracy: 0.27900001406669617


# LSTMs

In [11]:
from keras.layers import LSTM

##TODO create a sequential model with an embedding layer, a LSTM layer and two hidden layers with ReLu activation function, followed by dropout

In [12]:
##TODO compile the model and fit it to predict the business label