In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

In [2]:
# Load the data
train_data = pd.read_csv('tweets-train.csv')
valid_data = pd.read_csv('tweets-valid.csv')
test_data = pd.read_csv('tweets-test.csv')

In [3]:
# Extract the text and labels
train_tweets = train_data['tweet'].tolist()
train_labels = train_data['label'].tolist()
valid_tweets = valid_data['tweet'].tolist()
valid_labels = valid_data['label'].tolist()
test_tweets = test_data['tweet'].tolist()
test_labels = test_data['label'].tolist()

In [4]:
# Load the pre-trained transformer model and tokenizer
transformer_model = TFAutoModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
# Tokenize the input text
max_length = 280
train_encodings = tokenizer(train_tweets, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_tweets, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_tweets, truncation=True, padding=True, max_length=max_length)

In [6]:
# Convert labels to numpy arrays
train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)
test_labels = np.array(test_labels)

In [7]:
# Create input tensors
train_input_ids = tf.constant(pad_sequences(train_encodings['input_ids'], maxlen=max_length))
train_attention_mask = tf.constant(pad_sequences(train_encodings['attention_mask'], maxlen=max_length))
valid_input_ids = tf.constant(pad_sequences(valid_encodings['input_ids'], maxlen=max_length))
valid_attention_mask = tf.constant(pad_sequences(valid_encodings['attention_mask'], maxlen=max_length))
test_input_ids = tf.constant(pad_sequences(test_encodings['input_ids'], maxlen=max_length))
test_attention_mask = tf.constant(pad_sequences(test_encodings['attention_mask'], maxlen=max_length))

In [8]:
# Define the input layers
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

In [9]:
# Build the transformer model
transformer_output = transformer_model({'input_ids': input_ids, 'attention_mask': attention_mask})[0]

In [10]:
# Add a classification layer
output = Dense(units=1, activation='sigmoid')(transformer_output[:, 0, :])

In [11]:
# Make the BERT layers trainable
for layer in transformer_model.layers:
    layer.trainable = True

In [12]:
# Create the model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

In [13]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit([train_input_ids, train_attention_mask], train_labels, validation_data=([valid_input_ids, valid_attention_mask], valid_labels), epochs=5
          , batch_size=32)

Epoch 1/5
  7/379 [..............................] - ETA: 4:53:33 - loss: 2.7907 - accuracy: 0.3348