In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:

# Import the modelling (TensorFlow/Keras), BERT (transformers) and
# evaluation (scikit-learn) libraries used throughout the notebook.
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D, Concatenate, Bidirectional, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import ZeroPadding1D
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# pandas is used to read the tweet CSV; Google Drive is mounted at
# /content/drive so that files stored there can be opened by path.
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [None]:
# The encoder and the tokenizer must come from the same checkpoint, so the
# checkpoint name is written once and shared by both loads.
BERT_CHECKPOINT = 'bert-base-uncased'

# Pre-trained BERT encoder
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenizer belonging to that checkpoint
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# Every tweet is represented by three int32 sequences of length `max_length`:
# token ids, attention mask and segment (token type) ids.
max_length = 128
input_word_ids, input_mask, segment_ids = (
    Input(shape=(max_length,), dtype=tf.int32, name=layer_name)
    for layer_name in ("input_word_ids", "input_mask", "segment_ids")
)


In [None]:
# Run the three inputs through BERT and keep the per-token hidden states
# (the first field of the model output).
bert_result = bert_model(
    input_word_ids,
    attention_mask=input_mask,
    token_type_ids=segment_ids,
)
bert_output = bert_result.last_hidden_state

In [None]:
bert_output.shape

TensorShape([None, 128, 768])

In [None]:
# Bidirectional LSTM over the BERT token embeddings. return_sequences=True
# keeps one output vector per token so the CNN below can slide over them.
lstm_units = 64
recurrent_layer = LSTM(units=lstm_units, return_sequences=True)
lstm_output = Bidirectional(recurrent_layer)(bert_output)

In [None]:
lstm_output.shape

TensorShape([None, 128, 128])

In [None]:
# 1-D convolution over the Bi-LSTM sequence followed by max pooling.
# 'same' padding together with a pooling stride of 1 leaves the sequence
# length unchanged.
num_filters = 64
kernel_size = 3

conv_layer = Conv1D(
    filters=num_filters,
    kernel_size=kernel_size,
    padding='same',
    activation='relu',
)
conv_output = conv_layer(lstm_output)

pool_layer = MaxPooling1D(pool_size=2, strides=1, padding='same')
pool_output = pool_layer(conv_output)

In [None]:
pool_output.shape

TensorShape([None, 128, 64])

In [None]:
from tensorflow.keras.layers import GlobalMaxPooling1D

# Join the Bi-LSTM features and the CNN features along the last (feature)
# axis, then take the maximum over the sequence axis to get one vector per tweet.
merge_layer = Concatenate(axis=-1)
concat_output = merge_layer([lstm_output, pool_output])

sequence_pool = GlobalMaxPooling1D()
reduced_output = sequence_pool(concat_output)

In [None]:
# Classification head: one hidden ReLU layer with dropout, then a single
# sigmoid unit that outputs one value in [0, 1] per tweet.
dense_units = 128
dropout_rate = 0.5

dense_output = Dense(units=dense_units, activation='relu')(reduced_output)
dropout_output = Dropout(rate=dropout_rate)(dense_output)
output = Dense(units=1, activation='sigmoid')(dropout_output)

In [None]:
# Assemble the functional model: three BERT inputs in, one sigmoid output out.
model_inputs = [input_word_ids, input_mask, segment_ids]
model = Model(inputs=model_inputs, outputs=output)

In [None]:
# Optimizer configuration
learning_rate = 1e-5
optimizer = Adam(learning_rate=learning_rate)

# Binary cross-entropy goes with the single sigmoid output unit;
# accuracy is tracked during training.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) to check that the architecture was wired as intended.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_word_ids[0][0]',         
                                thPoolingAndCrossAt               'input_mask[0][0]',         

In [None]:
# Read the tweet dataset from the mounted Google Drive.
# Later cells read the 'text' and 'target' columns of this frame.
file_path = "/content/drive/MyDrive/CS298/tweets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Tokenize every tweet into fixed-length BERT inputs.
#
# The whole column is encoded with one batched tokenizer call instead of one
# `encode_plus` call per row followed by concatenation.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` states explicitly the truncation that was previously
# applied only implicitly (with a warning) because `max_length` was given.
encoded_batch = tokenizer(
    df['text'].tolist(),
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='np',
)

# Each array has one row per tweet and `max_length` columns. The cast to int32
# matches the dtype declared on the model's Input layers.
input_ids = encoded_batch['input_ids'].astype(np.int32)
attention_masks = encoded_batch['attention_mask'].astype(np.int32)
token_type_ids = encoded_batch['token_type_ids'].astype(np.int32)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Training hyperparameters
batch_size = 32
epochs = 5

# Hold out 20% of the tweets as a test set. The three BERT input arrays and
# the labels are passed to a single call so their rows stay aligned, and the
# fixed random_state makes the split repeatable.
(
    X_train_ids, X_test_ids,
    X_train_masks, X_test_masks,
    X_train_segments, X_test_segments,
    y_train, y_test,
) = train_test_split(
    input_ids,
    attention_masks,
    token_type_ids,
    df['target'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the model on the training split; a further 20% of that split is
# reserved by Keras for validation. The returned History object is used
# for the training-curve plots.
training_inputs = [X_train_ids, X_train_masks, X_train_segments]
history = model.fit(
    training_inputs,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/5






In [None]:
import matplotlib.pyplot as plt

# One row, two panels: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 5))

# (axes, training key, validation key, title, y-axis label, legend position)
panels = (
    (accuracy_ax, 'accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy', 'upper left'),
    (loss_ax, 'loss', 'val_loss', 'Model loss', 'Loss', 'upper right'),
)

for ax, train_key, val_key, title, y_label, legend_loc in panels:
    # Training curve first, validation curve second, matching the legend order
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc=legend_loc)

plt.show()

In [None]:
# Evaluate on the held-out test split.

# The model ends in a single sigmoid unit, so predict() returns one value per
# example. reshape(-1) flattens it to 1-D and, unlike squeeze(), keeps it 1-D
# even when the test set holds a single example (squeeze() would collapse that
# case to a 0-d scalar, which the metric functions cannot score).
y_prob = model.predict([X_test_ids, X_test_masks, X_test_segments]).reshape(-1)

# Explicit 0.5 decision threshold. The strict '>' reproduces the previous
# np.round behaviour, where a value of exactly 0.5 rounds to 0.
y_pred = (y_prob > 0.5).astype(int)

In [None]:
# Sanity check before scoring: labels and predictions should have the same shape.
for label, values in (("y_test", y_test), ("y_pred", y_pred)):
    print(f"{label} shape:", values.shape)

In [None]:
# Score the thresholded predictions against the true test labels.
accuracy, recall, precision, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)

# Report each metric to four decimal places.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")
