In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through the local filesystem of the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the tweet/cluster table from the mounted Drive. The path is anchored at
# the '/content/drive' mount point so the read does not depend on the kernel's
# current working directory.
df = pd.read_csv('/content/drive/MyDrive/PJAIT/ZUM/Twitter_Clustered_Data.csv')

In [5]:
# Fine-tuning BERT is slow, so only a random subset of at most 50K rows is used.
# - random_state fixes the draw, so a fresh run selects the same rows.
# - min(...) caps the request at the number of available rows; asking sample()
#   for more rows than exist (without replacement) raises a ValueError.
SAMPLE_SIZE = 50000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

In [7]:
# Raw tweet text is the model input.
texts = df['tweet'].values

# Encode the 'Cluster' column as integer class ids; the fitted encoder is kept
# around because it is saved together with the model later on.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Cluster'].values)

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the two splits independently (train first, then test), each with
# truncation and padding enabled.
train_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True)
    for split_texts in (train_texts, test_texts)
)


# Training hyper-parameters, gathered in one place so they are easy to tune.
SEED = 42
BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 1e-5

# Training pipeline: shuffle individual examples *before* batching so that every
# epoch sees differently composed batches. Shuffling an already batched dataset
# only permutes the order of fixed batches. The buffer covers the whole training
# split, which gives a full shuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        train_labels
    ))
    .shuffle(len(train_labels), seed=SEED)
    .batch(BATCH_SIZE)
)

# Evaluation pipeline: order has no influence on the metrics, so no shuffling.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(BATCH_SIZE)

# Size the classification head from the fitted label encoder instead of a
# hard-coded class count, so it always matches the labels present in the data.
num_labels = len(label_encoder.classes_)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# The loss uses from_logits=True, i.e. it is fed the model's raw, un-normalised
# outputs together with the integer class ids.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# `batch_size` is deliberately not passed: the datasets are already batched and
# Keras expects the argument to be omitted for tf.data inputs.
# NOTE: validation_data is the same held-out split that is evaluated below, so
# the per-epoch validation accuracy is monitoring only, not an independent check.
model.fit(train_dataset,
          epochs=EPOCHS,
          validation_data=test_dataset)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_loss, accuracy = model.evaluate(test_dataset)
print("Test Accuracy:", accuracy)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test Accuracy: 0.9531000256538391


In [8]:
import joblib

# Output locations, relative to the current working directory.
save_path = 'BERT_model'
tokenizer_path = 'BERT_model_tokenizer'
label_encoder_path = 'BERT_LABEL_encoder'

# Write the fine-tuned model and its tokenizer with their save_pretrained methods.
model.save_pretrained(save_path)
tokenizer.save_pretrained(tokenizer_path)

# Store the fitted label encoder next to them. Kept as the final expression so
# the cell displays the list of files written by joblib.
joblib.dump(label_encoder, label_encoder_path)

['BERT_LABEL_encoder']