### Text Data Preprocessing

In [3]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

Reading the CSV file into a pandas DataFrame

In [4]:
# Load the labeled sentences from the CSV file into a pandas DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
# Later cells read the "Sentence" and "Label" columns from this frame.
df = pd.read_csv("data/labeled_sentences.csv")

Extracting the text and its associated label from each row of the CSV data, storing them in _sentences_ and _labels_ respectively, and then splitting them into training and test sets

In [5]:
# Extract sentences and labels from the DataFrame.
# Rows with a missing sentence or label are dropped first: a NaN sentence is a float,
# not a string, so it is not valid tokenizer input, and a NaN label would end up as
# an extra entry when the distinct labels are collected later on.
labeled_rows = df.dropna(subset=["Sentence", "Label"])
# Cast to str so sentences that pandas parsed as numbers are still valid text input.
sentences = labeled_rows["Sentence"].astype(str).tolist()
labels = labeled_rows["Label"].tolist()

# Split data into training (80%) and testing (20%) sets.
# random_state is fixed so the same rows land in each split on every run.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load the pretrained tokenizer that matches the BERT checkpoint used below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize both splits with identical settings (training split first, then test).
# truncation=True / padding=True are passed unchanged to each tokenizer call,
# so each split is encoded in its own call.
train_encodings, test_encodings = (
    tokenizer(split_sentences, truncation=True, padding=True)
    for split_sentences in (train_sentences, test_sentences)
)

In [7]:
# Convert labels to numerical values.
# The distinct labels are sorted before ids are assigned: iterating a bare set of
# strings can yield a different order in every Python process (string hashing is
# randomized per process), which would change the label -> id mapping between kernel
# restarts and make trained models / reported metrics impossible to compare.
# Sorting requires the labels to be mutually comparable (e.g. all str or all int).
label2id = {label: i for i, label in enumerate(sorted(set(labels)))}
# NOTE: train_labels / test_labels are rebound from raw labels to integer ids here,
# so this cell expects the raw labels produced by the train/test split cell.
train_labels = [label2id[label] for label in train_labels]
test_labels = [label2id[label] for label in test_labels]

def _as_tf_dataset(encodings, label_ids):
    """Pair tokenizer output with its label ids as a tf.data.Dataset.

    The encodings are converted to a plain dict, and the dataset is sliced
    along the first axis, so each element is a (features dict, label id) tuple.
    """
    return tf.data.Dataset.from_tensor_slices((dict(encodings), label_ids))


# Build one TensorFlow dataset per split
train_dataset = _as_tf_dataset(train_encodings, train_labels)
test_dataset = _as_tf_dataset(test_encodings, test_labels)

In [8]:
import keras
# NOTE(review): the optimizer/loss/metric below come from the standalone `keras`
# package; presumably this must be the same Keras implementation the transformers
# TF model is built on — confirm in your environment.

# Seed TensorFlow's global random generator and the shuffle below so repeated runs
# start from the same random state. Same value as the random_state used for the
# train/test split.
SEED = 42
tf.random.set_seed(SEED)

# Define batch size
batch_size = 32

# Shuffle and batch the datasets.
# The shuffle buffer covers the whole training set; the test set is only batched,
# so its order stays fixed.
# NOTE: train_dataset / test_dataset are rebound to their batched versions, so this
# cell is not idempotent — re-run the dataset-building cell before re-running it.
train_dataset = train_dataset.shuffle(len(train_dataset), seed=SEED).batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# Initialize BERT model for sequence classification, with one output per distinct label
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id))

# Define training parameters.
# Labels are integer ids and the loss is told to expect raw logits (from_logits=True),
# hence the sparse categorical loss/metric.
optimizer = keras.optimizers.Adam(learning_rate=2e-5)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
model.fit(train_dataset, epochs=5)

# Evaluate the model on the held-out split.
# With a single compiled metric, evaluate() returns [loss, accuracy].
eval_result = model.evaluate(test_dataset)
print("Test accuracy:", eval_result[1])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5

KeyboardInterrupt: 