In [29]:
pip install transformers



In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

In [31]:
# Load the dataset
# Assuming the dataset is in a file named 'dataset.xlr'
data = pd.read_excel('/content/Cricket.xlsx')

In [32]:
# Split the dataset into input texts and corresponding labels
texts = data['Text'].values
labels = data['Polarity'].values

In [33]:
# Load the BanglaBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')

In [34]:
# Tokenize the input texts
encoded_texts = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='tf')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [35]:
# Convert labels to integers (positive: 1, negative: 0)
labels = np.array([1 if label == 'positive' else 0 for label in labels])

In [39]:
# Split the data into training and validation sets
train_size = int(0.8 * len(encoded_texts['input_ids']))
train_inputs = {key: val[:train_size] for key, val in encoded_texts.items()}
train_labels = labels[:train_size]

val_inputs = {key: val[train_size:] for key, val in encoded_texts.items()}
val_labels = labels[train_size:]

In [40]:
# Load the pre-trained BanglaBERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base', num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [42]:
# Train the model
model.fit(train_inputs, train_labels, validation_data=(val_inputs, val_labels), epochs=5, batch_size=32)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x78c5d86e4700>

In [43]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(val_inputs, val_labels)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8489933013916016
