In [1]:
import pandas as pd
import numpy as np

In [2]:
df6=pd.read_csv('cleaned_bert_dataset.csv')

In [3]:
df6.head()

Unnamed: 0.1,Unnamed: 0,cleaned_text,sentiment_label
0,1,purchased device worked advertised never much ...,0
1,2,works expected sprung higher capacity think ma...,1
2,3,think worked greathad diff bran gb card went s...,1
3,4,bought retail packaging arrived legit orange e...,1
4,5,mini storage doesnt anything else supposed pur...,0


In [4]:
df6=df6.drop(['Unnamed: 0'],axis='columns')

In [5]:
df6.head()

Unnamed: 0,cleaned_text,sentiment_label
0,purchased device worked advertised never much ...,0
1,works expected sprung higher capacity think ma...,1
2,think worked greathad diff bran gb card went s...,1
3,bought retail packaging arrived legit orange e...,1
4,mini storage doesnt anything else supposed pur...,0


In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datasets import Dataset
import torch
import pandas as pd
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(
    df6['cleaned_text'], df6['sentiment_label'], test_size=0.2, random_state=42
)

#Convert Data into Hugging Face Dataset Format
train_data = Dataset.from_dict({"text": X_train.tolist(), "label": y_train.tolist()})
test_data = Dataset.from_dict({"text": X_test.tolist(), "label": y_test.tolist()})

#Tokenize the Data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Set format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Load Pre-trained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
)

#compute_metrics
def compute_metrics(p):
    predictions = torch.tensor(p.predictions)  # Convert predictions to tensor
    preds = torch.argmax(predictions, axis=1).numpy()  # Get the predicted labels
    labels = p.label_ids  # True labels
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": classification_report(labels, preds, output_dict=True)["weighted avg"]["f1-score"],
    }

#Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#Train model
trainer.train()

#Evaluate the Model
results = trainer.evaluate()
print("Evaluation Results:", results)

#Additional Metrics: Accuracy, Classification Report, Confusion Matrix
test_results = trainer.predict(test_data)
y_true = test_results.label_ids
y_pred = np.argmax(test_results.predictions, axis=1)

print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["Negative", "Positive"]))
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))





Map:   0%|          | 0/3930 [00:00<?, ? examples/s]

Map:   0%|          | 0/983 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1238,0.287166,0.896236,0.898473
2,0.1516,0.387735,0.911495,0.914037
3,0.195,0.421419,0.912513,0.913653


Evaluation Results: {'eval_loss': 0.28716564178466797, 'eval_accuracy': 0.896236012207528, 'eval_f1': 0.8984727705624751, 'eval_runtime': 670.6275, 'eval_samples_per_second': 1.466, 'eval_steps_per_second': 0.183, 'epoch': 3.0}
Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

    Negative       0.71      0.80      0.75       193
    Positive       0.95      0.92      0.93       790

    accuracy                           0.90       983
   macro avg       0.83      0.86      0.84       983
weighted avg       0.90      0.90      0.90       983


Confusion Matrix:
[[154  39]
 [ 63 727]]


In [7]:
#prediction Function
def predict_analysis(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, axis=1).item()
    sentiment = "Positive" if prediction == 1 else "Negative"
    return f"The sentiment of the input text is: {sentiment}"

In [8]:
predict_analysis('Good')

'The sentiment of the input text is: Positive'

In [9]:
predict_analysis('Not Good')

'The sentiment of the input text is: Negative'

In [10]:
predict_analysis('I hate')

'The sentiment of the input text is: Negative'

In [11]:
predict_analysis('I love this')

'The sentiment of the input text is: Positive'

In [12]:
#export directory
export_dir = "./bert_sentiment_model"

#save model
model.save_pretrained(export_dir)

#save_tokenizer
tokenizer.save_pretrained(export_dir)

print(f"Model and tokenizer saved to {export_dir}")


Model and tokenizer saved to ./bert_sentiment_model


The model is saved in model.safetensors this for mat. so if you want to export as pytorch_model.bin use below.

In [13]:
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_directory = "./bert_pytorch_model"
model.save_pretrained(save_directory, safe_serialization=False)
tokenizer.save_pretrained(save_directory)

print(f"Model saved in PyTorch format at {save_directory}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model saved in PyTorch format at ./bert_pytorch_model
