Step 1: Import the necessary libraries

In [7]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

Step 2: Load the data

In [8]:
def load_imdb_data(data_file):
    """Read a labelled CSV and return its texts together with binary labels.

    Args:
        data_file: Anything ``pandas.read_csv`` accepts (path or file-like
            object). The data must contain the columns 'OM_Regular' and
            'OM_Prediction'.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is the list of 'OM_Regular'
        values; ``labels`` is a list of ints of the same length, 1 where
        'OM_Prediction' equals "P" and 0 for every other value.
    """
    frame = pd.read_csv(data_file)
    texts = frame['OM_Regular'].tolist()

    labels = []
    for sentiment in frame['OM_Prediction'].tolist():
        # Anything that is not exactly "P" is treated as the negative class.
        labels.append(1 if sentiment == "P" else 0)

    return texts, labels

In [9]:
# Path of the labelled CSV; it is read relative to the notebook's working directory.
data_file = "9_OM_dynamic_40p_60np.csv"
# texts: list of 'OM_Regular' values, labels: list of 0/1 ints (1 == "P").
texts, labels = load_imdb_data(data_file)

Step 3: Create a custom dataset class for text classification

In [10]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time an item is read.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``
            that returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return flattened tensors plus its label tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is flattened so each item is a 1-D tensor.
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(label),
        }

Step 4: Build our custom BERT classifier

In [11]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced by the linear head.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head's input width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of tokenized inputs."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output, regularised by dropout.
        dropped = self.dropout(encoder_out.pooler_output)
        return self.fc(dropped)

Step 5: Define the train() function

In [12]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean cross-entropy loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: The average loss over the batches processed, or 0.0 when
        ``data_loader`` yields no batches. (Earlier versions returned None;
        callers that ignore the return value are unaffected.)
    """
    model.train()
    # Build the loss module once instead of on every batch.
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    num_batches = 0

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()
        # The scheduler is advanced per batch, not per epoch.
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else 0.0

Step 6: Build our evaluation method


In [13]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(accuracy, report)`` tuple: the result of ``accuracy_score`` and
        of ``classification_report`` over all collected labels/predictions.
    """
    model.eval()
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # torch.max over dim 1 yields (values, indices); the index is the class id.
            batch_preds = torch.max(logits, dim=1)[1]

            predicted.extend(batch_preds.cpu().tolist())
            expected.extend(labels.cpu().tolist())

    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

Step 7: Build our prediction method

In [14]:
def predict_sentiment(text, model, tokenizer, device, max_length=500):
    """Classify a single text and return its label string.

    Args:
        text: Input passed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits; it is switched to eval mode.
        tokenizer: Callable invoked with ``return_tensors='pt'``,
            ``padding='max_length'`` and ``truncation=True``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        "P" when the highest-scoring class index is 1, otherwise "NP".
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted_class = torch.max(logits, dim=1)[1].item()

    if predicted_class == 1:
        return "P"
    return "NP"

Step 8: Define our model’s parameters

In [15]:
# Set up parameters
bert_model_name = 'bert-base-uncased'  # checkpoint used for both tokenizer and encoder
num_classes = 2                        # size of the classification head's output
max_length = 500                       # tokenizer max_length (padding/truncation target)
batch_size = 16
num_epochs = 6
learning_rate = 2e-5

# Seed torch's random number generator so that runs depending on it are
# repeatable after a kernel restart. Without a seed every
# "Restart & Run All" can produce different metrics.
seed = 42
torch.manual_seed(seed)

Step 9: Loading and splitting the data.

In [16]:
# Hold out 30% of the samples for validation; the fixed random_state keeps
# the split identical between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
)


Step 10: Initialize tokenizer, dataset, and data loader

In [17]:
# One tokenizer instance is shared by both datasets.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training split: batches are reshuffled every epoch.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation split: iterated in its original order.
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Step 11: Set up the device and model

In [18]:
#torch.cuda.empty_cache()

In [19]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier, then move its parameters to the selected device.
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Step 12: Set up optimizer and learning rate scheduler

In [20]:
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated (which is why the call previously had to silence a deprecation
# warning). eps and weight_decay are pinned to the defaults of the
# transformers implementation so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Step 13: Training the model

In [21]:
# One pass over the training data per epoch, followed by a validation report.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)

    # evaluate() returns the accuracy and a formatted per-class report.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/6
Validation Accuracy: 0.8259
              precision    recall  f1-score   support

           0       0.98      0.72      0.83      2390
           1       0.70      0.98      0.82      1591

    accuracy                           0.83      3981
   macro avg       0.84      0.85      0.83      3981
weighted avg       0.87      0.83      0.83      3981

Epoch 2/6
Validation Accuracy: 0.8259
              precision    recall  f1-score   support

           0       0.98      0.72      0.83      2390
           1       0.70      0.98      0.82      1591

    accuracy                           0.83      3981
   macro avg       0.84      0.85      0.83      3981
weighted avg       0.87      0.83      0.83      3981

Epoch 3/6
Validation Accuracy: 0.8259
              precision    recall  f1-score   support

           0       0.98      0.72      0.83      2390
           1       0.70      0.98      0.82      1591

    accuracy                           0.83      3981
   macro avg  

Saving the trained model

In [22]:
# Persist only the model's state_dict (not the module object); the file is
# written relative to the notebook's working directory.
torch.save(model.state_dict(), "bert_classifier_all_ds.pth")

In [23]:
# # ORM status prediction
# test_text = ""
# sentiment = predict_sentiment(test_text, model, tokenizer, device)
# print("")
# print(f"status: {sentiment}")

Step 14: Evaluating our model’s performance

In [24]:
import pandas as pd

# Load the separate test set (an Excel workbook, not a CSV)
test_df = pd.read_excel("Customer_Order_testset.xlsx")  # Update with your file path

# 'OM_Regular' is the text column. Dedicated names are used here so that this
# cell does not overwrite the training corpus held in `texts`, which the
# train/validation split cell reads.
test_texts = test_df['OM_Regular']

# Predict each text in turn, using the same max_length as during training
for test_text in test_texts:
    sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length=max_length)
    print("")
    print(f"status: {sentiment}")



status: NP

status: NP

status: NP

status: NP

status: NP

status: NP

status: NP

status: NP


Calculating results from a separate test set

In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, classification_report

In [2]:
# Ground-truth workbook for the separate test set.
dc = pd.read_excel('Customer_Order_testset.xlsx')

In [3]:
# Pull the text column and the label column out as NumPy arrays.
X_test2 = dc['OM_Regular'].values
y_test2 = dc['OM_Prediction'].values

In [4]:
# Sanity check: both arrays should have the same number of rows.
print(X_test2.shape)
print(y_test2.shape)

# The metric calls below rely on the labels being numeric.
print("X data type: ", X_test2.dtype)
print("y data type: ", y_test2.dtype)

(8,)
(8,)
X data type:  object
y data type:  int64


In [5]:
# Show the ground-truth labels of the test set.
print(y_test2)

[0 0 0 0 0 1 1 0]


In [6]:
# Workbook holding the predicted labels for the same test set.
# NOTE(review): nothing in this notebook writes this file; presumably it was
# filled in from the printed "status" predictions. Confirm its provenance.
dd = pd.read_excel('Customer_Order_pred_testset_2.xlsx')

In [7]:
# X_test_pred2 is not referenced again in the visible cells; only the
# predicted labels are used by the metric calls.
X_test_pred2 = dd['OM_Regular'].values
y_test_pred2 = dd['OM_Prediction'].values

In [8]:
# Show the predicted labels for comparison with the ground truth.
print(y_test_pred2)

[0 0 0 0 0 0 0 0]


In [9]:
# When no sample is predicted as the positive class, precision (and therefore
# F1) is undefined. zero_division=0 states explicitly that such a score is
# reported as 0 and avoids the UndefinedMetricWarning.
precision = precision_score(y_test2, y_test_pred2, zero_division=0)
print("Testing: Precision = %f" % precision)

recall = recall_score(y_test2, y_test_pred2, zero_division=0)
print("Testing: Recall = %f" % recall)

f1 = f1_score(y_test2, y_test_pred2, zero_division=0)
print("Testing: F1 Score = %f" % f1)

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix (Test Data):\n", confusion_matrix(y_test2, y_test_pred2))

Testing: Precision = 0.000000
Testing: Recall = 0.000000
Testing: F1 Score = 0.000000

Confusion Matrix (Test Data):
 [[6 0]
 [2 0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# zero_division=0 reports undefined per-class scores as 0 without emitting
# an UndefinedMetricWarning for classes that are never predicted.
print(classification_report(y_test2, y_test_pred2, zero_division=0))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       0.00      0.00      0.00         2

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
