In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True

In [3]:
# Load data
dtype_dict = {'Category': str} 
data = pd.read_csv("book4.csv", dtype=dtype_dict, nrows=133)
category_map = {"\"Adversal Effects\"": 0, "\"Product Quality\"": 1, "\"Medical Information\"": 2}
data['label'] = data['Category'].map(category_map)

# Handle missing values
data['review'] = data['review'].fillna("No information available")

In [4]:
data.head(50)

Unnamed: 0,Category,review,label
0,"""Adversal Effects""","""Started taking Tylenol for my headache, but e...",0.0
1,"""Adversal Effects""","""Feeling dizzy and anxious after trying Advil ...",0.0
2,"""Adversal Effects""","""Allegra was supposed to help with my allergie...",0.0
3,"""Adversal Effects""","""Feeling down and discouraged after experienci...",0.0
4,"""Adversal Effects""","""Zyrtec seemed like a good choice for my aller...",0.0
5,"""Adversal Effects""","""2nd day on 5mg started to work with rock hard...",0.0
6,"""Adversal Effects""","""Started Prozac for my anxiety, but now I'm fe...",0.0
7,"""Adversal Effects""","""Crestor was supposed to lower my cholesterol,...",0.0
8,"""Adversal Effects""",""" I Ve had nothing but problems with the Kepp...",0.0
9,"""Adversal Effects""","""Feeling exhausted and drained after trying Cl...",0.0


In [5]:
grouped_data = data.groupby('Category').size().reset_index(name='Count')
print(grouped_data)

                Category  Count
0     "Adversal Effects"     32
1  "Medical Information"     32
2      "Product Quality"     28


In [6]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode data
input_ids = []
attention_masks = []

for review in data['review']:
    encoded_dict = tokenizer.encode_plus(
                        review,
                        add_special_tokens=True,
                        max_length=128,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt'
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])   
# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(data['label'].values)


In [7]:
print(input_ids)

tensor([[ 101, 1000, 2318,  ...,    0,    0,    0],
        [ 101, 1000, 3110,  ...,    0,    0,    0],
        [ 101, 1000, 2035,  ...,    0,    0,    0],
        ...,
        [ 101, 2053, 2592,  ...,    0,    0,    0],
        [ 101, 2053, 2592,  ...,    0,    0,    0],
        [ 101, 2053, 2592,  ...,    0,    0,    0]])


In [8]:
# Split data
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=42
)

# Convert target labels to one-dimensional tensors
train_labels = torch.tensor(train_labels, dtype=torch.long)
val_labels = torch.tensor(val_labels, dtype=torch.long)


  train_labels = torch.tensor(train_labels, dtype=torch.long)
  val_labels = torch.tensor(val_labels, dtype=torch.long)


In [9]:
print(train_masks)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [10]:
# Filter out samples with unexpected label value
train_inputs = train_inputs[train_labels != -9223372036854775808]
train_masks = train_masks[train_labels != -9223372036854775808]
train_labels = train_labels[train_labels != -9223372036854775808]

val_inputs = val_inputs[val_labels != -9223372036854775808]
val_masks = val_masks[val_labels != -9223372036854775808]
val_labels = val_labels[val_labels != -9223372036854775808]
print("Unique values in train_labels:", train_labels.unique())
print("Unique values in val_labels:", val_labels.unique())


Unique values in train_labels: tensor([0, 1, 2])
Unique values in val_labels: tensor([0, 1, 2])


In [11]:
# DataLoaders, Model, Optimizer, Scheduler, Training loop, Evaluation, Classification report.
batch_size = 16

# Create DataLoader for training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [12]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                      num_labels=len(category_map))
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                            num_training_steps=total_steps)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training loop with validation loss calculation
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    # Training phase
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    # Calculate average training loss
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("Average training loss for Epoch {}: {:.2f}".format(epoch + 1, avg_train_loss))
    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, 
                            labels=b_labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()
    # Calculate average validation loss
    avg_val_loss = total_val_loss / len(val_dataloader)
    print("Average validation loss for Epoch {}: {:.2f}".format(epoch + 1, avg_val_loss))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████| 6/6 [01:39<00:00, 16.66s/it]


Average training loss for Epoch 1: 1.09
Average validation loss for Epoch 1: 1.01


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████████| 6/6 [01:27<00:00, 14.59s/it]


Average training loss for Epoch 2: 0.95
Average validation loss for Epoch 2: 0.92


Epoch 3: 100%|███████████████████████████████████████████████████████████████████████████| 6/6 [01:30<00:00, 15.05s/it]


Average training loss for Epoch 3: 0.87
Average validation loss for Epoch 3: 0.84


Epoch 4: 100%|███████████████████████████████████████████████████████████████████████████| 6/6 [01:49<00:00, 18.29s/it]


Average training loss for Epoch 4: 0.81
Average validation loss for Epoch 4: 0.81


In [13]:
model.eval()
total_val_loss = 0
    
with torch.no_grad():
    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
            
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        val_loss = outputs.loss
        total_val_loss += val_loss.item()
    
    # Calculate average validation loss
avg_val_loss = total_val_loss / len(val_dataloader)
print("Average validation loss for Epoch {}: {:.2f}".format(epoch + 1, avg_val_loss))

Average validation loss for Epoch 4: 0.81


In [14]:
import pandas as pd
# Load the test set CSV file
test_data = pd.read_csv("valid.csv")  # Replace "test_set.csv" with the actual file path
# Iterate through each sentence in the test set
for input_sentence in test_data['test_sentence']:  # Replace 'sentence_column' with the actual column name containing the sentences
    # Tokenize the input sentence
    input_tokens = tokenizer.encode_plus(
        input_sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Move tensors to the appropriate device
    input_ids = input_tokens['input_ids'].to(device)
    attention_mask = input_tokens['attention_mask'].to(device)
    # Pass input through the model to get logits
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    # Get the predicted class index
    predicted_index = outputs.logits.argmax().item()
    # Map the predicted index to the corresponding category label
    predicted_category = list(category_map.keys())[predicted_index]
    print("Input sentence:", input_sentence)
    print("Predicted category:", predicted_category)


Input sentence: "I read an article discussing the potential benefits of Advil and Tylenol for managing minor aches and pains. I'm considering using them as needed."
Predicted category: "Medical Information"
Input sentence: "I bought a pack of antibiotics, but they were expired and ineffective."
Predicted category: "Product Quality"
Input sentence: "Read about the importance of adherence to anticoagulant therapy in preventing blood clots. Making sure to take my medication as prescribed."
Predicted category: "Medical Information"
Input sentence: "After opening the medicine bottle, I discovered that some of the pills were broken and crumbled, which made me concerned about their effectiveness and safety"
Predicted category: "Product Quality"
Input sentence: "Zyrtec seemed like a good choice for my allergies, but now I'm feeling overwhelmed and anxious."
Predicted category: "Adversal Effects"


In [15]:
model.eval()
val_loss = 0
val_accuracy = 0
nb_eval_steps = 0

for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        
    logits = outputs.logits
    val_loss += loss.item()
    
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    val_accuracy += np.sum(np.argmax(logits, axis=1) == label_ids)
    nb_eval_steps += 1

val_loss = val_loss / nb_eval_steps
val_accuracy = val_accuracy / (len(val_dataloader.dataset))
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)


Validation Loss: 0.8083297610282898
Validation Accuracy: 0.8888888888888888


In [16]:
# Define the directory path where you want to save the model
output_dir = "./fine_tuned_bert_model1"

# Create the directory if it doesn't exist
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully at:", output_dir)

Model saved successfully at: ./fine_tuned_bert_model1


In [None]:
input_sentence = "At first I suffered through them. This included splitting head pain, nausea, and vomiting. I started using Excedrin after a while which helped if I took it right away. Then that started to not work so well anymore. I had one really bad one that lasted hours. I was still throwing up at 9 pm and I was now throwing up blood. I went to the ER and when I finally got in they put me on an IV to hydrate me. They then added Imitrex to the bag and I soon started feeling side effects. My head felt like I was going to pass out. My breathing became labored and it felt like someone was sitting on my chest. They said that meant it was working. After about two minutes everything was gone. I got pills to take at home and they worked okay. Not as fast."
#input_sentence= "I bought a pack of antibiotics, but they were expired and ineffective."
#input_sentence= "I read an article discussing the potential benefits of Advil and Tylenol for managing minor aches and pains. I'm considering using them as needed."
input_sentence= "After opening the medicine bottle, I discovered that some of the pills were broken and crumbled, which made me concerned about their effectiveness and safety"
#input_sentence = "read in article Dolo 650 Tablet need to take twice docter gave me this advice"
# Tokenize the input sentence
input_tokens = tokenizer.encode_plus(
    input_sentence,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Move tensors to the appropriate device
input_ids = input_tokens['input_ids'].to(device)
attention_mask = input_tokens['attention_mask'].to(device)

# Pass input through the model to get logits
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)

# Get the predicted class index
predicted_index = outputs.logits.argmax().item()

# Map the predicted index to the corresponding category label
predicted_category = list(category_map.keys())[predicted_index]

print("Predicted category:", predicted_category)
