In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm

# Load the labelled training data.
df = pd.read_csv('result.csv')

# Encode entity_name as integer class ids. `le` is kept because later cells
# use it to size the classifier head and to persist the label mapping.
le = LabelEncoder()
df['entity_name_encoded'] = le.fit_transform(df['entity_name'])

# Prepare input data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Missing texts become empty strings. astype(str) additionally guards against
# cells that pandas parsed as numbers, which the tokenizer would reject.
df['extracted_text'] = df['extracted_text'].fillna('').astype(str)

# Tokenize every text in a single batched call instead of one encode_plus call
# per row followed by torch.cat. With padding='max_length' and truncation the
# result is already a (num_rows, max_length) tensor for each field.
encoded = tokenizer(
    df['extracted_text'].tolist(),
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# The classification loss expects int64 targets; make the dtype explicit
# rather than relying on whatever integer width LabelEncoder produced.
labels = torch.tensor(df['entity_name_encoded'].values, dtype=torch.long)

# Split the data. All three tensors go through ONE train_test_split call so
# that the same shuffled row indices are applied to inputs, masks and labels
# by construction, instead of relying on two separate calls sharing a seed.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.1
)

# Create DataLoaders
batch_size = 32

# Training batches are reshuffled every epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation order is kept fixed.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
# Set up the model: pretrained BERT encoder with a freshly initialised
# classification head, one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_),
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# upstream in favour of torch.optim.AdamW. weight_decay is pinned to 0.0
# because torch defaults to 0.01 whereas the transformers class defaulted to 0.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Training loop
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        # Each batch is (input_ids, attention_mask, labels); move it to the
        # same device as the model.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # Mean of the per-batch losses over this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss:.4f}')

print('Training complete!')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 1.1136


Epoch 2/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.8671


Epoch 3/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.7472


Epoch 4/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.6442


Epoch 5/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.5645


Epoch 6/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.5009


Epoch 7/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.4381


Epoch 8/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.3755


Epoch 9/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]


Average training loss: 0.3293


Epoch 10/10: 100%|██████████| 676/676 [06:42<00:00,  1.68it/s]

Average training loss: 0.2930
Training complete!





In [None]:
import joblib

# Persist the fine-tuned weights. Only the state dict is written, so loading
# requires rebuilding a model with the same architecture first.
torch.save(model.state_dict(), 'bert_model.pth')
print('Model saved as bert_model.pth')

# Persist the fitted label encoder so class ids can be mapped back to
# entity names in a later session.
joblib.dump(le, 'label_encoder.joblib')
print('Label encoder saved as label_encoder.joblib')

Model saved as bert_model.pth
Label encoder saved as label_encoder.joblib


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, dataloader):
    """Score a sequence classifier on a labelled dataloader.

    Args:
        model: Classifier whose forward call accepts ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and returns an object
            with a ``logits`` attribute.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.

    Returns:
        Tuple ``(accuracy, f1)`` where ``f1`` is the weighted-average F1 score.
    """
    # Run on whatever device the model's parameters live on instead of reading
    # a notebook-global `device`, which other cells rebind.
    device = next(model.parameters()).device

    model.eval()
    predictions = []
    true_labels = []

    # No gradients are needed anywhere in evaluation.
    with torch.no_grad():
        for b_input_ids, b_input_mask, b_labels in dataloader:
            outputs = model(
                b_input_ids.to(device),
                token_type_ids=None,
                attention_mask=b_input_mask.to(device),
            )

            # Predicted class = index of the largest logit in each row.
            batch_predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
            predictions.extend(batch_predictions.flatten())
            true_labels.extend(b_labels.cpu().numpy().flatten())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    return accuracy, f1

# Score the fine-tuned model on the held-out validation split and report
# both metrics to four decimal places.
accuracy, f1 = evaluate_model(model, val_dataloader)
print('Validation Accuracy: {:.4f}'.format(accuracy))
print('Validation F1-score: {:.4f}'.format(f1))

Validation Accuracy: 0.6181
Validation F1-score: 0.6120


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import re

# Units that are valid for each entity type. Extraction only accepts a value
# whose unit appears in the set for the requested entity.
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Union of every unit allowed for any entity.
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Create a dictionary to map abbreviated units to full units
unit_abbreviations = {
    'cm': 'centimetre', 'ft': 'foot', 'in': 'inch', 'm': 'metre', 'mm': 'millimetre', 'yd': 'yard',
    'g': 'gram', 'kg': 'kilogram', 'µg': 'microgram', 'mg': 'milligram', 'oz': 'ounce', 'lb': 'pound',
    'kV': 'kilovolt', 'mV': 'millivolt', 'V': 'volt',
    'kW': 'kilowatt', 'W': 'watt',
    'cl': 'centilitre', 'cu ft': 'cubic foot', 'cu in': 'cubic inch', 'dl': 'decilitre', 'fl oz': 'fluid ounce',
    'gal': 'gallon', 'l': 'litre', 'µl': 'microlitre', 'ml': 'millilitre', 'pt': 'pint', 'qt': 'quart'
}

# The extraction code lower-cases a candidate unit before looking it up, so the
# mixed-case keys above ('kV', 'mV', 'V', 'kW', 'W') could never match. Mirror
# every key in lower case; the original spellings stay available. No two keys
# collide once lower-cased.
unit_abbreviations.update(
    {abbreviation.lower(): full_name for abbreviation, full_name in unit_abbreviations.items()}
)

# Load the label encoder first: the classifier head must have exactly as many
# outputs as the encoder has classes, otherwise the saved head cannot be
# restored. NOTE(review): joblib.load unpickles the file, so only load
# artifacts from a trusted source.
import joblib
le = joblib.load('label_encoder.joblib')

# Load the trained model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))

# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one;
# weights_only restricts unpickling to tensors and plain containers.
state_dict = torch.load('bert_model.pth', map_location=device, weights_only=True)

# strict=False is kept so that a key mismatch does not abort the cell, but the
# mismatch is reported instead of being silently ignored: missing keys mean
# those weights stay at their initial values.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        'Warning: checkpoint does not fully match the model. '
        f'Missing keys: {load_result.missing_keys}; '
        f'unexpected keys: {load_result.unexpected_keys}'
    )

model.to(device)
model.eval()

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load and preprocess the test_result.csv file
test_df = pd.read_csv('test_result.csv')

def _normalize_unit(unit):
    """Map a raw unit token (abbreviation or full name) to a full unit name."""
    # Exact-case hit first so mixed-case keys such as 'kV' or 'W' resolve.
    if unit in unit_abbreviations:
        return unit_abbreviations[unit]

    # Otherwise compare case-insensitively, treating the Greek letter mu and
    # the micro sign as the same character.
    folded = unit.lower().replace('\u03bc', '\u00b5')
    for abbreviation, full_name in unit_abbreviations.items():
        if abbreviation.lower().replace('\u03bc', '\u00b5') == folded:
            return full_name

    # Not an abbreviation: assume it is already a (possibly capitalised) name.
    return folded


def extract_entity_value(text, entity_name):
    """Find the first "<number> <unit>" in ``text`` valid for ``entity_name``.

    Args:
        text: Text to search. Missing values (NaN/None) yield ``""``.
        entity_name: Key of ``entity_unit_map`` selecting the allowed units.

    Returns:
        ``"<float value> <full unit name>"`` for the first match whose unit is
        allowed for the entity, or ``""`` when there is none.

    Raises:
        KeyError: If ``entity_name`` is not a key of ``entity_unit_map``.
    """
    if pd.isna(text):
        return ""

    allowed_units = entity_unit_map[entity_name]
    # Number, optional whitespace, then one or two words. The optional second
    # word exists for units such as "fl oz" or "cubic foot". The first word may
    # contain a micro sign so that micro-prefixed abbreviations can match.
    pattern = r'(\d+(?:\.\d+)?)\s*([a-zA-Z\u00b5\u03bc]+(?:\s+[a-zA-Z]+)?)'
    matches = re.findall(pattern, str(text))

    for value, unit in matches:
        words = unit.split()
        # Try the whole capture first (two-word units), then only its first
        # word: the second word is matched greedily, so "10 cm wide" captures
        # "cm wide" and would otherwise never resolve to centimetre.
        candidates = [' '.join(words)]
        if len(words) > 1:
            candidates.append(words[0])

        for candidate in candidates:
            full_unit = _normalize_unit(candidate)
            if full_unit in allowed_units:
                return f"{float(value)} {full_unit}"

    return ""

def predict_entity_value(text, entity_name):
    """Return the predicted "<value> <unit>" string for one test row.

    The result comes entirely from ``extract_entity_value``. An earlier
    version also tokenized the text and ran the BERT classifier on every row,
    but the predicted class was never used in the returned value, so that
    forward pass only added run time and has been removed. The output is
    unchanged.

    Args:
        text: Extracted text of the row. Missing values yield ``""``.
        entity_name: Entity whose value should be extracted.

    Returns:
        ``"<float value> <full unit name>"`` or ``""`` when nothing matches.
    """
    if pd.isna(text):
        return ""

    return extract_entity_value(text, entity_name)

# Make predictions: walk the text and entity columns in lockstep and collect
# one prediction string per test row, with a progress bar over the rows.
text_entity_pairs = zip(test_df['extracted_text'], test_df['entity_name'])
predictions = [
    predict_entity_value(extracted_text, entity_name)
    for extracted_text, entity_name in tqdm(
        text_entity_pairs, total=len(test_df), desc='Making predictions'
    )
]

# Generate the final output CSV: the row identifier next to its prediction,
# written without the pandas row index.
output_df = test_df[['index']].assign(prediction=predictions)

output_df.to_csv('test_out.csv', index=False)
print("Results have been saved to test_out.csv")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('bert_model.pth'), strict=False)
Making predictions: 100%|██████████| 100000/100000 [18:29<00:00, 90.17it/s]


Results have been saved to test_out.csv
