In [7]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Tokenizer used to prepare the data for question-answering
pretrained_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(pretrained_checkpoint)

# Training table; the columns read further down are
# 'extracted_text', 'entity_name' and 'entity_value'
train_csv_path = 'train_ocr.csv'
df = pd.read_csv(train_csv_path)

def prepare_training_data(df):
    """Turn rows of the OCR table into extractive question-answering examples.

    Each row supplies a context (``extracted_text``), a question
    (``entity_name``) and an answer (``entity_value``). A row is kept only
    when the answer occurs in the context, compared case-insensitively.

    Args:
        df: DataFrame with the columns ``extracted_text``, ``entity_name``
            and ``entity_value``.

    Returns:
        A list of dicts with the keys ``context``, ``question``,
        ``answer_text``, ``answer_start`` and ``answer_end``. The two offsets
        are character positions in ``context`` (end exclusive).
    """
    qa_examples = []
    for _, row in df.iterrows():
        raw_context = row['extracted_text']
        raw_question = row['entity_name']
        raw_answer = row['entity_value']

        # Missing values must be tested BEFORE str(): str(NaN) is the literal
        # 'nan', which pd.isna() never flags and which can even be "found"
        # inside a context such as "banana".
        if pd.isna(raw_context) or pd.isna(raw_question) or pd.isna(raw_answer):
            continue

        context = str(raw_context)
        question = str(raw_question)
        answer_text = str(raw_answer)

        # An empty answer would trivially match at offset 0.
        if not answer_text.strip():
            continue

        # Search the ORIGINAL context case-insensitively so the offsets refer
        # to the string that is tokenized later (lower-casing a copy can
        # change its length for some Unicode characters).
        match = re.search(re.escape(answer_text), context, flags=re.IGNORECASE)
        if match is None:
            # Skip if answer not found in context
            continue

        qa_examples.append({
            'context': context,
            'question': question,
            'answer_text': answer_text,
            'answer_start': match.start(),
            'answer_end': match.end()
        })
    return qa_examples

# Build the QA examples once and report how many rows were kept
qa_examples = prepare_training_data(df)
n_qa_examples = len(qa_examples)

print(f"Total QA examples: {n_qa_examples}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Total QA examples: 313


In [4]:
# Create a custom dataset
class QADataset(Dataset):
    """Dataset that tokenizes QA examples and labels the answer span by token index.

    Args:
        examples: Sequence of dicts with the keys ``question``, ``context``,
            ``answer_start`` and ``answer_end`` (character offsets into
            ``context``, end exclusive).
        tokenizer: Callable tokenizer; it is invoked with
            ``return_offsets_mapping=True`` and ``return_tensors='pt'`` and
            must return ``input_ids``, ``token_type_ids`` and
            ``offset_mapping``.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode example ``idx`` and attach ``start_positions`` / ``end_positions``.

        Returns:
            Dict of tensors: the tokenizer outputs with the batch dimension
            squeezed away, plus the two scalar label tensors.
        """
        example = self.examples[idx]
        encoding = self.tokenizer(
            example['question'],
            example['context'],
            truncation='only_second',
            max_length=self.max_length,
            padding='max_length',
            return_offsets_mapping=True,
            return_tensors='pt'
        )
        # Plain Python lists keep the comparisons below cheap and unambiguous.
        offset_mapping = encoding.pop('offset_mapping')[0].tolist()
        segment_ids = encoding['token_type_ids'][0].tolist()

        # Start and end character positions of the answer in the context
        answer_start_char = example['answer_start']
        answer_end_char = example['answer_end']

        # Tokens whose segment id is 1 belong to the second sequence (context).
        context_token_indices = [i for i, seg in enumerate(segment_ids) if seg == 1]

        start_position = None
        end_position = None

        # Walk the context tokens to find the ones covering the answer span.
        for i in context_token_indices:
            start_char, end_char = offset_mapping[i]
            if start_char <= answer_start_char < end_char:
                start_position = i
            if start_char < answer_end_char <= end_char:
                end_position = i
                break

        if start_position is None or end_position is None:
            # Answer not locatable (e.g. truncated away): label the example
            # with the POSITION of the [CLS] token in the sequence. The
            # vocabulary id (`cls_token_id`) must not be used here -- it is a
            # token id, not an index into the sequence.
            cls_id = getattr(self.tokenizer, 'cls_token_id', None)
            token_ids = encoding['input_ids'][0].tolist()
            cls_index = token_ids.index(cls_id) if cls_id in token_ids else 0
            start_position = cls_index
            end_position = cls_index

        encoding.update({
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position)
        })

        return {k: v.squeeze() for k, v in encoding.items()}

# Create dataset and dataloaders
dataset = QADataset(qa_examples, tokenizer)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same examples land in train/validation on every
# Restart & Run All; otherwise the validation score is not comparable
# between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Training

In [5]:

# Start from the pretrained encoder; the QA head is trained below
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Run on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)

# Number of full passes over train_loader
epochs = 3

# Batch entries forwarded to the model; the two *_positions entries are the
# labels that make the model return a loss
batch_fields = (
    'input_ids',
    'attention_mask',
    'token_type_ids',
    'start_positions',
    'end_positions',
)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}')
    for batch in progress_bar:
        optimizer.zero_grad()

        # Move only the tensors the model consumes onto the training device
        model_inputs = {field: batch[field].to(device) for field in batch_fields}
        outputs = model(**model_inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")

print("Training complete!")

# Persist model and tokenizer side by side so they can be reloaded together
model.save_pretrained('bert_qa_model')
tokenizer.save_pretrained('bert_qa_model')
print("Model saved to 'bert_qa_model' directory")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 36/36 [00:29<00:00,  1.22it/s]


Average training loss: 2.2328


Epoch 2/3: 100%|██████████| 36/36 [00:27<00:00,  1.29it/s]


Average training loss: 0.8545


Epoch 3/3: 100%|██████████| 36/36 [00:29<00:00,  1.24it/s]


Average training loss: 0.7325
Training complete!
Model saved to 'bert_qa_model' directory


In [6]:
# Validation function
def evaluate(model, val_loader):
    """Compute exact-match accuracy of predicted answer spans.

    A prediction counts as correct only when both the arg-max start token and
    the arg-max end token equal the labelled positions.

    Args:
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.
        val_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``start_positions`` and
            ``end_positions`` tensors.

    Returns:
        The accuracy as a float in [0, 1]; 0 when the loader yields nothing.
        The value is also printed.
    """
    model.eval()
    # Use the device the model actually lives on instead of relying on a
    # module-level `device` variable that may be stale or undefined.
    eval_device = next(model.parameters()).device
    total = 0
    correct = 0

    for batch in tqdm(val_loader, desc='Evaluating'):
        input_ids = batch['input_ids'].to(eval_device)
        attention_mask = batch['attention_mask'].to(eval_device)
        token_type_ids = batch['token_type_ids'].to(eval_device)
        start_positions = batch['start_positions'].to(eval_device)
        end_positions = batch['end_positions'].to(eval_device)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

        start_preds = torch.argmax(outputs.start_logits, dim=1)
        end_preds = torch.argmax(outputs.end_logits, dim=1)

        # Vectorised comparison: one reduction per batch instead of one
        # tensor-to-bool conversion per example.
        exact_match = (start_preds == start_positions) & (end_preds == end_positions)
        correct += int(exact_match.sum().item())
        total += exact_match.numel()

    accuracy = correct / total if total > 0 else 0
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy

val_accuracy = evaluate(model, val_loader)

Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.26it/s]

Validation Accuracy: 0.6875





# Modified Testing Code

In [8]:
# Units accepted for each entity name. Entities that measure the same kind of
# quantity share a unit vocabulary; every entity still gets its own set object.
entity_unit_map = {
    **{
        length_entity: {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
        for length_entity in ('width', 'depth', 'height')
    },
    **{
        weight_entity: {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'}
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Unit assumed for an entity when the extracted text carries a number only
default_units = dict(
    width='centimetre',
    depth='centimetre',
    height='centimetre',
    item_weight='gram',
    maximum_weight_recommendation='gram',
    voltage='volt',
    wattage='watt',
    item_volume='millilitre',
)

# Maps a unit spelling or abbreviation to its canonical unit name.
# The lookup in this notebook is done on lower-cased text, so every
# abbreviation needs a lower-case key; the capitalised SI symbols are kept
# as well for callers that look up the symbol verbatim.
unit_abbreviations = {
    # Length units
    'cm': 'centimetre', 'centimetre': 'centimetre', 'centimetres': 'centimetre',
    'centimeter': 'centimetre', 'centimeters': 'centimetre',
    'm': 'metre', 'meter': 'metre', 'meters': 'metre', 'metre': 'metre', 'metres': 'metre',
    'mm': 'millimetre', 'millimeter': 'millimetre', 'millimeters': 'millimetre', 'millimetre': 'millimetre', 'millimetres': 'millimetre',
    'in': 'inch', 'inch': 'inch', 'inches': 'inch',
    'ft': 'foot', 'foot': 'foot', 'feet': 'foot',
    'yd': 'yard', 'yard': 'yard', 'yards': 'yard',

    # Weight units
    'g': 'gram', 'gram': 'gram', 'grams': 'gram',
    'kg': 'kilogram', 'kgs': 'kilogram', 'kilogram': 'kilogram', 'kilograms': 'kilogram',
    'mg': 'milligram', 'milligram': 'milligram', 'milligrams': 'milligram',
    'µg': 'microgram', 'microgram': 'microgram', 'micrograms': 'microgram',
    'oz': 'ounce', 'ounce': 'ounce', 'ounces': 'ounce',
    'lb': 'pound', 'lbs': 'pound', 'pound': 'pound', 'pounds': 'pound',
    't': 'ton', 'ton': 'ton', 'tons': 'ton',

    # Voltage units (symbol in both cases, see note above)
    'V': 'volt', 'v': 'volt', 'volt': 'volt', 'volts': 'volt',
    'kV': 'kilovolt', 'kv': 'kilovolt', 'kilovolt': 'kilovolt', 'kilovolts': 'kilovolt',
    'mV': 'millivolt', 'mv': 'millivolt', 'millivolt': 'millivolt', 'millivolts': 'millivolt',

    # Power units (symbol in both cases, see note above)
    'W': 'watt', 'w': 'watt', 'watt': 'watt', 'watts': 'watt',
    'kW': 'kilowatt', 'kw': 'kilowatt', 'kilowatt': 'kilowatt', 'kilowatts': 'kilowatt',

    # Volume units
    'ml': 'millilitre', 'milliliter': 'millilitre', 'milliliters': 'millilitre', 'millilitre': 'millilitre', 'millilitres': 'millilitre',
    'l': 'litre', 'liter': 'litre', 'liters': 'litre', 'litre': 'litre', 'litres': 'litre',
    'cl': 'centilitre', 'centiliter': 'centilitre', 'centiliters': 'centilitre', 'centilitre': 'centilitre', 'centilitres': 'centilitre',
    'dl': 'decilitre', 'deciliter': 'decilitre', 'deciliters': 'decilitre', 'decilitre': 'decilitre', 'decilitres': 'decilitre',
    'µl': 'microlitre', 'microliter': 'microlitre', 'microliters': 'microlitre', 'microlitre': 'microlitre', 'microlitres': 'microlitre',
    'fl oz': 'fluid ounce', 'fluid ounce': 'fluid ounce', 'fluid ounces': 'fluid ounce',
    'cup': 'cup', 'cups': 'cup',
    'pt': 'pint', 'pint': 'pint', 'pints': 'pint',
    'qt': 'quart', 'quart': 'quart', 'quarts': 'quart',
    'gal': 'gallon', 'gallon': 'gallon', 'gallons': 'gallon',
    'imperial gallon': 'imperial gallon', 'imperial gallons': 'imperial gallon',
    'cu ft': 'cubic foot', 'cubic foot': 'cubic foot', 'cubic feet': 'cubic foot',
    'cu in': 'cubic inch', 'cubic inch': 'cubic inch', 'cubic inches': 'cubic inch',
}

# Union of every unit accepted by at least one entity
allowed_units = set().union(*entity_unit_map.values())

In [9]:
# Load the trained model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert_qa_model')
model = BertForQuestionAnswering.from_pretrained('bert_qa_model')

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Load the test data
test_df = pd.read_csv('test_ocr.csv')

# Case-insensitive view of the abbreviation table: units are lower-cased
# before lookup, so capitalised keys such as 'V' or 'kW' would otherwise
# never be hit.
_unit_lookup = {abbr.lower(): full for abbr, full in unit_abbreviations.items()}


def _resolve_unit(raw_unit, allowed):
    """Return the canonical unit for ``raw_unit`` if it is in ``allowed``, else None.

    The regex below may capture up to two words. The full capture is tried
    first (for units like "fl oz" or "cubic feet"); if that fails, the first
    word alone is tried so that "inch wide" still resolves to "inch".
    """
    words = raw_unit.lower().split()
    candidates = [' '.join(words)]
    if len(words) > 1:
        candidates.append(words[0])
    for candidate in candidates:
        full_unit = _unit_lookup.get(candidate, candidate)
        if full_unit in allowed:
            return full_unit
    return None


def extract_number_and_unit(answer_text, entity_name):
    """Normalise a predicted answer to "<float> <unit>".

    Args:
        answer_text: Text span predicted for the entity.
        entity_name: Entity being asked about; selects the allowed units and
            the default unit.

    Returns:
        "<float> <canonical unit>" for the first number followed by an
        allowed unit; "<float> <default unit>" (or just "<float>" when the
        entity has no default) for a number with no unit; "<float> <default
        unit>" built from the first number when only disallowed units were
        seen; "" when no number is found.
    """
    # Regex pattern to match numbers and units
    pattern = r'([-+]?\d*\.\d+|\d+)\s*([a-zA-Zµ]+(?:\s+[a-zA-Z]+)?)?'
    matches = re.findall(pattern, answer_text)

    allowed_units_for_entity = entity_unit_map.get(entity_name, allowed_units)
    default_unit = default_units.get(entity_name, '')

    for value, unit in matches:
        if unit:
            full_unit = _resolve_unit(unit, allowed_units_for_entity)
            if full_unit is not None:
                return f"{float(value)} {full_unit}"
        else:
            # If no unit found, assign default unit
            if default_unit:
                return f"{float(value)} {default_unit}"
            return f"{float(value)}"

    # Numbers were found but none carried an allowed unit
    if default_unit and matches:
        return f"{float(matches[0][0])} {default_unit}"

    # If no valid number and unit found, return empty string
    return ""


predictions = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Predicting'):
    raw_context = row['extracted_text']
    raw_question = row['entity_name']

    # Check for missing values before str(): str(NaN) is 'nan', which
    # pd.isna() does not recognise.
    if pd.isna(raw_context) or pd.isna(raw_question):
        predictions.append('')
        continue

    context = str(raw_context)
    question = str(raw_question)

    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors='pt',
        truncation='only_second',
        max_length=512,
        padding='max_length',
        return_offsets_mapping=True
    )
    # Character offsets of every token; not a model input, so remove it.
    offset_mapping = inputs.pop('offset_mapping')[0].tolist()
    segment_ids = inputs['token_type_ids'][0].tolist()

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

    # Get the most probable start and end positions
    start_index = torch.argmax(outputs.start_logits, dim=1).item()
    end_index = torch.argmax(outputs.end_logits, dim=1).item()

    # Ensure start_index <= end_index
    if start_index > end_index:
        predictions.append('')
        continue

    # A span that touches [CLS], the question or padding (segment id != 1)
    # contains no answer from the context.
    if segment_ids[start_index] != 1 or segment_ids[end_index] != 1:
        predictions.append('')
        continue

    # Slice the ORIGINAL context by character offsets rather than decoding
    # token ids: decoding word pieces can insert spaces (e.g. "12. 5") and
    # lower-cases the text, which corrupts numbers and units.
    start_char = offset_mapping[start_index][0]
    end_char = offset_mapping[end_index][1]
    answer = context[start_char:end_char].strip()

    extracted_value = extract_number_and_unit(answer, question)
    predictions.append(extracted_value)

# Generate the final output CSV
output_df = pd.DataFrame({
    'index': test_df['index'],
    'prediction': predictions
})

output_df.to_csv('test_out.csv', index=False)
print("Results have been saved to 'test_out.csv'")

Predicting: 100%|██████████| 100/100 [00:03<00:00, 27.46it/s]

Results have been saved to 'test_out.csv'



