In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
from src.constants import entity_unit_map
import ast

# Define constants
BATCH_SIZE = 32
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
TRAIN_IMAGES_DIR = 'train_images'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom dataset
class ProductImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        self.entity_names = sorted(self.data['entity_name'].unique())
        self.entity_name_to_idx = {name: idx for idx, name in enumerate(self.entity_names)}
        self.entity_units = {entity: sorted(units) for entity, units in entity_unit_map.items()}
        self.max_units = max(len(units) for units in self.entity_units.values())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.basename(self.data.iloc[idx]['image_link'])
        img_path = os.path.join(self.img_dir, img_name)
        img = Image.open(img_path).convert('RGB')
        
        if self.transform:
            img = self.transform(img)
        
        entity_name = self.data.iloc[idx]['entity_name']
        entity_value, entity_unit = self.parse_entity_value(self.data.iloc[idx]['entity_value'])
        
        entity_name_tensor = torch.zeros(len(self.entity_names), dtype=torch.float32)
        entity_name_tensor[self.entity_name_to_idx[entity_name]] = 1
        
        entity_unit_tensor = torch.zeros(self.max_units, dtype=torch.float32)
        if entity_name in self.entity_units and entity_unit in self.entity_units[entity_name]:
            unit_idx = self.entity_units[entity_name].index(entity_unit)
            entity_unit_tensor[unit_idx] = 1
        
        return img, entity_name_tensor, torch.tensor(entity_value, dtype=torch.float32), entity_unit_tensor, entity_name

    def parse_entity_value(self, value_str):
        try:
            value, unit = value_str.rsplit(' ', 1)
            if unit == 'fluid':
                value, unit = value_str.rsplit(' ', 2)[0], ' '.join(value_str.rsplit(' ', 2)[1:])
            return float(value), unit
        except ValueError:
            try:
                value_list = ast.literal_eval(value_str)
                if isinstance(value_list, list) and len(value_list) == 2:
                    return sum(value_list) / 2, 'unknown'
            except:
                pass
        
        return 0.0, 'unknown'

# Data transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load data
train_dataset = ProductImageDataset('dataset/train.csv', TRAIN_IMAGES_DIR, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Custom model
class ProductImageModel(nn.Module):
    def __init__(self, num_entity_names, max_units):
        super(ProductImageModel, self).__init__()
        self.resnet = models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V1)
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()
        
        self.fc_entity_name = nn.Linear(num_ftrs, num_entity_names).float()
        self.fc_entity_value = nn.Linear(num_ftrs, 1).float()
        self.fc_entity_units = nn.Linear(num_ftrs, max_units).float()

    def forward(self, x):
        features = self.resnet(x)
        entity_name_out = self.fc_entity_name(features)
        entity_value_out = self.fc_entity_value(features).squeeze(1)
        entity_units_out = self.fc_entity_units(features)
        return entity_name_out, entity_value_out, entity_units_out

# Initialize model
model = ProductImageModel(len(train_dataset.entity_names), train_dataset.max_units)
model.to(DEVICE)

# Loss and optimizer
criterion_classification = nn.BCEWithLogitsLoss()
criterion_regression = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}')
    
    for images, entity_names, entity_values, entity_units, entity_name_str in progress_bar:
        images = images.to(DEVICE).float()
        entity_names = entity_names.to(DEVICE).float()
        entity_values = entity_values.to(DEVICE).float()
        entity_units = entity_units.to(DEVICE).float()
        
        # Forward pass
        entity_name_out, entity_value_out, entity_units_out = model(images)
        
        # Compute losses
        loss_entity_name = criterion_classification(entity_name_out, entity_names)
        loss_entity_value = criterion_regression(entity_value_out, entity_values)
        loss_entity_units = criterion_classification(entity_units_out, entity_units)
        
        loss = loss_entity_name + loss_entity_value + loss_entity_units
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}')
    
    # Update learning rate
    scheduler.step(avg_loss)

# Save the model
torch.save({
    'epoch': NUM_EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': avg_loss,
}, 'product_image_model.pth')

print("Training completed. Model saved.")

# Evaluation and prediction function
def predict(model, image):
    model.eval()
    with torch.no_grad():
        entity_name_out, entity_value_out, entity_units_out = model(image.unsqueeze(0).to(DEVICE).float())
        
        predicted_entity = train_dataset.entity_names[torch.argmax(entity_name_out).item()]
        predicted_value = entity_value_out.item()
        predicted_unit_idx = torch.argmax(entity_units_out).item()
        predicted_unit = train_dataset.entity_units.get(predicted_entity, ['unknown'])[predicted_unit_idx] if predicted_unit_idx < len(train_dataset.entity_units.get(predicted_entity, [])) else 'unknown'
        
        return predicted_entity, predicted_value, predicted_unit

# Evaluation
model.eval()
correct_predictions = 0
total_samples = 0

predictions = []
with torch.no_grad():
    for images, _, _, _, entity_name_str in tqdm(train_loader, desc="Evaluating"):
        images = images.to(DEVICE).float()
        
        for i, image in enumerate(images):
            predicted_entity, predicted_value, predicted_unit = predict(model, image)
            formatted_prediction = f"{predicted_value:.2f} {predicted_unit}"
            predictions.append(formatted_prediction)
            
            if predicted_entity == entity_name_str[i]:
                correct_predictions += 1
            total_samples += 1

accuracy = correct_predictions / total_samples
print(f"Evaluation Results:")
print(f"Classification Accuracy: {accuracy:.4f}")

# Save predictions
test_df = pd.read_csv('dataset/test.csv')
test_df['prediction'] = predictions[:len(test_df)]  # Ensure we have the correct number of predictions
test_df[['index', 'prediction']].to_csv('test_out.csv', index=False)

print("Predictions saved to test_out.csv")

---

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
import pandas as pd
import os
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import ast

# Allow loading truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Define constants
BATCH_SIZE = 128
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
TRAIN_IMAGES_DIR = 'train_images'
TEST_IMAGES_DIR = 'test_images'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ACCUMULATION_STEPS = 2
CHECKPOINT_PATH = 'product_image_model_checkpoint.pth'
MODEL_PATH = 'product_image_model_final.pth'
TRAIN_CSV_PATH = 'dataset/train.csv'
TEST_CSV_PATH = 'dataset/test.csv'
OUTPUT_CSV_PATH = 'dataset/test_out.csv'

# Load entity_unit_map
with open('src/constants.py', 'r') as f:
    exec(f.read())

# Preprocessing function
def preprocess_data(csv_file):
    data = pd.read_csv(csv_file)
    original_count = len(data)
    
    def is_valid_sample(row):
        try:
            value_str = row['entity_value']
            entity_name = row['entity_name']
            value, unit = value_str.rsplit(' ', 1)
            if unit == 'fluid':
                value, unit = value_str.rsplit(' ', 2)[0], ' '.join(value_str.rsplit(' ', 2)[1:])
            float(value)  # Check if value can be converted to float
            return unit in entity_unit_map.get(entity_name, [])
        except:
            return False

    filtered_data = data[data.apply(is_valid_sample, axis=1)]
    filtered_count = len(filtered_data)
    
    print(f"Original sample count: {original_count}")
    print(f"Filtered sample count: {filtered_count}")
    print(f"Removed {original_count - filtered_count} samples ({(original_count - filtered_count) / original_count:.2%})")
    
    return filtered_data

# Custom dataset
class ProductImageDataset(Dataset):
    def __init__(self, data, img_dir, transform=None):
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.entity_names = sorted(self.data['entity_name'].unique())
        self.entity_name_to_idx = {name: idx for idx, name in enumerate(self.entity_names)}
        self.entity_units = {entity: sorted(units) for entity, units in entity_unit_map.items()}
        self.max_units = max(len(units) for units in self.entity_units.values())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.basename(self.data.iloc[idx]['image_link'])
        img_path = os.path.join(self.img_dir, img_name)
        try:
            img = Image.open(img_path).convert('RGB')
        except Exception as e:
            print(f"Error loading image {img_path}: {str(e)}")
            img = Image.new('RGB', (224, 224), color='black')
        
        if self.transform:
            img = self.transform(img)
        
        entity_name = self.data.iloc[idx]['entity_name']
        
        # Check if 'entity_value' column exists in the DataFrame
        if 'entity_value' in self.data.columns:
            entity_value, entity_unit = self.parse_entity_value(self.data.iloc[idx]['entity_value'])
        else:
            # If 'entity_value' doesn't exist, use placeholder values
            entity_value, entity_unit = 0.0, 'unknown'
        
        entity_name_tensor = torch.zeros(len(self.entity_names), dtype=torch.float32)
        entity_name_tensor[self.entity_name_to_idx[entity_name]] = 1
        
        entity_unit_tensor = torch.zeros(self.max_units, dtype=torch.float32)
        if entity_unit in self.entity_units.get(entity_name, []):
            unit_idx = self.entity_units[entity_name].index(entity_unit)
            entity_unit_tensor[unit_idx] = 1
        
        return img, entity_name_tensor, torch.tensor(entity_value, dtype=torch.float32), entity_unit_tensor, entity_name

    def parse_entity_value(self, value_str):
        value, unit = value_str.rsplit(' ', 1)
        if unit == 'fluid':
            value, unit = value_str.rsplit(' ', 2)[0], ' '.join(value_str.rsplit(' ', 2)[1:])
        return float(value), unit

# Custom model
class ProductImageModel(nn.Module):
    def __init__(self, num_entity_names, max_units):
        super(ProductImageModel, self).__init__()
        self.resnet = models.resnet50(pretrained=True)
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()
        
        self.fc_entity_name = nn.Linear(num_ftrs, num_entity_names).float()
        self.fc_entity_value = nn.Linear(num_ftrs, 1).float()
        self.fc_entity_units = nn.Linear(num_ftrs, max_units).float()

    def forward(self, x):
        features = self.resnet(x)
        entity_name_out = self.fc_entity_name(features)
        entity_value_out = self.fc_entity_value(features).squeeze(1)
        entity_units_out = self.fc_entity_units(features)
        return entity_name_out, entity_value_out, entity_units_out

# Training function
def train_model():
    # Data transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Preprocess and load data
    filtered_data = preprocess_data(TRAIN_CSV_PATH)
    train_dataset = ProductImageDataset(filtered_data, TRAIN_IMAGES_DIR, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

    # Initialize model
    model = ProductImageModel(len(train_dataset.entity_names), train_dataset.max_units)
    model.to(DEVICE)

    # Loss and optimizer
    criterion_classification = nn.BCEWithLogitsLoss()
    criterion_regression = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

    # Initialize the GradScaler for mixed precision training
    scaler = GradScaler()

    # Load checkpoint if it exists
    start_epoch = 0
    if os.path.exists(CHECKPOINT_PATH):
        checkpoint = torch.load(CHECKPOINT_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        scaler.load_state_dict(checkpoint['scaler_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"Resuming training from epoch {start_epoch}")
    else:
        print("Starting training from scratch")

    # Training loop
    for epoch in range(start_epoch, NUM_EPOCHS):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}')
        
        for i, (images, entity_names, entity_values, entity_units, entity_name_str) in enumerate(progress_bar):
            images = images.to(DEVICE)
            entity_names = entity_names.to(DEVICE)
            entity_values = entity_values.to(DEVICE)
            entity_units = entity_units.to(DEVICE)
            
            # Mixed precision training
            with autocast():
                # Forward pass
                entity_name_out, entity_value_out, entity_units_out = model(images)
                
                # Compute losses
                loss_entity_name = criterion_classification(entity_name_out, entity_names)
                loss_entity_value = criterion_regression(entity_value_out, entity_values)
                loss_entity_units = criterion_classification(entity_units_out, entity_units)
                
                loss = loss_entity_name + loss_entity_value + loss_entity_units
                loss = loss / ACCUMULATION_STEPS  # Normalize the loss

            # Backward and optimize
            scaler.scale(loss).backward()
            
            if (i + 1) % ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            total_loss += loss.item() * ACCUMULATION_STEPS
            progress_bar.set_postfix({'loss': loss.item() * ACCUMULATION_STEPS})
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}')
        
        # Update learning rate
        scheduler.step(avg_loss)

        # Save checkpoint
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'loss': avg_loss,
        }, CHECKPOINT_PATH)
        print(f"Checkpoint saved at epoch {epoch+1}")

    # Save the final model
    torch.save({
        'epoch': NUM_EPOCHS,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_loss,  # This line is now safe as avg_loss is defined in the loop
    }, MODEL_PATH)

    print("Training completed. Final model saved.")

# Prediction function
@torch.no_grad()
def predict(model, image, train_dataset):
    model.eval()
    with autocast():
        entity_name_out, entity_value_out, entity_units_out = model(image.unsqueeze(0).to(DEVICE))
        
        predicted_entity = train_dataset.entity_names[torch.argmax(entity_name_out).item()]
        predicted_value = entity_value_out.item()
        predicted_unit_idx = torch.argmax(entity_units_out).item()
        predicted_unit = train_dataset.entity_units[predicted_entity][predicted_unit_idx] if predicted_unit_idx < len(train_dataset.entity_units[predicted_entity]) else 'unknown'
        
        return predicted_entity, predicted_value, predicted_unit

# Predictor function for the sample code
def predictor(image_link, category_id, entity_name, model, train_dataset, transform):
    try:
        img = Image.open(image_link).convert('RGB')
        img_tensor = transform(img).unsqueeze(0).to(DEVICE)

        predicted_entity, predicted_value, predicted_unit = predict(model, img_tensor, train_dataset)
        formatted_prediction = f"{predicted_value:.2f} {predicted_unit}"
        return formatted_prediction
    except Exception as e:
        print(f"Error processing image {image_link}: {str(e)}")
        return ""

# Main function
def main():
    # Train the model
    # train_model()

    # Load the trained model for prediction
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = ProductImageDataset(pd.read_csv(TRAIN_CSV_PATH), TRAIN_IMAGES_DIR, transform=transform)
    test_dataset = ProductImageDataset(pd.read_csv(TEST_CSV_PATH), TEST_IMAGES_DIR, transform=transform)
    
    model = ProductImageModel(len(train_dataset.entity_names), train_dataset.max_units)
    checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(DEVICE)
    model.eval()

    # Make predictions on the test set
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    predictions = []

    with torch.no_grad():
        for images, _, _, _, _ in tqdm(test_loader, desc="Predicting"):
            images = images.to(DEVICE)
            
            for image in images:
                predicted_entity, predicted_value, predicted_unit = predict(model, image, train_dataset)
                formatted_prediction = f"{predicted_value:.2f} {predicted_unit}"
                predictions.append(formatted_prediction)
                print(f"Predicted: {formatted_prediction}")  # Print each prediction

    # Save predictions
    test_df = pd.read_csv(TEST_CSV_PATH)
    test_df['prediction'] = predictions
    test_df[['index', 'prediction']].to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"Predictions saved to {OUTPUT_CSV_PATH}")

if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
import random
import os

# Load the TrOCR model and processor
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-stage1')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-stage1')

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using TrOCR
def extract_text(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    
    return generated_text

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=5):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using TrOCR
        extracted_text = extract_text(image_path)
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    process_random_samples()

# Reference: https://huggingface.co/microsoft/trocr-base-stage1

In [None]:
import pytesseract
from PIL import Image
import pandas as pd
import random
import os
import logging

# Configure logging
logging.basicConfig(filename='ocr_errors.log', level=logging.ERROR)

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using Tesseract OCR
def extract_text(image_path):
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        logging.error(f"Error processing image {image_path}: {str(e)}")
        return None

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=5):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        extracted_text = extract_text(image_path)
        
        if extracted_text is not None:
            print(f"Image: {row['image_link']}")
            print(f"Entity Name: {row['entity_name']}")
            print(f"Actual Label: {row['entity_value']}")
            print(f"Extracted Text: {extracted_text}")
            print("-" * 50)
        else:
            print(f"Failed to process image: {row['image_link']}")

# Run the main function
if __name__ == "__main__":
    try:
        process_random_samples()
    except pytesseract.TesseractNotFoundError:
        print("Tesseract is not installed or not in your PATH.")
        print("Please install Tesseract using your package manager.")
        print("For Ubuntu/Debian: sudo apt-get install tesseract-ocr")
        print("For CentOS/RHEL: sudo yum install tesseract")
        print("For more information, visit: https://github.com/tesseract-ocr/tessdoc/blob/main/Installation.md")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        logging.error(f"Unexpected error in main execution: {str(e)}")

In [None]:
import easyocr
import pandas as pd
import random
import os

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])  # Adjust languages as needed

# Function to extract text from an image using EasyOCR
def extract_text(image_path):
    result = reader.readtext(image_path)
    return ' '.join([text for _, text, _ in result])

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=5):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using EasyOCR
        extracted_text = extract_text(image_path)
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    process_random_samples()

In [None]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
import random
import os

# Load the Donut model and processor
processor = DonutProcessor.from_pretrained("jinhybr/OCR-Donut-CORD")
model = VisionEncoderDecoderModel.from_pretrained("jinhybr/OCR-Donut-CORD")

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using Donut
def extract_text(image_path, entity_name):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values

    task_prompt = "<s_cord-v2>"
    # task_prompt = f"<s_docvqa><s_question>Extract all {entity_name} measurement quantities with units and numerical quantity of the unit, strictly numeircal measurement quantities with units only</s_question><s_answer>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = processor.token2json(sequence)

    return sequence

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=15):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using Donut
        extracted_text = extract_text(image_path, row['entity_name'])
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    process_random_samples()

In [None]:
import re
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
import random
import os

# Load the Donut model and processor
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", use_fast=False)
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using Donut
def extract_text(image_path, entity_name):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # Prepare decoder inputs with a suitable prompt for our task
    task_prompt = "<s_rvlcdip>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    
    return sequence

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=15):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using Donut
        extracted_text = extract_text(image_path, row['entity_name'])
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    process_random_samples()

In [None]:
classes = """entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}"""

In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import pandas as pd
import random
import os
from constants import entity_unit_map  # Import entity_unit_map from constants.py

# Load the LLaVA model and processor
model_name = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_name)
model = LlavaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

# Add <image> token to tokenizer and resize model embeddings
processor.tokenizer.add_tokens(["<image>"], special_tokens=True)
model.resize_token_embeddings(len(processor.tokenizer))

# Set up multi-GPU processing if available
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using LLaVA
def extract_text(image_path, entity_name):
    image = Image.open(image_path).convert("RGB")
    
    # Get the specific units for the given entity_name
    entity_units = entity_unit_map.get(entity_name, [])
    units_str = ", ".join(entity_units) if entity_units else "unknown"
    
    prompt = f"""USER: <image>
Extract numerical quantities and their corresponding unit belonging to the class `{entity_name}` from the image. 

Output a JSON in the following format:
```
{{
    [
        {{
            "value": `Double Float`,
            "unit": `String` (one of: {units_str})
        }},
        <Repeat for other quantities>
    ]
}}
ASSISTANT:"""
    
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

    with torch.cuda.amp.autocast():  # Use mixed precision
        if isinstance(model, torch.nn.DataParallel):
            outputs = model.module.generate(**inputs, max_new_tokens=300)
        else:
            outputs = model.generate(**inputs, max_new_tokens=300)
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    
    return generated_text.strip()

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=20):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using LLaVA
        extracted_text = extract_text(image_path, row['entity_name'])
        # Extract only the ASSISTANT message from the extracted text
        assistant_message = extracted_text.split("ASSISTANT:")[-1].strip()
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {assistant_message}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    torch.cuda.empty_cache()  # Clear CUDA cache before running
    process_random_samples()

In [None]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
import random
import os

# Load the Donut model and processor
# processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
# model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
# processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
# model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the train.csv file
train_df = pd.read_csv('dataset/train.csv')

# Set the path to your train images directory
TRAIN_IMAGES_DIR = 'train_images'

# Function to extract text from an image using Donut
def extract_text(image_path, entity_name):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # task_prompt = "<s_rvlcdip>"
    # task_prompt = "<s_cord-v2>"
    task_prompt = f"<s_docvqa><s_question> Extract measurement quantities with units and numerical quantity of the unit corresponding to {entity_name} </s_question><s_answer>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )
    print(processor.batch_decode(outputs.sequences))
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = processor.token2json(sequence)
    
    return sequence

# Function to get random samples from the train set
def get_random_samples(n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(num_samples=15):
    samples = get_random_samples(num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(TRAIN_IMAGES_DIR, os.path.basename(row['image_link']))
        
        # Extract text using Donut
        extracted_text = extract_text(image_path, row['entity_name'])
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    process_random_samples()

In [None]:
from transformers import MgpstrProcessor, MgpstrForSceneTextRecognition
import requests
from PIL import Image

# Load the MGP-STR processor and model
processor = MgpstrProcessor.from_pretrained('alibaba-damo/mgp-str-base')
model = MgpstrForSceneTextRecognition.from_pretrained('alibaba-damo/mgp-str-base')

# Function to extract text from an image using MGP-STR
def extract_text(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Perform inference
    outputs = model(pixel_values)

    # Decode the output
    result = processor.batch_decode(outputs.logits)
    return result['generated_text']

# Function to get random samples from the train set
def get_random_samples(train_df, n=5):
    return train_df.sample(n)

# Main function to process random samples
def process_random_samples(train_df, train_images_dir, num_samples=5):
    samples = get_random_samples(train_df, num_samples)
    
    for _, row in samples.iterrows():
        image_path = os.path.join(train_images_dir, os.path.basename(row['image_link']))
        
        # Extract text using MGP-STR
        extracted_text = extract_text(image_path)
        
        print(f"Image: {row['image_link']}")
        print(f"Entity Name: {row['entity_name']}")
        print(f"Actual Label: {row['entity_value']}")
        print(f"Extracted Text: {extracted_text}")
        print("-" * 50)

# Run the main function
if __name__ == "__main__":
    import pandas as pd
    import os

    # Load the train.csv file
    train_df = pd.read_csv('dataset/train.csv')

    # Set the path to your train images directory
    TRAIN_IMAGES_DIR = 'train_images'

    process_random_samples(train_df, TRAIN_IMAGES_DIR)

In [None]:

import torch
from transformers import pipeline
from PIL import Image
import pandas as pd
import os
import hjson
from tqdm import tqdm
from constants import entity_unit_map
from sanity import sanity_check
from src.utils import download_images
os.environ["TOKENIZERS_PARALLELISM"] = True
# Try to import bitsandbytes and set up quantization
try:
    import bitsandbytes
    from transformers import BitsAndBytesConfig
    
    # Set up quantization configuration
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    # Set up the pipeline with quantization
    model_id = "llava-hf/llava-1.5-7b-hf"
    pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
    print("Using 4-bit quantization.")
except ImportError:
    print("bitsandbytes is not installed. Falling back to default configuration.")
    
    # Set up the pipeline without quantization
    model_id = "llava-hf/llava-1.5-7b-hf"
    pipe = pipeline("image-to-text", model=model_id)
    print("Using default configuration without quantization.")

# Load the test.csv file
test_df = pd.read_csv('dataset/test.csv')

# Set the path to your test images directory
TEST_IMAGES_DIR = 'test_images'

# Download test images if not already downloaded
download_images(test_df, TEST_IMAGES_DIR)

def extract_text(image_path, entity_name):
    image = Image.open(image_path).convert("RGB")
    
    entity_units = entity_unit_map.get(entity_name, [])
    units_str = ", ".join(entity_units) if entity_units else "unknown"
    
    prompt = f"""USER: <image>
Extract numerical quantities and their corresponding unit belonging to the class `{entity_name}` from the image. 

Output a JSON in the following format:
```
{{
    "predictions": [
        {{
            "value": <Double Float>,
            "unit": <String> (one of: {units_str})
        }},
        <Repeat for other quantities> (max 3)
    ]
}}
```
If no relevant information is found, return an empty list.
ASSISTANT:"""

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 1024})
    return outputs[0]['generated_text']

def process_extracted_text(text):
    try:
        # Extract the JSON part from the response
        json_str = text.split("```")[1].strip()
        data = hjson.loads(json_str)
        predictions = data.get("predictions", [])
        
        if predictions:
            # Perform max voting
            value_unit_pairs = [(pred['value'], pred['unit']) for pred in predictions]
            if value_unit_pairs:
                most_common = max(set(value_unit_pairs), key=value_unit_pairs.count)
                return f"{most_common[0]:.2f} {most_common[1]}"
            else:
                # If no pairs found, return the first entry
                pred = predictions[0]
                return f"{pred['value']:.2f} {pred['unit']}"
        else:
            return ""
    except:
        return ""

def process_test_set():
    predictions = []
    
    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        image_path = os.path.join(TEST_IMAGES_DIR, os.path.basename(row['image_link']))
        
        extracted_text = extract_text(image_path, row['entity_name'])
        prediction = process_extracted_text(extracted_text)
        predictions.append(prediction)
        print(f"Predicted: {prediction}")
        
    test_df['prediction'] = predictions
    output_df = test_df[['index', 'prediction']]
    output_df.to_csv('test_out.csv', index=False)
    print("Predictions saved to test_out.csv")

    # Run sanity check
    sanity_check('dataset/test.csv', 'test_out.csv')

if __name__ == "__main__":
    torch.cuda.empty_cache()
    process_test_set()


In [1]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import pandas as pd
import os
import json
from tqdm import tqdm
from constants import entity_unit_map
from sanity import sanity_check
from src.utils import download_images
from torch.utils.data import Dataset, DataLoader
os.environ["TOKENIZERS_PARALLELISM"] = 'true'
# Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the LLaVA model and processor
model_name = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_name)
model = LlavaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

processor.tokenizer.add_tokens(["<image>"], special_tokens=True)
model.resize_token_embeddings(len(processor.tokenizer))

# Custom dataset
class ImageDataset(Dataset):
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, os.path.basename(row['image_link']))
        image = Image.open(image_path).convert("RGB")
        entity_name = row['entity_name']
        
        entity_units = entity_unit_map.get(entity_name, [])
        units_str = ", ".join(entity_units) if entity_units else "unknown"
        
        prompt = f"""USER: <image>
Extract numerical quantities and their corresponding unit belonging to the class `{entity_name}` from the image. 

Output a JSON in the following format:
```
{{
    "predictions": [
        {{
            "value": <Double Float>,
            "unit": <String> (one of: {units_str})
        }},
        <Repeat for other quantities> (max 3)
    ]
}}
```
If no relevant information is found, return an empty list.
ASSISTANT:"""
        
        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        return {
            'pixel_values': inputs.pixel_values.squeeze(),
            'input_ids': inputs.input_ids.squeeze(),
            'attention_mask': inputs.attention_mask.squeeze(),
            'image_path': image_path,
            'entity_name': entity_name
        }

# Batch processing function
def process_batch(batch, model, processor):
    pixel_values = batch['pixel_values'].to(device)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.cuda.amp.autocast():
        outputs = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=4096
        )
    
    generated_texts = processor.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

def process_extracted_text(text):
    try:
        json_start = text.find('ASSISTANT:') + len('ASSISTANT:')
        json_text = text[json_start:].strip()
        
        import hjson
        data = hjson.loads(json_text)
        predictions = data.get("predictions", [])
        
        if predictions:
            value_unit_pairs = [(pred['value'], pred['unit']) for pred in predictions]
            if value_unit_pairs:
                most_common = max(set(value_unit_pairs), key=value_unit_pairs.count)
                return f"{most_common[0]:.2f} {most_common[1]}"
            else:
                pred = predictions[0]
                return f"{pred['value']:.2f} {pred['unit']}"
        else:
            return ""
    except hjson.HjsonDecodeError:
        print(f"Failed to parse Hjson: {json_text}")
        return ""
    except Exception as e:
        print(f"Error processing text: {e}")
        return ""

def process_test_set():
    # Load the test.csv file
    test_df = pd.read_csv('dataset/test.csv')
    
    # Set the path to your test images directory
    TEST_IMAGES_DIR = 'test_images'
    
    # Download test images if not already downloaded
    download_images(test_df, TEST_IMAGES_DIR)
    
    # Create dataset and dataloader
    dataset = ImageDataset(test_df, TEST_IMAGES_DIR, processor)
    dataloader = DataLoader(dataset, batch_size=4, num_workers=4, pin_memory=True)
    
    predictions = []
    
    for batch in tqdm(dataloader, total=len(dataloader)):
        generated_texts = process_batch(batch, model, processor)
        
        for text in generated_texts:
            prediction = process_extracted_text(text)
            predictions.append(prediction)
            print(f"Predicted: {prediction}")
    
    test_df['prediction'] = predictions
    output_df = test_df[['index', 'prediction']]
    output_df.to_csv('test_out.csv', index=False)
    print("Predictions saved to test_out.csv")

    # Run sanity check
    sanity_check('dataset/test.csv', 'test_out.csv')

if __name__ == "__main__":
    torch.cuda.empty_cache()
    process_test_set()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.40it/s]
  0%|          | 4/131187 [00:09<82:12:53,  2.26s/it] 
  with torch.cuda.amp.autocast():
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
  0%|          | 1/32797 [00:08<81:54:09,  8.99s/it]

Predicted: 2000.00 metre
Predicted: 5.56 centimetre
Predicted: 1.50 yard
Predicted: 5.56 centimetre


  0%|          | 2/32797 [00:16<75:13:44,  8.26s/it]

Predicted: 13.13 metre
Predicted: 13.13 metre
Predicted: 13.13 metre
Predicted: 1250.00 foot





RuntimeError: Caught RuntimeError in DataLoader worker process 2.
Original Traceback (most recent call last):
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 317, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 155, in collate
    clone.update({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 155, in <dictcomp>
    clone.update({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 142, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.conda/envs/ml/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 214, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: stack expects each tensor to be equal size, but got [133] at entry 0 and [137] at entry 2
