In [None]:
import torch
import numpy as np
import cv2
from torchvision import transforms
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [22]:

import torch

# Smoke test: create a small random tensor on the CPU, move it to the
# current CUDA device and show it. Raises if no GPU is usable.
x = torch.rand(2, 2).to("cuda")
print(x)


tensor([[0.4754, 0.0326],
        [0.6021, 0.3396]], device='cuda:0')


In [10]:
# Training split of the COCO-Text dataset from the Hugging Face Hub
# (downloaded on first use, then served from the local cache).
dsa = load_dataset(path="howard-hou/COCO-Text", split="train")

In [27]:
class EASTDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns OCR-annotated samples into EAST training items.

    Every sample of the wrapped ``dataset`` must provide the keys ``'image'``,
    ``'ocr_info'``, ``'image_width'`` and ``'image_height'``. Annotations in
    ``'ocr_info'`` that carry a ``'bounding_box'`` entry are used; all others
    are skipped.

    Each item is a tuple ``(image_tensor, text_mask, bboxes)``:

    * ``image_tensor`` -- the resized image after ``transforms.ToTensor()``
      (channels first, RGB order), optionally passed through ``transform``.
    * ``text_mask`` -- ``np.float32`` array of shape ``target_size`` that is 1
      inside every (rescaled) text box and 0 elsewhere.
    * ``bboxes`` -- ``float32`` tensor of shape ``(N, 4)`` holding the rescaled
      ``[x, y, w, h]`` boxes; ``(0, 4)`` when the sample has no usable box.
      ``N`` varies per sample, so batching needs a ``collate_fn`` that pads or
      lists the boxes.
    """

    def __init__(self, dataset, transform=None, target_size=(512, 512)):
        """
        Args:
            dataset (Dataset): The dataset containing the images and annotations.
            transform (callable, optional): Optional transform applied to the
                image tensor.
            target_size (tuple[int, int], optional): ``(height, width)`` every
                image and mask is resized to. Defaults to ``(512, 512)``.
        """
        self.dataset = dataset
        self.transform = transform
        self.target_size = tuple(target_size)

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _to_rgb(image):
        """Return ``image`` as an ``H x W x 3`` array in RGB channel order."""
        # PIL-style images know how to expand grayscale / palette / CMYK / RGBA.
        if hasattr(image, "convert"):
            image = image.convert("RGB")
        image = np.array(image)
        if image.ndim == 2:
            # Grayscale: replicate the single channel.
            return np.stack([image] * 3, axis=-1)
        if image.shape[2] == 1:
            return np.repeat(image, 3, axis=2)
        if image.shape[2] == 4:
            # Drop the alpha channel.
            return np.ascontiguousarray(image[:, :, :3])
        return image

    @staticmethod
    def _scale_bbox(bbox, scale_x, scale_y):
        """Rescale an ``[x, y, w, h]`` box and truncate to integer pixels."""
        x, y, w, h = bbox
        return [int(x * scale_x), int(y * scale_y), int(w * scale_x), int(h * scale_y)]

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        ocr_info = sample['ocr_info']

        # Keep RGB order: the channel order does not matter to cv2.resize, and
        # torchvision's pretrained backbones are trained on RGB input.
        image = self._to_rgb(sample['image'])

        # cv2.resize takes (width, height), the reverse of target_size.
        height, width = self.target_size
        image_resized = cv2.resize(image, (width, height))

        # Blank mask; text regions are painted in below.
        text_mask = np.zeros((height, width), dtype=np.float32)

        # Factors mapping annotation coordinates onto the resized image.
        scale_x = width / sample['image_width']
        scale_y = height / sample['image_height']

        bboxes = []
        for ann in ocr_info:
            if 'bounding_box' not in ann:
                continue  # annotation without a box: nothing to draw
            # NOTE(review): assumes the box is a sequence [x, y, width, height]
            # in source-image pixels -- confirm against the dataset schema.
            x, y, w, h = self._scale_bbox(ann['bounding_box'], scale_x, scale_y)

            # Thickness -1 fills the rectangle.
            cv2.rectangle(text_mask, (x, y), (x + w, y + h), 1, -1)
            bboxes.append([x, y, w, h])

        image_tensor = transforms.ToTensor()(image_resized)

        # An empty list would yield a 1-D tensor, so build the (0, 4) shape explicitly.
        if bboxes:
            bboxes = torch.tensor(bboxes, dtype=torch.float32)
        else:
            bboxes = torch.empty((0, 4), dtype=torch.float32)

        if self.transform:
            image_tensor = self.transform(image_tensor)

        return image_tensor, text_mask, bboxes

In [14]:
def pad_bbox_collate(batch):
    """Collate ``(image, mask, boxes)`` items whose box counts differ.

    The default collate function stacks every field, which fails as soon as
    two samples in a batch have a different number of boxes. Images and masks
    are stacked as usual; the per-sample ``(N_i, 4)`` box tensors are
    zero-padded to ``(batch, max_N, 4)``, so padded rows are all-zero boxes.
    """
    from torch.nn.utils.rnn import pad_sequence

    images, masks, boxes = zip(*batch)
    images = torch.stack(images)
    # Masks arrive as NumPy arrays from the dataset.
    masks = torch.stack([torch.as_tensor(mask) for mask in masks])
    boxes = pad_sequence(list(boxes), batch_first=True)
    return images, masks, boxes


train_dataset = EASTDataset(dsa)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=pad_bbox_collate
)

In [None]:
class EASTModel(nn.Module):
    """EAST-style text detector: a ResNet-50 trunk with three 1x1 conv heads.

    The trunk is torchvision's ResNet-50 without its average-pool and
    fully-connected layers, so it emits a 2048-channel feature map at 1/32 of
    the input resolution (7x7 for 224x224 input, 16x16 for 512x512). No
    upsampling is applied, so all three outputs share that coarse resolution.

    Args:
        pretrained (bool, optional): Load the ImageNet weights
            (``IMAGENET1K_V1``, the set the legacy ``pretrained=True`` flag
            selected). Pass ``False`` for a randomly initialised trunk, e.g.
            when no network access is available. Defaults to ``True``.
    """

    def __init__(self, pretrained=True):
        super().__init__()

        # `pretrained=` is deprecated in torchvision; `weights=` is the
        # supported spelling.
        weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = models.resnet50(weights=weights)
        # Drop the last two children (global average pool and fc classifier).
        self.backbone = nn.Sequential(*list(backbone.children())[:-2])

        # Text score map output (raw logits, one channel)
        self.text_score = nn.Conv2d(2048, 1, kernel_size=1)

        # Bounding box regression output (4 values for each feature-map cell)
        self.bbox_reg = nn.Conv2d(2048, 4, kernel_size=1)

        # Angle regression output (for text rotation)
        self.angle_map = nn.Conv2d(2048, 1, kernel_size=1)

    def forward(self, x):
        """Run the detector on an image batch.

        Args:
            x (Tensor): Image batch of shape ``[batch, 3, H, W]``.

        Returns:
            tuple: ``(text_score, bbox_reg, angle_map)`` with 1, 4 and 1
            channels respectively, all at the trunk's output resolution.
        """
        features = self.backbone(x)  # [batch, 2048, H/32, W/32]

        text_score = self.text_score(features)
        bbox_reg = self.bbox_reg(features)
        angle_map = self.angle_map(features)

        return text_score, bbox_reg, angle_map

In [31]:
def east_loss(pred_text_score, pred_bbox, pred_angle, true_text_score, true_bbox, true_angle):
    """Unweighted sum of the three EAST training terms.

    * text score -- binary cross-entropy on raw logits
    * bounding box -- smooth L1
    * angle -- mean squared error

    Each prediction must be shape-compatible with its target. Returns a
    scalar tensor.
    """
    terms = (
        F.binary_cross_entropy_with_logits(pred_text_score, true_text_score),
        F.smooth_l1_loss(pred_bbox, true_bbox),
        F.mse_loss(pred_angle, true_angle),
    )

    # Accumulate left to right: (text + bbox) + angle.
    combined = terms[0]
    for term in terms[1:]:
        combined = combined + term
    return combined

In [37]:
def build_east_targets(text_masks, bboxes, out_hw):
    """Convert loader output into targets shaped like the model's predictions.

    Args:
        text_masks (Tensor): ``(batch, H, W)`` binary text masks at input
            resolution.
        bboxes (Tensor): ``(batch, N, 4)`` boxes as ``[x, y, w, h]`` in input
            pixels. Rows with non-positive width or height (e.g. zero padding)
            are ignored.
        out_hw (tuple[int, int]): ``(height, width)`` of the prediction maps.

    Returns:
        tuple: ``(score, geometry)``.
        ``score`` is ``(batch, 1, h, w)``: a cell is 1 when any mask pixel
        inside it is 1.
        ``geometry`` is ``(batch, 4, h, w)``: for every cell overlapping a
        box, the distances from the cell centre to the box's top, right,
        bottom and left edges, as fractions of the input height/width and
        clamped at 0; zero elsewhere. Where boxes overlap, the later box wins.
    """
    batch, in_h, in_w = text_masks.shape
    out_h, out_w = out_hw

    score = F.adaptive_max_pool2d(text_masks.unsqueeze(1), (out_h, out_w))
    geometry = torch.zeros(
        batch, 4, out_h, out_w, dtype=text_masks.dtype, device=text_masks.device
    )

    # Size of one prediction cell in input pixels, and the cell centres.
    cell_h = in_h / out_h
    cell_w = in_w / out_w
    rows = torch.arange(out_h, dtype=text_masks.dtype, device=text_masks.device)
    cols = torch.arange(out_w, dtype=text_masks.dtype, device=text_masks.device)
    centre_y = ((rows + 0.5) * cell_h).view(out_h, 1)
    centre_x = ((cols + 0.5) * cell_w).view(1, out_w)

    for b, sample_boxes in enumerate(bboxes.tolist()):
        for x, y, w, h in sample_boxes:
            if w <= 0 or h <= 0:
                continue
            # Cells whose extent overlaps the box.
            hit_rows = (centre_y + cell_h / 2 > y) & (centre_y - cell_h / 2 < y + h)
            hit_cols = (centre_x + cell_w / 2 > x) & (centre_x - cell_w / 2 < x + w)
            hit = hit_rows & hit_cols
            distances = torch.stack([
                (centre_y - y).expand(out_h, out_w) / in_h,      # top
                (x + w - centre_x).expand(out_h, out_w) / in_w,  # right
                (y + h - centre_y).expand(out_h, out_w) / in_h,  # bottom
                (centre_x - x).expand(out_h, out_w) / in_w,      # left
            ]).clamp(min=0)
            geometry[b] = torch.where(hit, distances, geometry[b])

    return score, geometry


# Nothing here is compiled with torch.compile, so no torch._dynamo settings
# are needed; the model runs in plain eager mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EASTModel()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, text_masks, bboxes in train_loader:
        images, text_masks = images.to(device), text_masks.to(device)
        # Forward pass
        pred_text_score, pred_bbox, pred_angle = model(images)
        # The predictions are much coarser than the 2-D input masks and the
        # box lists, so rasterise the targets at prediction resolution.
        true_text_score, true_bbox = build_east_targets(
            text_masks, bboxes, pred_text_score.shape[-2:]
        )
        # The annotated boxes are axis-aligned, so the angle target is zero.
        true_angle = torch.zeros_like(pred_angle)
        loss = east_loss(pred_text_score, pred_bbox, pred_angle,
                         true_text_score, true_bbox, true_angle)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")


AttributeError: module 'torch._functorch.eager_transforms' has no attribute 'grad_and_value'

In [36]:
# Environment report: installed PyTorch build, whether a GPU is usable, and
# the CUDA toolkit version PyTorch was built against.
environment_report = [
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"CUDA version: {torch.version.cuda}",
]
print(*environment_report, sep="\n")

PyTorch version: 2.6.0+cu118
CUDA available: True
CUDA version: 11.8


In [9]:

# Choose the compute device once; `device` is defined on both paths so later
# cells can call `.to(device)` whether or not a GPU is present.
if torch.cuda.is_available():
    print("CUDA is available!")
    device = torch.device("cuda")
else:
    print("CUDA is not available.")
    device = torch.device("cpu")

# Run this only after the cell that defines EASTModel; running it earlier
# raises NameError. NOTE: this builds a fresh model and replaces any existing
# (possibly trained) `model`.
model = EASTModel().to(device)
# Move inputs the same way before a forward pass: input_tensor.to(device)

CUDA is available!


NameError: name 'EASTModel' is not defined