In [1]:
import os
pwd = os.getcwd()
print(pwd)

C:\Users\sreek\Downloads\ClearQuote\Assessment_2


In [2]:
import torch
print(torch.cuda.is_available())

True


In [3]:
import os
import json
import requests
from PIL import Image
import pandas as pd
from io import BytesIO

class OdometerDataset:
    def __init__(self, root_folder, output_folder, transform=None):
        self.root_folder = root_folder
        self.output_folder = output_folder
        self.transform = transform
        self.data = self.load_annotations()

    def load_annotations(self):
        data = []  # Initialize an empty list to store image info

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        for sub_folder in os.listdir(self.root_folder):
            sub_folder_path = os.path.join(self.root_folder, sub_folder)
            if not os.path.isdir(sub_folder_path):
                continue

            annotation_file = os.path.join(sub_folder_path, "via_region_data.json")
            if not os.path.exists(annotation_file):
                print(f"Annotation file missing in {sub_folder_path}")
                continue

            with open(annotation_file, 'r') as f:
                annotations = json.load(f)

            for item in annotations.values():
                if 'filename' in item and 'regions' in item:
                    image_path = item['filename']
                    image_path = os.path.join(sub_folder_path, image_path) if not self.is_url(image_path) else image_path
                    
                    if not self.is_url(image_path) and not os.path.exists(image_path):
                        print(f"Skipping {image_path}: File not found or invalid.")
                        continue
                    
                    # Process each "odometer" region
                    for region in item['regions']:
                        if region['region_attributes']['identity'] == 'odometer':
                            # Get the odometer reading
                            odometer_reading = region['region_attributes'].get('reading', 'N/A')
                            
                            # Add the image info to the data list (excluding unnecessary info)
                            image_name = os.path.basename(image_path)
                            data.append({
                                'image_name': image_name,
                                'image_path': image_path,
                                'label': odometer_reading
                            })

        # Convert the list of dictionaries into a DataFrame
        df = pd.DataFrame(data)

        return df

    def is_url(self, path):
        return path.startswith("http://") or path.startswith("https://")

# Usage example
root_folder = pwd + "\\CQ"  # Set this to your root folder path
output_folder = pwd + "\\CQ_OUT"  # Set this to your output folder path
dataset = OdometerDataset(root_folder, output_folder)

# The `dataset.data` will now be a DataFrame with columns: image_name, image_path, label
df = dataset.data
print(df)


Empty DataFrame
Columns: []
Index: []


In [5]:
import os
import json
import requests
from PIL import Image
from io import BytesIO

class YOLOOdometerDataset:
    def __init__(self, root_folder, output_folder, train_ratio=0.8):
        self.root_folder = root_folder
        self.output_folder = output_folder
        self.train_ratio = train_ratio
        self.classes = {"odometer": 0}  # Class mapping for YOLO
        self.prepare_output_folders()
        self.data = self.load_annotations()

    def prepare_output_folders(self):
        # Create YOLO folder structure
        for split in ['train', 'val']:
            image_dir = os.path.join(self.output_folder, 'images', split)
            label_dir = os.path.join(self.output_folder, 'labels', split)
            os.makedirs(image_dir, exist_ok=True)
            os.makedirs(label_dir, exist_ok=True)

    def load_annotations(self):
        data = []
        all_images = []

        for sub_folder in os.listdir(self.root_folder):
            sub_folder_path = os.path.join(self.root_folder, sub_folder)
            if not os.path.isdir(sub_folder_path):
                continue

            annotation_file = os.path.join(sub_folder_path, "via_region_data.json")
            if not os.path.exists(annotation_file):
                print(f"Annotation file missing in {sub_folder_path}")
                continue

            with open(annotation_file, 'r') as f:
                annotations = json.load(f)

            for item in annotations.values():
                if 'filename' in item and 'regions' in item:
                    image_path = item['filename']
                    image = self.load_image(image_path, sub_folder_path)
                    if image is None:
                        print(f"Skipping {image_path}: File not found or invalid.")
                        continue
                    
                    width, height = image.size

                    label_content = []
                    for region in item['regions']:
                        if region['region_attributes'].get('identity') == 'odometer':
                            shape = region['shape_attributes']
                            points = list(zip(shape['all_points_x'], shape['all_points_y']))
                            x_min, y_min = min([point[0] for point in points]), min([point[1] for point in points])
                            x_max, y_max = max([point[0] for point in points]), max([point[1] for point in points])

                            # Normalize YOLO coordinates
                            x_center = ((x_min + x_max) / 2) / width
                            y_center = ((y_min + y_max) / 2) / height
                            bbox_width = (x_max - x_min) / width
                            bbox_height = (y_max - y_min) / height

                            label_content.append(f"{self.classes['odometer']} {x_center} {y_center} {bbox_width} {bbox_height}")

                    # Store image path and label content
                    all_images.append((image, label_content, os.path.basename(image_path)))

        # Split data into train and val
        split_idx = int(len(all_images) * self.train_ratio)
        train_data = all_images[:split_idx]
        val_data = all_images[split_idx:]

        self.save_split(train_data, 'train')
        self.save_split(val_data, 'val')

    def load_image(self, image_path, sub_folder_path):
        """Loads an image from a local path or URL."""
        if self.is_url(image_path):
            try:
                response = requests.get(image_path, timeout=10)
                response.raise_for_status()
                return Image.open(BytesIO(response.content)).convert("RGB")
            except Exception as e:
                print(f"Error downloading {image_path}: {e}")
                return None
        else:
            full_path = os.path.join(sub_folder_path, image_path)
            if os.path.exists(full_path):
                return Image.open(full_path).convert("RGB")
            else:
                return None

    def is_url(self, path):
        """Checks if a given path is a URL."""
        return path.startswith("http://") or path.startswith("https://")

    def save_split(self, data_split, split_name):
        for image, label_content, filename in data_split:
            # Save image
            image_output_path = os.path.join(self.output_folder, 'images', split_name, filename)
            image.save(image_output_path)

            # Save label
            label_output_path = os.path.join(self.output_folder, 'labels', split_name, os.path.splitext(filename)[0] + '.txt')
            with open(label_output_path, 'w') as f:
                f.write("\n".join(label_content))

# Usage example
root_folder = pwd + "\\train"
output_folder = pwd + "\\yolo_custom_stage1_out"
dataset = YOLOOdometerDataset(root_folder, output_folder)


In [23]:
"""python detect.py --weights runs/train/exp3/weights/best.pt --img 1024 --conf-thres 0.5 --iou-thres 0.4 --source ../CQ_CustomTest/ --save-txt"""
#Run from yolov5 directory

'python detect.py --weights runs/train/exp3/weights/best.pt --img 1024 --conf-thres 0.5 --iou-thres 0.4 --source ../CQ_CustomTest/'

In [29]:
import os

def read_detection_results(results_folder):
    # Get all .txt files in the 'labels' folder
    labels_folder = os.path.join(results_folder, "labels")
    detection_results = {}
    
    for file_name in os.listdir(labels_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(labels_folder, file_name)
            with open(file_path, 'r') as file:
                detections = file.readlines()
                detection_results[file_name] = detections

    return detection_results

# Example usage
results_folder = pwd + "\\yolov5\\runs\\detect\\exp15"  # Path to the experiment folder
detection_results = read_detection_results(results_folder)

# Print results for each image
for image_name, detections in detection_results.items():
    print(f"Results for {image_name}:")
    for detection in detections:
        print(detection.strip())


Results for scraped_13mhov_1654867333607.txt:
0 0.269141 0.629167 0.0710938 0.0583333
Results for scraped_13OV6N_1654867264673.txt:
0 0.517578 0.631944 0.0460938 0.0305556
Results for scraped_18a89A_1654867429718.txt:
0 0.499609 0.739583 0.0320312 0.0347222
Results for scraped_1ck86k_1654867403579.txt:
0 0.534375 0.64375 0.05 0.0291667
Results for scraped_1G51IL_1654867270622.txt:
0 0.539062 0.632639 0.0453125 0.0319444
Results for scraped_1h60eA_1654867273398.txt:
0 0.55625 0.803472 0.0875 0.0597222
Results for scraped_1NdmjE_1654867303557.txt:
0 0.496094 0.672917 0.0875 0.0458333
Results for scraped_293TcY_1654867359772.txt:
0 0.553906 0.622222 0.0578125 0.0361111
Results for scraped_2gZeT0_1654867296441.txt:
0 0.821094 0.540278 0.0703125 0.0472222
Results for scraped_2RvgJ9_1654867423025.txt:
0 0.567969 0.488194 0.05 0.0319444
Results for scraped_2tEMhE_1654867176535.txt:
0 0.786328 0.580556 0.0757812 0.0444444
Results for scraped_2VVKar_1654867261897.txt:
0 0.301953 0.58125 0.08046

In [31]:
import os

def read_and_filter_detections(results_folder):
    labels_folder = os.path.join(results_folder, "labels")
    filtered_results = {}

    # Process each detection result file
    for file_name in os.listdir(labels_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(labels_folder, file_name)
            with open(file_path, 'r') as file:
                detections = file.readlines()
                
                # Parse detections and sort by confidence (last element in each line)
                detections = [line.strip().split() for line in detections]
                detections = sorted(detections, key=lambda x: float(x[-1]), reverse=True)

                # Keep only the detection with the highest confidence
                best_detection = detections[0] if detections else None

                if best_detection:
                    filtered_results[file_name] = best_detection

    return filtered_results

# Example usage
results_folder = pwd + "\\yolov5\\runs\\detect\\exp15"
filtered_results = read_and_filter_detections(results_folder)

# Print filtered results
for image_name, detection in filtered_results.items():
    print(f"Filtered result for {image_name}: {detection}")
print(len(filtered_results))

Filtered result for scraped_13mhov_1654867333607.txt: ['0', '0.269141', '0.629167', '0.0710938', '0.0583333']
Filtered result for scraped_13OV6N_1654867264673.txt: ['0', '0.517578', '0.631944', '0.0460938', '0.0305556']
Filtered result for scraped_18a89A_1654867429718.txt: ['0', '0.499609', '0.739583', '0.0320312', '0.0347222']
Filtered result for scraped_1ck86k_1654867403579.txt: ['0', '0.534375', '0.64375', '0.05', '0.0291667']
Filtered result for scraped_1G51IL_1654867270622.txt: ['0', '0.539062', '0.632639', '0.0453125', '0.0319444']
Filtered result for scraped_1h60eA_1654867273398.txt: ['0', '0.55625', '0.803472', '0.0875', '0.0597222']
Filtered result for scraped_1NdmjE_1654867303557.txt: ['0', '0.496094', '0.672917', '0.0875', '0.0458333']
Filtered result for scraped_293TcY_1654867359772.txt: ['0', '0.553906', '0.622222', '0.0578125', '0.0361111']
Filtered result for scraped_2gZeT0_1654867296441.txt: ['0', '0.821094', '0.540278', '0.0703125', '0.0472222']
Filtered result for scr

In [33]:
import os
from PIL import Image

def crop_odometer_regions(results_folder, images_folder, output_folder):
    labels_folder = os.path.join(results_folder, "labels")
    filtered_results = {}

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Process each detection result file
    for file_name in os.listdir(labels_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(labels_folder, file_name)
            image_name = file_name.replace(".txt", ".jpg")  # Assuming the image format is .jpg
            image_path = os.path.join(images_folder, image_name)

            # Check if the image exists
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue

            # Load the image
            image = Image.open(image_path)
            image_width, image_height = image.size

            # Read and filter detections (get the highest confidence detection)
            with open(file_path, 'r') as file:
                detections = file.readlines()
                detections = [line.strip().split() for line in detections]
                detections = sorted(detections, key=lambda x: float(x[-1]), reverse=True)
                best_detection = detections[0] if detections else None

            if best_detection:
                # Get bounding box coordinates
                x_center, y_center, width, height = map(float, best_detection[1:5])
                
                # Convert normalized coordinates to actual image coordinates
                x_min = int((x_center - width / 2) * image_width)
                y_min = int((y_center - height / 2) * image_height)
                x_max = int((x_center + width / 2) * image_width)
                y_max = int((y_center + height / 2) * image_height)

                # Crop the image to the odometer region
                cropped_image = image.crop((x_min, y_min, x_max, y_max))

                # Save or process the cropped image
                output_image_path = os.path.join(output_folder, f"cropped_{image_name}")
                cropped_image.save(output_image_path)
                print(f"Cropped image saved: {output_image_path}")

# Example usage
results_folder = pwd + "\\yolov5\\runs\\detect\\exp15"  # Path to the experiment folder
images_folder = pwd + "\\CQ_CustomTest"  # Path to your test images
output_folder = pwd + "\\CQ_CustomTest_cropped"  # Folder where cropped images will be saved

crop_odometer_regions(results_folder, images_folder, output_folder)


Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_13mhov_1654867333607.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_13OV6N_1654867264673.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_18a89A_1654867429718.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_1ck86k_1654867403579.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_1G51IL_1654867270622.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_1h60eA_1654867273398.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2\CQ_CustomTest_cropped\cropped_scraped_1NdmjE_1654867303557.jpg
Cropped image saved: C:\Users\sreek\Downloads\ClearQuote\Assessment_2

In [None]:
#Stage 2

In [13]:
import os
import json
import requests
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO

class OdometerDataset:
    def __init__(self, root_folder, output_folder, transform=None):
        self.root_folder = root_folder
        self.output_folder = output_folder
        self.transform = transform
        self.data = self.load_annotations()

    def load_annotations(self):
        data = []
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)
        
        for sub_folder in os.listdir(self.root_folder):
            sub_folder_path = os.path.join(self.root_folder, sub_folder)
            if not os.path.isdir(sub_folder_path):
                continue

            annotation_file = os.path.join(sub_folder_path, "via_region_data.json")
            if not os.path.exists(annotation_file):
                print(f"Annotation file missing in {sub_folder_path}")
                continue

            with open(annotation_file, 'r') as f:
                annotations = json.load(f)

            for item in annotations.values():
                if 'filename' in item and 'regions' in item:
                    image_path = item['filename']
                    image_path = os.path.join(sub_folder_path, image_path) if not self.is_url(image_path) else image_path
                    
                    if not self.is_url(image_path) and not os.path.exists(image_path):
                        print(f"Skipping {image_path}: File not found or invalid.")
                        continue
                    
                    # Process each "odometer" region
                    for region in item['regions']:
                        if region['region_attributes']['identity'] == 'odometer':
                            shape = region['shape_attributes']
                            points = list(zip(shape['all_points_x'], shape['all_points_y']))
                            x_min, y_min = min([point[0] for point in points]), min([point[1] for point in points])
                            x_max, y_max = max([point[0] for point in points]), max([point[1] for point in points])
                            
                            # Load image, handle both local file and URL
                            if self.is_url(image_path):
                                response = requests.get(image_path)
                                image = Image.open(BytesIO(response.content)).convert("RGB")
                            else:
                                image = Image.open(image_path).convert("RGB")
                            
                            # Crop the image
                            cropped_image = image.crop((x_min, y_min, x_max, y_max))
                            
                            # Get the "odometer" reading
                            odometer_reading = region['region_attributes'].get('reading', 'N/A')

                            # Save cropped image with the original filename in the output folder
                            output_image_path = os.path.join(self.output_folder, os.path.basename(image_path))
                            cropped_image.save(output_image_path)
                            
                            # Append data for future use (e.g., during training)
                            data.append((output_image_path, odometer_reading))

                            # If you want to display images
                            # plt.figure(figsize=(5, 5))
                            # plt.imshow(cropped_image)
                            # plt.axis('off')
                            # plt.title(f"Odometer Reading: {odometer_reading}")
                            # plt.show()

        return data

    def is_url(self, path):
        return path.startswith("http://") or path.startswith("https://")

# Usage example
root_folder = pwd + "\\train"  # Set this to your root folder path
output_folder = pwd + "\\cropped_train"  # Set this to your output folder path
dataset = OdometerDataset(root_folder, output_folder)

# The `dataset.data` will have a list of tuples: (image_path, odometer_reading)


In [14]:
print(len(dataset.data))
print(dataset.data[:10])
dataset = dataset.data

3659
[('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_BvkovI_1654858778932.JPG', '39747'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_Z6oiQq_1654866593617.jpg', '010676'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_pvL8di_1654866596100.jpg', '75066'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_f7NYF2_1654866598642.jpg', '221498'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_B362xa_1654866600042.jpg', '76777'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_njD0k2_1654866602597.jpg', '091308'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_HdwE8p_1654866604942.jpg', '20818'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_2\\cropped_train\\scraped_v8Qx45_1654866607265.jpg', '114318'), ('C:\\Users\\sreek\\Downloads\\ClearQuote\\Assessment_

In [15]:
import csv

# Specify the path for the output CSV file
output_csv = "trimmed_train_labels.csv"

# Write the dataset to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["path", "label"])  # Write the header
    writer.writerows(dataset)          # Write the dataset rows

print(f"CSV file saved at {output_csv}")


CSV file saved at trimmed_train_labels.csv


In [8]:
#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
from huggingface_hub import login

login("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")


In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
from torchvision import transforms

# Load CSV file
df = pd.read_csv("train_labels.csv")

# Dataset Class
class OdometerDataset(Dataset):
    def __init__(self, dataframe, processor, transform=None):
        self.dataframe = dataframe
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 0]
        label = str(self.dataframe.iloc[idx, 1])  # Ensure label is a string (for OCR)
        
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        # Process the image and label
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(label, padding="max_length", truncation=True, max_length=16, return_tensors="pt").input_ids.squeeze(0)
        
        return {
            'pixel_values': pixel_values,
            'labels': labels
        }

# Transformation for training images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjust this size based on the model input
    transforms.ToTensor(),
])

# Initialize Processor and Model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")

# Set the required configuration values
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id


# Prepare Dataset and DataLoader
train_dataset = OdometerDataset(dataframe=df, processor=processor, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Set up the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 5

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

              
        # Now forward pass
        outputs = model(pixel_values=pixel_values, labels=labels)



        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()


    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Save the trained model
model.save_pretrained("trained_odometer_model")
processor.save_pretrained("trained_odometer_processor")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.14 GiB is allocated by PyTorch, and 544.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [10]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import pandas as pd
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler

# Load CSV file
df = pd.read_csv("train_labels.csv")

# Dataset Class
class OdometerDataset(Dataset):
    def __init__(self, dataframe, processor, transform=None):
        self.dataframe = dataframe
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 0]
        label = str(self.dataframe.iloc[idx, 1])  # Ensure label is a string (for OCR)

        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        # Process the image and label
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(label, padding="max_length", truncation=True, max_length=16, return_tensors="pt").input_ids.squeeze(0)

        return {
            'pixel_values': pixel_values,
            'labels': labels
        }

# Transformation for training images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjust this size based on the model input
    transforms.ToTensor(),
])

# Initialize Processor and Model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")

# Set the required configuration values
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

# Prepare Dataset and DataLoader
train_dataset = OdometerDataset(dataframe=df, processor=processor, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Start with a smaller batch size

# Set up the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and Mixed Precision Training
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()  # Gradient scaler for mixed precision training

epochs = 5
accumulation_steps = 4  # Accumulate gradients over multiple steps

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Mixed precision training
        with autocast():
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            loss = loss / accumulation_steps  # Normalize loss for gradient accumulation

        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Save the trained model
model.save_pretrained("trained_odometer_model")
processor.save_pretrained("trained_odometer_processor")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.29 GiB is allocated by PyTorch, and 383.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
#TEsting

In [16]:
import pandas as pd
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Load TrOCR model and processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")

# Function to perform OCR on an image
def detect_text(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        detected_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return detected_text
    except Exception as e:
        return f"Error: {e}"

# Load the .csv file
csv_file = pwd + "\\trimmed_train_labels.csv"  # Replace with your .csv file path
data = pd.read_csv(csv_file)

# Create a DataFrame with an additional column for detected text
results = []
for _, row in data.iterrows():
    file_path = row["path"]
    label = row["label"]
    detected_text = detect_text(file_path)
    results.append({"name_of_file": file_path, "text_detected": detected_text, "original_label": label})

df = pd.DataFrame(results)

# Save the results to a new CSV file
output_csv = pwd + "\\ocr_results.csv"
df.to_csv(output_csv, index=False)

print(f"OCR results saved to {output_csv}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

OCR results saved to C:\Users\sreek\Downloads\ClearQuote\Assessment_2\ocr_results.csv
