In [5]:
import os
import xml.etree.ElementTree as ET
import shutil
import glob
import random
from sklearn.model_selection import train_test_split
import pandas as pd


In [3]:
# Directories
# TODO: create a validation split as well (only train and test directories are defined here).

# Unzip the data downloaded from the Kaggle link into the fruit_dataset folder.
# Set the FRUIT_DATASET_DIR environment variable to point at that folder when
# running on another machine; the default below is the original location.
DATASET_ROOT = os.environ.get(
    'FRUIT_DATASET_DIR',
    r'C:\Users\praka\Acads\M_Tech\bits\Sem 3\CV\asmt_labs\asmt2\fruit_dataset',
)
train_dir = os.path.join(DATASET_ROOT, 'train_zip', 'train')
test_dir = os.path.join(DATASET_ROOT, 'test_zip', 'test')


In [8]:
def _to_int(text):
    """Convert annotation text such as '56', ' 56 ' or '56.0' to an int."""
    try:
        return int(text)
    except ValueError:
        # Some annotation files store numbers as floats ("56.0");
        # truncate them instead of failing the whole parse.
        return int(float(text))


def extract_annotations(xml_file):
    """Read one annotation XML file and return one row per annotated object.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A list of ``[filename, width, height, label, xmin, ymin, xmax, ymax]``
        lists, one per ``<object>`` element. The list is empty when the file
        contains no ``<object>`` elements.

    Raises:
        AttributeError: If a required tag (``filename``, ``size/width``,
            ``size/height``, ``name`` or a ``bndbox`` coordinate) is missing.
        ValueError: If a numeric tag holds text that is not a number.

    Note:
        ``width`` and ``height`` are copied from the XML as written; they are
        not checked against the image, so a file that records 0 yields 0.
    """
    root = ET.parse(xml_file).getroot()

    filename = root.find('filename').text
    width = _to_int(root.find('size/width').text)
    height = _to_int(root.find('size/height').text)

    boxes = []
    for obj in root.findall('object'):
        label = obj.find('name').text
        # Corner coordinates in the order they are stored in each row.
        xmin, ymin, xmax, ymax = (
            _to_int(obj.find('bndbox/' + tag).text)
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        boxes.append([filename, width, height, label, xmin, ymin, xmax, ymax])

    return boxes

# Function to read all annotations from a directory
def parse_annotations_from_directory(directory):
    """Collect the annotation rows of every ``.xml`` file in ``directory``.

    Files are visited in sorted name order so the resulting row order is the
    same on every platform (``os.listdir`` makes no ordering guarantee).

    Args:
        directory: Folder to scan (not recursive); entries whose name does
            not end with ``.xml`` are ignored.

    Returns:
        A flat list with the rows returned by ``extract_annotations`` for each
        file, concatenated in file-name order. Empty when no XML file exists.
    """
    annotations = []
    for entry in sorted(os.listdir(directory)):
        if entry.endswith(".xml"):
            annotations.extend(extract_annotations(os.path.join(directory, entry)))
    return annotations

# Column layout of the rows produced by extract_annotations
columns = ['filename', 'width', 'height', 'label', 'xmin', 'ymin', 'xmax', 'ymax']

# Parse every annotation file of the train and test folders
train_annotations, test_annotations = (
    parse_annotations_from_directory(folder) for folder in (train_dir, test_dir)
)

# Wrap the raw rows in pandas DataFrames for easy manipulation
train_df, test_df = (
    pd.DataFrame(rows, columns=columns)
    for rows in (train_annotations, test_annotations)
)

In [16]:
# Preview the first rows of the training annotation table
train_df.head()

Unnamed: 0,filename,width,height,label,xmin,ymin,xmax,ymax
0,apple_1.jpg,0,0,apple,8,15,331,349
1,apple_10.jpg,1500,1500,apple,56,99,1413,1419
2,apple_11.jpg,652,436,apple,213,33,459,258
3,apple_11.jpg,652,436,apple,1,30,188,280
4,apple_11.jpg,652,436,apple,116,5,337,220


In [11]:
# (rows, columns) of the training and test annotation tables
train_df.shape, test_df.shape

((465, 8), (117, 8))

In [13]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.transforms import functional as F

# Build Faster R-CNN (ResNet-50 + FPN) with the default pretrained weights.
# Only `weights=` is passed: `pretrained=` is the deprecated spelling of the
# same option, so supplying both was redundant.
model = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
print(model)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [14]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Background (index 0) + 4 fruit labels (apple, banana, orange, mixed)
num_classes = 5
in_features = model.roi_heads.box_predictor.cls_score.in_features
# The ROI heads expect a predictor that returns both class scores and box
# regressions; a bare Linear layer returns a single tensor and breaks the
# forward pass, so swap in a fresh FastRCNNPredictor sized for our classes.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [17]:
import os
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

class FruitDataset(Dataset):
    """Detection dataset that yields each image once, with all of its boxes.

    ``df`` holds one row per annotated object (columns ``filename``,
    ``label``, ``xmin``, ``ymin``, ``xmax``, ``ymax``). Rows that share a
    ``filename`` are grouped so a sample carries every box of that image;
    returning one box per sample would leave the other annotated objects of
    the same image unlabelled.

    Args:
        df: Annotation table, one row per object.
        img_dir: Folder that contains the image files named in ``filename``.
        transforms: Optional callable applied to the loaded RGB image only
            (the boxes are not transformed).
    """

    _BOX_COLUMNS = ['xmin', 'ymin', 'xmax', 'ymax']

    def __init__(self, df, img_dir, transforms=None):
        self.df = df
        self.img_dir = img_dir
        self.transforms = transforms
        # Create a mapping of class names to integers
        self.class_to_index = {'apple': 1, 'banana': 2, 'orange': 3, 'mixed': 4}
        # One (filename, rows) entry per distinct image, kept in order of
        # first appearance so indices are stable for a given table.
        self._image_groups = [
            (img_name, rows) for img_name, rows in df.groupby('filename', sort=False)
        ]

    def _build_target(self, idx):
        """Return the ``{'boxes', 'labels'}`` target dict for image ``idx``."""
        _, rows = self._image_groups[idx]

        # (num_objects, 4) float32 tensor of [xmin, ymin, xmax, ymax]
        coords = rows[self._BOX_COLUMNS].to_numpy(dtype='float32', copy=True)
        boxes = torch.as_tensor(coords, dtype=torch.float32).reshape(-1, 4)

        # Default to 0 if the class name is not found.
        # NOTE(review): 0 is not a key of class_to_index, so an unknown label
        # silently becomes index 0 -- confirm this fallback is intended.
        labels = torch.as_tensor(
            [self.class_to_index.get(name, 0) for name in rows['label']],
            dtype=torch.int64,
        )

        return {'boxes': boxes, 'labels': labels}

    def __getitem__(self, idx):
        """Load image ``idx`` and return ``(image, target)``."""
        img_name, _ = self._image_groups[idx]
        img_path = os.path.join(self.img_dir, img_name)
        img = Image.open(img_path).convert("RGB")

        target = self._build_target(idx)

        # Apply transformations
        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Number of distinct images (not the number of annotation rows)."""
        return len(self._image_groups)

# Image transformations: convert each loaded image to a tensor
transforms = T.Compose([T.ToTensor()])

# Initialize Dataset and DataLoader
train_dataset = FruitDataset(train_df, train_dir, transforms=transforms)
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    # Regroup [(img, target), ...] into ((img, ...), (target, ...))
    collate_fn=lambda batch: tuple(zip(*batch)),
)

# Making model available for training:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

False