In [3]:
import torch
import import_ipynb
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor, Normalize, Compose, Resize, RandomHorizontalFlip,RandomResizedCrop
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision.datasets import CocoDetection
import coco_dataset as Coco
import matplotlib.pyplot as plt
import torch.optim as optim



In [4]:


# Define a simple CNN model for feature extraction
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(128 * 16 * 16, 256)  # Adjust input size based on image dimensions
        self.fc2 = nn.Linear(256, 7)  # output size will be adjusted dynamically
        
        
    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Adjust input size based on image dimensions
        
        # Dynamically adjust the input size of the linear layer
        if self.fc1.in_features != x.size(1):
            self.fc1 = nn.Linear(x.size(1), 256).to(x.device)
            
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.view(x.size(0), -1, 7) # Adjust
        return x

# Define the transform to be applied to the images
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((256, 256)),
    transforms.Normalize((0.485, 0.456, 0.406), 
                         (0.229, 0.224, 0.225)),
    
    
    # Add more transformations as needed
])

# Initialize the model
model = SimpleCNN()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

coco_loader = Coco.CocoDatasetLoader(coco_root_train="./coco_dataset_test/train2017", #print the real image and there normalize image and the
                                coco_root_val="./coco_dataset_test/val2017",

                                transform=transforms.Compose([
                                    RandomHorizontalFlip(),
                                    transforms.Compose([
                                    Resize([256,256]),
                                    RandomResizedCrop(265, scale=(0.2, 1.0)),
                                    ]),
                                    ToTensor(),
                                    Normalize(mean=[0.485, 0.456, 0.406],
                                                std=[0.229, 0.224, 0.225])
                                    
                                ])
)


dataloader_train, dataloader_val = coco_loader.get_data_loaders()



for batch in dataloader_train:
    images, targets = batch
    images = images.to(device)
    targets =[ target.to(device) for target in targets]
    images = list(img.to(device) for img in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    print(targets)
    break 
    #print(images.shape, [ target.to(device) for target in targets])
    
     # Get color images (assuming images are in RGB format)
    #color_images = [F.to_pil_image(image.cpu()) for image in images]

    # Display color images
    #for i, color_image in enumerate(color_images):
       # plt.imshow(color_image)
       # plt.show()

    # Access tensors (boxes, labels, image_ids, ids) for further processing
    #for target in targets:
        # Access target data, e.g., boxes, labels, etc.
       # boxes = target[:, :4]
        #labels = target[:, 4]
        #image_ids = target[:, 5]
        #ids = target[:, 6]
        #print(boxes, labels, image_ids, ids)


# Train the model for a few epochs (this is a simplified training loop)
num_epochs = 10
for epoch in range(num_epochs):
    total_samples = 0
    correct_samples = 0
    for images, targets in dataloader_train:
        images = images.to(device)
        targets = targets.to(device)
        num_boxes = targets.size(1)
        model.fc2 = nn.Linear(256, num_boxes*7).to(device)
                
        # Re-initialize the optimizer
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, targets)
        
        # Calculate accuracy
        predictions = torch.sigmoid(outputs) > 0.5
        correct_samples += (predictions == targets).sum().item()
        total_samples += targets.numel()
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    accuracy = 100.0 * correct_samples / total_samples
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}, Accuracy: {accuracy:.3f}%")
    

# Feature extraction for all images in dataloader_train
with torch.no_grad():
    all_extracted_features = []

    for batch in dataloader_train:
        images, _ = batch
        images = images.to(device)
        
        # Assuming 'model' is your object detection model
        extracted_features = model(images)
        all_extracted_features.append(extracted_features)

    # Concatenate features from all batches
    all_extracted_features = torch.cat(all_extracted_features, dim=0)

print("Shape of all extracted features:", all_extracted_features.shape)



loading annotations into memory...
Done (t=19.24s)
creating index...
index created!
loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
Epoch [1/10], Loss: -8021975687.506311, Accuracy: 38.382%
Epoch [2/10], Loss: -133684.18159012363, Accuracy: 38.395%
Epoch [3/10], Loss: -49835566.26440437, Accuracy: 38.421%
Epoch [4/10], Loss: -77607375036.37642, Accuracy: 38.432%
Epoch [5/10], Loss: -7195659.517564424, Accuracy: 38.418%
Epoch [6/10], Loss: 1440396015313.661, Accuracy: 38.462%
Epoch [7/10], Loss: 8762541324345.966, Accuracy: 38.469%
Epoch [8/10], Loss: 316432090.3478167, Accuracy: 38.363%
Epoch [9/10], Loss: -673387216.5755936, Accuracy: 38.395%
Epoch [10/10], Loss: -19081376944420.875, Accuracy: 38.476%
Shape of all extracted features: torch.Size([118287, 23, 7])
