<a href="https://colab.research.google.com/github/sgwlee96/Fall2024/blob/main/DATA_255_Fall24_Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Part 1: Deep Learning-Based Recommendation (10 Points)

Read the paper Wide and Deep Learning for Recommender Systems.
Download the files anime-dataset-2023.csv, users-details-2023.csv, users-score- 2023.csv
from the following link: https://www.kaggle.com/datasets/dbdmobile/myanimelist-dataset
Based on the architecture described in the paper, build your own Wide and Deep
Recommender system for the Anime Dataset. Your model should learn the features of each
user and anime, not just the associated ID numbers. Utilize an 80/20 train-test split and record
your model’s prediction accuracy.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


anime = None
users = None
scores = None

parent_dir = '/content/drive/Shareddrives/DATA 255 Lab Group 22/dataset/part1/'

if anime is None or users is None or scores is None:
# Load datasets
  anime = pd.read_csv(parent_dir + "anime-dataset-2023.csv")
  users = pd.read_csv(parent_dir + "users-details-2023.csv")
  scores = pd.read_csv(parent_dir + "users-score-2023.csv")
else:
  print("Datasets already loaded")

In [4]:
# anime

In [5]:
# users

In [6]:
# scores

In [7]:
# Merging datasets to get a full interaction table
data = pd.merge(pd.merge(scores, users[['Username', 'Mean Score', 'Location']], on='Username'),
                anime[['anime_id', 'Score', 'Genres', 'Type', 'Episodes']], on='anime_id')



In [8]:
# Feature Engineering - handling categorical features
data['user_id'] = data['user_id'].astype('category').cat.codes
data['anime_id'] = data['anime_id'].astype('category').cat.codes
data['Genres'] = data['Genres'].astype('category').cat.codes
data['Type'] = data['Type'].astype('category').cat.codes
data['Location'] = data['Location'].astype('category').cat.codes

# Handle missing values
data.fillna(0, inplace=True)

# Train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)



In [9]:
# Ensure that 'Score' column is numeric, and handle non-numeric values by filling with a default (e.g., 0.0)
data['Score'] = pd.to_numeric(data['Score'], errors='coerce').fillna(0.0)

# Handle any other columns that might have non-numeric values
data['Mean Score'] = pd.to_numeric(data['Mean Score'], errors='coerce').fillna(0.0)

# Split train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Dataset class
class AnimeDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_id'].values, dtype=torch.long)
        self.anime = torch.tensor(df['anime_id'].values, dtype=torch.long)
        self.scores = torch.tensor(df['rating'].values, dtype=torch.float32)
        self.mean_score = torch.tensor(df['Mean Score'].values, dtype=torch.float32)
        self.anime_score = torch.tensor(df['Score'].values, dtype=torch.float32)
        self.genres = torch.tensor(df['Genres'].values, dtype=torch.long)
        self.type = torch.tensor(df['Type'].values, dtype=torch.long)

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return (self.users[idx], self.anime[idx], self.scores[idx],
                self.mean_score[idx], self.anime_score[idx], self.genres[idx], self.type[idx])

# DataLoader
train_dataset = AnimeDataset(train_data)
test_dataset = AnimeDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [10]:
# Model definition
class WideAndDeep(nn.Module):
    def __init__(self, num_users, num_anime, num_genres, num_types, embedding_dim):
        super(WideAndDeep, self).__init__()

        # Wide component (linear)
        # The input features include user_id, anime_id, genres, type, mean_score, anime_score
        # So we need 6 input features in the wide component.
        self.wide = nn.Linear(6, 1)

        # Deep component (embeddings for categorical features)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.anime_embedding = nn.Embedding(num_anime, embedding_dim)
        self.genre_embedding = nn.Embedding(num_genres, embedding_dim)
        self.type_embedding = nn.Embedding(num_types, embedding_dim)

        # Deep neural network layers
        self.deep = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 2, 128),  # 4 embeddings + 2 scalar features (mean_score, anime_score)
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, user_id, anime_id, mean_score, anime_score, genres, type):
        # Wide component
        wide_input = torch.cat([
            user_id.float().unsqueeze(1),  # Unsqueeze to make it 2D
            anime_id.float().unsqueeze(1),  # Unsqueeze to make it 2D
            mean_score.unsqueeze(1),  # Already 1D, needs unsqueeze for 2D
            anime_score.unsqueeze(1),  # Already 1D, needs unsqueeze for 2D
            genres.float().unsqueeze(1),  # Unsqueeze to make it 2D
            type.float().unsqueeze(1)  # Unsqueeze to make it 2D
        ], dim=1)

        # Ensure that the input dimension is correct for the wide layer
        wide_output = self.wide(wide_input)

        # Deep component
        user_embed = self.user_embedding(user_id)
        anime_embed = self.anime_embedding(anime_id)
        genre_embed = self.genre_embedding(genres)
        type_embed = self.type_embedding(type)

        # Concatenate embeddings with continuous features
        deep_input = torch.cat([user_embed, anime_embed, genre_embed, type_embed,
                                mean_score.unsqueeze(1), anime_score.unsqueeze(1)], dim=1)
        deep_output = self.deep(deep_input)

        # Combine wide and deep outputs
        out = wide_output + deep_output
        return out.squeeze()


# Instantiate the model
model = WideAndDeep(num_users=len(data['user_id'].unique()),
                    num_anime=len(data['anime_id'].unique()),
                    num_genres=len(data['Genres'].unique()),
                    num_types=len(data['Type'].unique()),
                    embedding_dim=10)

# Device configuration (CPU or GPU)
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for user_id, anime_id, score, mean_score, anime_score, genres, type in train_loader:
            user_id, anime_id, score, mean_score, anime_score, genres, type = \
                user_id.to(device), anime_id.to(device), score.to(device), \
                mean_score.to(device), anime_score.to(device), genres.to(device), type.to(device)
            optimizer.zero_grad()
            outputs = model(user_id, anime_id, mean_score, anime_score, genres, type)
            loss = criterion(outputs, score)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Evaluate model
def evaluate_model(model, test_loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for user_id, anime_id, score, mean_score, anime_score, genres, type in test_loader:
            user_id, anime_id, score, mean_score, anime_score, genres, type = \
                user_id.to(device), anime_id.to(device), score.to(device), \
                mean_score.to(device), anime_score.to(device), genres.to(device), type.to(device)
            outputs = model(user_id, anime_id, mean_score, anime_score, genres, type)
            loss = criterion(outputs, score)
            total_loss += loss.item()
    print(f"Test Loss: {total_loss/len(test_loader)}")

# Evaluate the model
evaluate_model(model, test_loader)

Epoch 1, Loss: 1167870.9916431513
Epoch 2, Loss: 29.330061409576796
Epoch 3, Loss: 29.32375280246963
Epoch 4, Loss: 29.000527335439802
Epoch 5, Loss: 28.79231848312474
Test Loss: 4.3003381208942315


#Part 2: Image Classification with Deep Learning (40 Points)

##1. Download the Sports Image Dataset from the given link:
https://www.kaggle.com/datasets/sidharkal/sports-image-classification/data
This dataset consists of labeled images belonging to the following sports classes:
cricket, wrestling, tennis, badminton, soccer, swimming, and karate.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define transforms for data augmentation
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

train_dataset = datasets.ImageFolder(root='sports-image/train', transform=transform)
test_dataset = datasets.ImageFolder(root='sports-image/test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


##2. Explain in your own words: (7 points)



a. Gradient Descent (1 point)


Gradient Descent is an optimization algorithm used to minimize a loss function by iteratively moving in the direction of the steepest descent (i.e., the negative gradient of the loss function). The basic idea is to adjust the model's parameters (e.g., weights and biases) in small steps, based on the gradient of the loss function with respect to these parameters. The process continues until convergence, ideally reaching a point where the loss function is minimized.



b. List 3 regularization techniques and explain (2 points)


L2 Regularization (Ridge): This technique adds the squared sum of all model parameters (weights) to the loss function. The model is penalized for having large weights, which helps prevent overfitting by encouraging smaller weights. The regularization term is scaled by a hyperparameter
𝜆
λ.

Equation:

Loss=Original Loss+λ∑w
2


L1 Regularization (Lasso): In L1 regularization, the absolute value of the weights is added to the loss function. This can lead to sparse solutions where some weights are reduced to exactly zero, which can be helpful for feature selection.

Equation:

Loss=Original Loss+λ∑∣w∣

Dropout: Dropout is a technique where a random subset of neurons is "dropped out" during each forward pass during training. This forces the network to not rely too heavily on any individual neuron, improving generalization and reducing overfitting. Dropout is typically used only during training, and the full network is used during testing.

c. Activation functions (1 point)


Activation functions introduce non-linearity into neural networks, allowing them to model complex data patterns. Common activation functions include:

ReLU (Rectified Linear Unit):
f(x)=max(0,x), where all negative values are set to zero.  
Sigmoid:
f(x)=1/(1+e^(−x))
 , which outputs values between 0 and 1.  
Tanh:
f(x)=tanh(x), which outputs values between -1 and 1.  
Activation functions help neurons "fire" based on their input, enabling deep networks to learn complex patterns.

d. Loss function and Back Propagation (2 points)


Loss Function: A loss function measures how well the model's predictions match the actual target values. Common loss functions include Mean Squared Error (MSE) for regression and Cross-Entropy Loss for classification. The goal is to minimize the loss function during training.

Back Propagation: Back propagation is the process of calculating the gradient of the loss function with respect to each weight in the neural network, using the chain rule. It starts from the output layer and works its way backward through the network, updating the weights to reduce the loss by applying gradient descent. This process is repeated iteratively during training.



e. Epochs, Iterations, and Batch size (2 points)



Epoch: One epoch is a complete pass of the entire training dataset through the model. It indicates how many times the model has seen the full dataset during training.

Iteration: An iteration is a single update of the model’s parameters, usually based on a single batch of data. The number of iterations in one epoch equals the total number of training examples divided by the batch size.

Batch Size: Batch size is the number of training examples processed before updating the model’s parameters. A smaller batch size results in more frequent updates but increases variance in the updates, while a larger batch size reduces variance but can slow down the learning process.

##3. Visualize/summarize the data (12 points)


a.Number of images in the training and testing set and number of classes in the
target variable (1 point)


b.Number of images per class (1 point)


In [None]:
import matplotlib.pyplot as plt

# Count images per class
class_counts = {class_name: len(os.listdir(os.path.join(train_dir, class_name))) for class_name in os.listdir(train_dir)}

# Plot
plt.figure(figsize=(10, 5))
plt.bar(class_counts.keys(), class_counts.values())
plt.title('Number of Images per Class')
plt.xlabel('Classes')
plt.ylabel('Number of Images')
plt.xticks(rotation=45)
plt.show()

c.Number of pixels in the first 2 images of each class (Height and width individually)
(2 points)


In [None]:
from PIL import Image

image_sizes = {}
for class_name in os.listdir(train_dir):
    class_path = os.path.join(train_dir, class_name)
    images = os.listdir(class_path)[:2]  # Get the first two images
    image_sizes[class_name] = []

    for image_file in images:
        with Image.open(os.path.join(class_path, image_file)) as img:
            width, height = img.size
            image_sizes[class_name].append((width, height))

print("Image Sizes (Width, Height) for the First 2 Images of Each Class:")
print(image_sizes)

d. Display at least 3 images of each class (3 points)


In [None]:
def display_images(class_name, num_images=3):
    class_path = os.path.join(train_dir, class_name)
    images = os.listdir(class_path)[:num_images]

    plt.figure(figsize=(15, 5))
    for i, image_file in enumerate(images):
        img = Image.open(os.path.join(class_path, image_file))
        plt.subplot(1, num_images, i+1)
        plt.imshow(img)
        plt.title(class_name)
        plt.axis('off')
    plt.show()

for class_name in os.listdir(train_dir):
    display_images(class_name)

e. Apply data augmentation and other image preprocessing and plot the sample of
processed images. (3 points)


In [None]:
from torchvision import transforms

# Define data augmentation and preprocessing
data_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# Visualize augmented images
def visualize_augmentation(class_name):
    class_path = os.path.join(train_dir, class_name)
    images = os.listdir(class_path)[:3]  # Get first 3 images

    plt.figure(figsize=(15, 5))
    for i, image_file in enumerate(images):
        img = Image.open(os.path.join(class_path, image_file))
        img_transformed = data_transforms(img)

        plt.subplot(1, 3, i+1)
        plt.imshow(img_transformed.permute(1, 2, 0))  # Permute for correct color channels
        plt.title(f'Augmented {class_name}')
        plt.axis('off')
    plt.show()

for class_name in os.listdir(train_dir):
    visualize_augmentation(class_name)

##4. Train a neural network (21 points)


a.Decide the number of layers and neurons in each layer (2 points)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(in_features=128 * 128 * 3, out_features=512)  # Flattened input size
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=num_classes)  # Output layer for classes

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

b.Try different number of epochs and batch sizes (2 points)


In [None]:
# Hyperparameters
num_epochs_list = [10, 20, 30]  # Different epochs to try
batch_sizes = [16, 32, 64]  # Different batch sizes to try

c. Try out different activation functions (explain each one you used) (3 points)


In [None]:
# Using ReLU, Sigmoid, and Tanh
class CustomNN(nn.Module):
    def __init__(self):
        super(CustomNN, self).__init__()
        self.fc1 = nn.Linear(128 * 128 * 3, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))  # ReLU activation
        x = torch.sigmoid(self.fc2(x))  # Sigmoid activation
        x = self.fc3(x)  # Output layer
        return x

ReLU (Rectified Linear Unit): Outputs the input directly if positive; otherwise, it outputs zero. This allows the model to learn non-linear patterns and avoids the vanishing gradient problem for positive inputs.
Sigmoid: Outputs values between 0 and 1, making it useful for binary classification. However, it can suffer from the vanishing gradient problem for extreme values.

d.Try at least three different regularizations (3 points)


In [None]:
# L2 Regularization
optimizer = optim.Adam(model.parameters(), weight_decay=0.01)  # L2 regularization

# L1 Regularization (manual implementation)
def l1_loss(model, lambda_l1=0.01):
    l1_norm = sum(p.abs().sum() for p in model.parameters())
    return lambda_l1 * l1_norm

# Dropout (included in the model definition)
self.dropout = nn.Dropout(p=0.5)

e.Try different loss functions (3 points)


In [None]:
# Loss functions to try
loss_fn1 = nn.CrossEntropyLoss()  # Common for multi-class classification
loss_fn2 = nn.BCEWithLogitsLoss()  # For binary classification tasks
loss_fn3 = nn.MSELoss()  # Used for regression tasks

f. Try different optimization algorithms (such as Gradient Descent, Adam, etc.) (4 points)


In [None]:
# Different optimizers
optimizer_adam = optim.Adam(model.parameters(), lr=0.001)
optimizer_sgd = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer_rmsprop = optim.RMSprop(model.parameters(), lr=0.001)

g. Decide your best-performing model based on both time and accuracy. (1
point)


In [None]:
# After training different models and comparing their performance
best_model = 'Adam with L2 Regularization'  # Example
print(f'Best Performing Model: {best_model}')

h. Create a graph of loss vs epochs for training and testing set. (1 point)


In [None]:
# Loss tracking
train_losses = []  # Store training losses
test_losses = []  # Store testing losses

# Training loop (pseudocode)
for epoch in range(num_epochs):
    # Train the model and calculate loss
    train_losses.append(train_loss)
    test_losses.append(test_loss)

# Plotting
plt.plot(range(num_epochs), train_losses, label='Train Loss')
plt.plot(range(num_epochs), test_losses, label='Test Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss vs. Epochs')
plt.legend()
plt.show()

i.Create a graph of f1 score vs epochs for training and testing set. (1 point)


In [None]:
from sklearn.metrics import f1_score

# F1 score tracking
train_f1_scores = []  # Store F1 scores for training set
test_f1_scores = []  # Store F1 scores for testing set

# Calculate F1 scores in the training loop
for epoch in range(num_epochs):
    # Calculate F1 scores
    train_f1_scores.append(train_f1)
    test_f1_scores.append(test_f1)

# Plotting
plt.plot(range(num_epochs), train_f1_scores, label='Train F1 Score')
plt.plot(range(num_epochs), test_f1_scores, label='Test F1 Score')
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('F1 Score vs. Epochs')
plt.legend()
plt.show()

j.Calculate the number of trainable parameters in your final model. (1 point)

In [None]:
# Function to count trainable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Calculate and print the number of trainable parameters
num_params = count_parameters(best_model)
print(f'Number of Trainable Parameters in Final Model: {num_params}')

#Part 3: Object Detection (50 Points)

1. Explain the differences between object detection, image classification, and image
segmentation. (3 points)


Image Classification: This task involves categorizing an entire image into a single label or class. For example, identifying an image as "dog" or "cat" without any information about the position of the object in the image.

Object Detection: Object detection extends image classification by not only identifying objects within an image but also localizing them with bounding boxes. For instance, in an image with multiple objects, the model will output both the classes and their respective positions.

Image Segmentation: This technique goes a step further by classifying each pixel in the image. In semantic segmentation, each pixel is assigned a class label, while in instance segmentation, each object instance is differentiated. For example, in an image with dogs and cats, each pixel belonging to a dog would be labeled as "dog" and each pixel belonging to a cat would be labeled as "cat."

2. Explain the architectures and differences between R-CNN, Fast R-CNN, and Faster
R-CNN. (4 points)


R-CNN (Regions with CNN features): R-CNN first generates region proposals using a selective search and then uses a CNN to extract features from each region. Each region is classified using a Support Vector Machine (SVM). The main drawback is the slow inference speed due to the need to run the CNN for each region proposal separately.

Fast R-CNN: This improvement allows the CNN to process the entire image to extract features, creating a feature map. Region proposals are then applied to this feature map, significantly reducing the number of CNN evaluations needed. It uses a softmax layer for classification and a bounding box regression layer for refining the region proposals. This makes it faster than R-CNN.

Faster R-CNN: This model introduces a Region Proposal Network (RPN) that shares convolutional features with the detection network, allowing the model to propose regions of interest directly. This integration improves the speed and efficiency of the object detection process compared to both R-CNN and Fast R-CNN.

3. Explain what’s U-net. (5 points)


U-Net is a convolutional neural network architecture primarily used for image segmentation tasks, particularly in biomedical image segmentation. The architecture consists of a contracting path (encoder) and an expanding path (decoder):

Encoder: The encoder captures context through successive convolutional and pooling layers, downsampling the image to learn rich feature representations.

Decoder: The decoder upsamples the feature maps and combines them with corresponding feature maps from the encoder through skip connections. This helps preserve spatial information that might be lost during downsampling.

The U-Net architecture is notable for its symmetrical design and effective handling of small datasets, making it a popular choice for segmentation tasks.

4. List at least 3 widely used metrics in the object detection industry and explain them in
detail. (3 points)


Mean Average Precision (mAP): This metric evaluates the precision and recall of the detection model across multiple Intersection over Union (IoU) thresholds. It calculates the average precision for each class, then takes the mean across all classes. mAP is crucial for assessing how well a model detects objects across different classes.

Intersection over Union (IoU): IoU measures the overlap between the predicted bounding box and the ground truth bounding box. It is defined as the area of intersection divided by the area of union. A higher IoU indicates better localization. It is often used as a threshold to determine if a detection is considered correct.

Precision and Recall:

Precision indicates the accuracy of the positive predictions made by the model, calculated as the ratio of true positive detections to the total positive detections (true positives + false positives).  
Recall measures the model's ability to identify all relevant instances, calculated as the ratio of true positives to the total actual positive instances (true positives + false negatives).

5. Explain what’s Non-Maximum Suppression and how it works. (2 points)


Non-Maximum Suppression (NMS) is a technique used in object detection to eliminate redundant overlapping bounding boxes. After an object detection model generates multiple bounding boxes for a detected object, NMS helps retain only the most confident box.

How it Works:

1. For each detected object, sort the bounding boxes based on their confidence scores.  
2. Select the box with the highest score and eliminate all other boxes that have an IoU above a predefined threshold with the selected box.
3. Repeat the process with the next highest scoring box until all boxes are processed.

6. Download Road Sign Dataset from the following link:
https://drive.google.com/drive/folders/1yvzEtFDqodCUIssXIKMrQ_YtL_0gNHyA



7. Perform necessary data transformation and augmentation steps (1 point)


In [None]:
from torchvision import transforms

# Define data transformations and augmentations
data_transforms = transforms.Compose([
    transforms.Resize((128, 128)),   # Resize images to a standard size
    transforms.RandomHorizontalFlip(), # Random horizontal flip
    transforms.RandomRotation(10),     # Random rotation
    transforms.ToTensor(),             # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # Normalize
])

8. Plot some random images from the train, test, and validation set. (2 points)


In [None]:
import matplotlib.pyplot as plt
import random

# Function to display random images from the dataset
def show_random_images(dataset, num_images=3):
    fig, axs = plt.subplots(1, num_images, figsize=(15, 5))
    for ax in axs:
        idx = random.randint(0, len(dataset) - 1)
        img, label = dataset[idx]
        ax.imshow(img.permute(1, 2, 0).numpy())  # Convert from CxHxW to HxWxC
        ax.set_title(f'Label: {label}')
        ax.axis('off')
    plt.show()

# Display random images from the training set
show_random_images(train_dataset)

9. Pick one model of your choice and implement it from scratch to perform object
detection.


In [None]:
# YOLO, SSD, or Faster R-CNN

In [None]:
# Faster R-CNN
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F

# Load pre-trained model
model = fasterrcnn_resnet50_fpn(pretrained=True)

# Load and preprocess image
image = F.to_tensor(image).unsqueeze(0)
model.eval()
predictions = model(image)

10. Compute the IOU of your results with the test set and print a few predicted images.


In [None]:
# def calculate_iou(boxA, boxB):
#     # Coordinates of intersection rectangle
#     xA = max(boxA[0], boxB[0])
#     yA = max(boxA[1], boxB[1])
#     xB = min(boxA[2], boxB[2])
#     yB = min(boxA[3], boxB[3])

#     # Intersection area
#     interArea = max(0, xB - xA) * max(0, yB - yA)

#     # Union area
#     boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
#     boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
#     unionArea = boxAArea + boxBArea - interArea

#     # IoU calculation
#     iou = interArea / unionArea
#     return iou

In [None]:
# Function to compute IoU
def compute_iou(box1, box2):
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])

    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    iou = inter_area / float(box1_area + box2_area - inter_area)
    return iou

# Example usage
# box1 = [x1, y1, x2, y2] format for predicted and ground truth boxes
iou_score = compute_iou(box1, box2)
print(f'IoU: {iou_score}')


11. Use pre-trained models such as YOLOv8 for object detection and print the IOU.


In [None]:
# Function to compute IoU
def compute_iou(box1, box2):
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])

    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    iou = inter_area / float(box1_area + box2_area - inter_area)
    return iou

# Example usage
# box1 = [x1, y1, x2, y2] format for predicted and ground truth boxes
iou_score = compute_iou(box1, box2)
print(f'IoU: {iou_score}')

12. Experiment with pre-trained models and show the IOU of the test data set. Show
tables and graphs of how the results change.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Example data
results_data = {
    'Model': ['YOLOv3', 'Faster R-CNN', 'YOLOv8'],
    'IoU Score': [0.65, 0.70, 0.75]
}
results_df = pd.DataFrame(results_data)

# Plotting results
plt.bar(results_df['Model'], results_df['IoU Score'])
plt.xlabel('Model')
plt.ylabel('IoU Score')
plt.title('IoU Scores of Different Object Detection Models')
plt.show()

13. Submit a short report on the model you chose for Part 1 and why. Include the IOU results
in the report. Discuss your observations and the hyperparameters you used for the
model.
(Step 9-13 30 points)