In [280]:
import pandas as pd
import numpy as np
import torch

In [24]:
!pip install torch torchvision

Collecting torchvision
  Downloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.21.0


In [30]:
## Import dataset

## help from: https://www.learnpytorch.io/03_pytorch_computer_vision/, chatGPT

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## Question - What other transforms (if any) should I use?

transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize images
])

path_train = "/Users/sarahhaddix/code/aiclub/jetsondrone/Training"
path_test = "/Users/sarahhaddix/code/aiclub/jetsondrone/Test"

train_data = datasets.ImageFolder(root=path_train, transform=transform)
test_data = datasets.ImageFolder(root=path_test, transform=transform)

In [36]:
len(train_data), len(test_data) # There are 39375 training images and 8617 test images

(39375, 8617)

In [82]:
type(train_data)

torchvision.datasets.folder.ImageFolder

In [46]:
## Create dataloaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [42]:
## Hyper-parameters
batch_size = 32

In [148]:
## Use Apple Silicon GPU
# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(device)

mps


In [90]:
for images, labels in train_loader:
    print(images.shape) # ([32, 3, 254, 254])
    for i in range(images.size(0)):
        print(f"Image {i} dimensions: {images[i].shape}")
    break

# Images are 3x254x254

torch.Size([32, 3, 254, 254])
Image 0 dimensions: torch.Size([3, 254, 254])
Image 1 dimensions: torch.Size([3, 254, 254])
Image 2 dimensions: torch.Size([3, 254, 254])
Image 3 dimensions: torch.Size([3, 254, 254])
Image 4 dimensions: torch.Size([3, 254, 254])
Image 5 dimensions: torch.Size([3, 254, 254])
Image 6 dimensions: torch.Size([3, 254, 254])
Image 7 dimensions: torch.Size([3, 254, 254])
Image 8 dimensions: torch.Size([3, 254, 254])
Image 9 dimensions: torch.Size([3, 254, 254])
Image 10 dimensions: torch.Size([3, 254, 254])
Image 11 dimensions: torch.Size([3, 254, 254])
Image 12 dimensions: torch.Size([3, 254, 254])
Image 13 dimensions: torch.Size([3, 254, 254])
Image 14 dimensions: torch.Size([3, 254, 254])
Image 15 dimensions: torch.Size([3, 254, 254])
Image 16 dimensions: torch.Size([3, 254, 254])
Image 17 dimensions: torch.Size([3, 254, 254])
Image 18 dimensions: torch.Size([3, 254, 254])
Image 19 dimensions: torch.Size([3, 254, 254])
Image 20 dimensions: torch.Size([3, 254,

In [52]:
# Check the class indices
print(train_data.class_to_idx)  # Output: {'Fire': 0, 'No_Fire': 1}

{'Fire': 0, 'No_Fire': 1}


In [None]:
## MODELS ##

In [256]:
## Simple CNN from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
## originally for 10 category classification, added last linear layer
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5) # convolutional layer, takes 3 channel input, 6 output channels => outputs 6 feature maps. 5x5 kernel size
        self.pool = nn.MaxPool2d(2, 2) # 2x2 window with a stride of 2. Chooses max value in that 2x2 window and that becomes the output
        self.conv2 = nn.Conv2d(6, 16, 5) # conv, takes 6 channel input, 16 channel output, 5x5 kernel
        self.fc1 = nn.Linear(16 * 5 * 5 * 144, 120) # 16*5*5*144 -> 120
        self.fc2 = nn.Linear(120, 84) # 120 -> 84
        self.fc3 = nn.Linear(84, 10) # 84 -> 10
        self.fc4 = nn.Linear(10, 2) # 10 -> 2 (binary classification)

    # batch_size=32
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x))) # [(32, 6, 254, 254)] -> [(32, 6, 125, 125)]
        x = self.pool(F.relu(self.conv2(x))) # [(32, 6, 125, 125)] -> [(32, 16, 60, 60)]
        x = torch.flatten(x, 1) # flatten all dimensions except batch (becomes 32 1d vectors) [(32, 16, 60, 60)] -> [(32, 57600)]
        x = F.relu(self.fc1(x)) # [(32, 57600)] -> [(32, 120)]
        x = F.relu(self.fc2(x)) # [(32,120)] -> [(32, 84)]
        x = self.fc3(x) #[(32, 84)] -> [(32, 10)]
        x = self.fc4(x) #[(32, 10)] -> [(32, 2)]
        return x


net = Net().to(device) # send model to gpu

# n feature maps - convolutional layer will learn n filters (different kernels), each of which convolves over the image to produce 
# a different feature map. Size of kernel is 3rd parameter - i.e. the first layer uses a 5x5 kernel

# Max pooling is a down-sampling operation that reduces the size of the input feature maps while retaining the most important 
# information. It does this by sliding a window (or kernel) over the input and taking the maximum value within that window.


In [258]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [270]:
# Training loop
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0): # my guess is that there's 39375/32 = 1136.7 = 1137 mini-batches
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # Move inputs and labels to the MPS device
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        preds = torch.argmax(outputs, 1)
        loss = criterion(outputs, labels) # [(32, 2)] and [(32)]
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 300 == 0:    # print every 300 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
            print(labels)
            print(preds)

print('Finished Training')

[1,     1] loss: 0.000
tensor([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 1], device='mps:0')
tensor([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 1], device='mps:0')
[1,   301] loss: 0.003
tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 0], device='mps:0')
tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 0], device='mps:0')
[1,   601] loss: 0.003
tensor([0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1], device='mps:0')
tensor([0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1], device='mps:0')
[1,   901] loss: 0.003
tensor([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 0, 0], device='mps:0')
tenso

In [266]:
PATH = './init_cnn_2.pth'
torch.save(net.state_dict(), PATH)

In [278]:
# from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

import matplotlib.pyplot as plt

# load net from saved weights
net = Net().to(device)
net.load_state_dict(torch.load('./init_cnn_2.pth', weights_only=True))

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for i, data in enumerate(test_loader, 0): # doing testing in batches of 32
        images, labels = data # images: ([32, 3, 254, 254]), labels: ([32])
        inputs, labels = images.to(device), labels.to(device) # added myself
        # calculate outputs by running images through the network
        outputs = net(inputs) # ([32, 2])
        # the class with the highest energy is what we choose as prediction
        predicted = torch.argmax(outputs, 1) # predicted: ([32])
        total += labels.size(0) # (32)
        correct += (predicted == labels).sum().item()

        """
        # Visualization: Display the first few images in the batch
        num_images_to_display = 5  # Number of images to display
        if True:  # Only visualize the first batch
            plt.figure(figsize=(15, 5))
            for j in range(num_images_to_display):
                # Convert the image tensor to numpy for visualization
                image = images[j].cpu().numpy().transpose(1, 2, 0)  # Change from (C, H, W) to (H, W, C)
                image = (image - image.min()) / (image.max() - image.min())  # Normalize to [0, 1]
                
                plt.subplot(1, num_images_to_display, j + 1)
                plt.imshow(image)
                plt.title(f'True: {labels[j].item()}, Pred: {predicted[j].item()}')
                plt.axis('off')
            plt.show()
        """

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

Accuracy of the network on the 10000 test images: 43 %


In [298]:
## Use Apple Silicon GPU
# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(device)

mps


In [339]:
## More complicated CNN - based on VGG-16 - https://medium.com/@siddheshb008/vgg-net-architecture-explained-71179310050f

# images are (32, 3, 254, 254), have been resized to (32, 3, 224, 224)
class CNN_2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1a = nn.Conv2d(3, 64, 3, padding='same') # 3 channel input, 64 channel output, 3x3 kernel
        self.conv1b = nn.Conv2d(64, 64, 3, padding='same')
        self.pool = nn.MaxPool2d(2, 2) # 2x2 window with a stride of 2. Chooses max value in that 2x2 window and that becomes the output
        self.conv2a = nn.Conv2d(64, 128, 3, padding='same') 
        self.conv2b = nn.Conv2d(128, 128, 3, padding='same')
        self.conv3a = nn.Conv2d(128, 256, 3, padding='same')
        self.conv3b = nn.Conv2d(256, 256, 3, padding='same')
        self.conv4a = nn.Conv2d(256, 512, 3, padding='same')
        self.conv4b = nn.Conv2d(512, 512, 3, padding='same')

        self.dropout = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(25088, 4096) 
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1000) 
        self.fc4 = nn.Linear(1000, 2) # 10 -> 2 (binary classification)

    def forward(self, x):
        print('before layer 1')
        print(x.shape)
        x = self.pool(F.relu(self.conv1b(F.relu(self.conv1a(x))))) # layer 1
        print('after layer 1')
        print(x.shape)
        """
        print('xxxx')
        x = self.conv2a(x)
        print(x.shape)
        x = F.relu(x)
        print(x.shape)
        x = self.conv2b(x)
        print(x.shape)
        x = F.relu(x)
        print(x.shape)
        x = self.pool(x)
        print(x.shape)
        print('xxx')
        """
        x = self.pool(F.relu(self.conv2b(F.relu(self.conv2a(x))))) # layer 2
        print('after layer 2')
        print(x.shape)
        x = self.pool(F.relu(self.conv3b(F.relu(self.conv3b(F.relu(self.conv3a(x))))))) # layer 3
        print('after layer 3')
        print(x.shape)
        x = self.pool(F.relu(self.conv4b(F.relu(self.conv4b(F.relu(self.conv4a(x))))))) # layer 4
        print('after layer 4')
        print(x.shape)
        x = self.pool(F.relu(self.conv4b(F.relu(self.conv4b(F.relu(self.conv4b(x))))))) # layer 5

        print('before flatten')
        print(x.shape)
        x = torch.flatten(x, 1) # flatten all dimensions except batch (becomes 32 1d vectors) [(32, 16, 60, 60)] -> [(32, 57600)]
        print(x.shape)
        x = self.dropout(F.relu(self.fc1(x)))
        
        x = self.dropout(F.relu(self.fc2(x)))
        
        x = F.relu(self.fc3(x))
        
        x = self.fc4(x) 
        return x


net = CNN_2().to(device) # send model to gpu

# n feature maps - convolutional layer will learn n filters (different kernels), each of which convolves over the image to produce 
# a different feature map. Size of kernel is 3rd parameter - i.e. the first layer uses a 5x5 kernel

# Max pooling is a down-sampling operation that reduces the size of the input feature maps while retaining the most important 
# information. It does this by sliding a window (or kernel) over the input and taking the maximum value within that window.

In [310]:
# Data + transforms

## Import dataset

## help from: https://www.learnpytorch.io/03_pytorch_computer_vision/, chatGPT

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## Question - What other transforms (if any) should I use?

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize images
])

path_train = "/Users/sarahhaddix/code/aiclub/jetsondrone/Training"
path_test = "/Users/sarahhaddix/code/aiclub/jetsondrone/Test"

train_data = datasets.ImageFolder(root=path_train, transform=transform)
test_data = datasets.ImageFolder(root=path_test, transform=transform)

# There are 39375 training images and 8617 test images

## Create dataloaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

## Hyper-parameters
batch_size = 32

In [312]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [341]:
# Training loop
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0): # my guess is that there's 39375/32 = 1136.7 = 1137 mini-batches
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # Move inputs and labels to the MPS device
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels) # [(32, 2)] and [(32)]
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 300 == 0:    # print every 300 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

before layer 1
torch.Size([32, 3, 224, 224])
after layer 1
torch.Size([32, 64, 112, 112])
after layer 2
torch.Size([32, 128, 56, 56])
after layer 3
torch.Size([32, 256, 28, 28])
after layer 4
torch.Size([32, 512, 14, 14])
before flatten
torch.Size([32, 512, 7, 7])
torch.Size([32, 25088])


RuntimeError: MPS backend out of memory (MPS allocated: 17.75 GB, other allocations: 4.08 MB, max allowed: 18.13 GB). Tried to allocate 392.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
## Fine-tuned ConvNet (https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)
# freezing all weights of ConvNet except for final fully connected layer

# License: BSD
# Author: Sasank Chilamkurthy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory

cudnn.benchmark = True
plt.ion()   # interactive mode

# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

"""
path_train = "/Users/sarahhaddix/code/aiclub/jetsondrone/Training"
path_test = "/Users/sarahhaddix/code/aiclub/jetsondrone/Test"

train_data = datasets.ImageFolder(root=path_train, transform=transform)
test_data = datasets.ImageFolder(root=path_test, transform=transform)
"""

data_dir = 'data/hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

# We want to be able to train our model on an `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU.

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

In [None]:
## Resnet


In [None]:
## Pyramid vision transformer

In [None]:
## Decide serverity of fire? Also, images all seem to be in snow
# VGG architecutre
# BCEWithlogits -> use torch.round(torch.sigmoid(y_logits))
# Data augmentation - zoom in on fire, other augmentations
# conv, batchnorm2, relu, conv, batchnorm, maxpool2d (he did 2 of these, skip layer, 2 more)
# backbone - put it through the backbone first, then through other stuff. Mobilenet backbone might be pretty good
# VGA monitor - 

In [None]:
## Comparison of models
# first CNN fucking sucked