In [1]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, transforms
from torchvision.models import inception_v3, Inception_V3_Weights
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
import matplotlib.pyplot as plt
import time
import os
import copy
import random
from PIL import Image
from torch.utils.data import (TensorDataset, 
                              Dataset, 
                              Subset,
                              random_split,
                              DataLoader,
                              RandomSampler, 
                              SequentialSampler, 
                              )
from transformers import BertTokenizer, BertForSequenceClassification
from models import initialize_vision_model, initialize_language_model
from GarbageUtils import GarbageDataset, split_dataset, GarbageImageFolder, append_value

import transformers

device = torch.device("cuda" if torch.cuda.is_available else "cpu")

In [2]:
# from torchvision.models import (inception_v3, 
#                                 Inception_V3_Weights,
#                                 efficientnet_b7, 
#                                 EfficientNet_B7_Weights, 
#                                 mobilenet_v2, 
#                                 MobileNet_V2_Weights)

In [3]:
# model_ft = mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT)
# model_ft

In [4]:
print(f'Device: {device}')
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)
print("Transformers Version: ", transformers.__version__)

torch.cuda.empty_cache()



Device: cuda
PyTorch Version:  2.5.1+cu124
Torchvision Version:  0.20.1+cu124
Transformers Version:  4.46.3


In [5]:
data_dir = "./data"
vision_model_name = "mobilenet_v2"
language_model_name = "bert-base-uncased"
num_classes = 4
batch_size = 16 #4 #32 #16
epochs = 100
feature_extract = True

# remember to set your CSV_NAME and PATH to save model weights
learning_rate = 0.001
CSV_NAME = "100epochs_lr_0_001_mobilenet_v2_bs16.csv"
PATH = "100epochs_lr_0_001_mobilenet_v2_bs16.pth"

In [6]:
# if language_model_name == "bert-base-uncased":
#     out_features = 2052

In [7]:
if vision_model_name == "inception":
    out_features = 2052
elif vision_model_name == "efficientnet_b7":
    out_features = 2564
elif vision_model_name == "mobilenet_v2":
    out_features = 1284

In [8]:
vision_model, input_size = initialize_vision_model(vision_model_name, num_classes, feature_extract, multimodal=True)
language_model, tokenizer = initialize_language_model(language_model_name, num_classes, multimodal=True)


Initializing MobileNetV2 with weights=MobileNet_V2_Weights.DEFAULT ...
Input size = 224


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Initializing Bert-Base-Uncased...


In [9]:
vision_model

MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

In [10]:
def validate_image(path):
    try:
        im = Image.open(path)
        return True
    except:
        return False

In [11]:
image_dataset = GarbageImageFolder(data_dir, is_valid_file=validate_image)

In [12]:
image_dataset

Dataset GarbageImageFolder
    Number of datapoints: 5312
    Root location: ./data

In [13]:
classes =  image_dataset.classes
num_classes = len(classes)
print(classes)
print(f'Num of Classes: {num_classes}')

['black', 'blue', 'green', 'other']
Num of Classes: 4


In [14]:
image_dataset.class_to_idx

{'black': 0, 'blue': 1, 'green': 2, 'other': 3}

In [15]:
a = slice(-3, -1)
image_dataset[0:6]

[(<PIL.Image.Image image mode=RGB size=800x800 at 0x255D04A1CD0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x255CE0360A0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x255CE0369D0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x255C1DFC5B0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x255C0E1EA90>, 0),
 (<PIL.Image.Image image mode=RGB size=1734x1301 at 0x255C0E1EB50>, 0)]

In [16]:
image_dataset[a]

[(<PIL.Image.Image image mode=RGB size=1155x1600 at 0x255D04A3580>, 3),
 (<PIL.Image.Image image mode=RGB size=2615x3044 at 0x255D04A3640>, 3)]

In [17]:
train_set, val_set, test_set = split_dataset(image_dataset.imgs, test_size=0.2)

In [18]:
def get_dataloaders(input_size, train_set, val_set, test_set):
    from torchvision import datasets, transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    
    train_set = GarbageDataset(train_set, is_subset=False, transform=data_transforms['train'])
    val_set = GarbageDataset(val_set,  is_subset=False, transform=data_transforms['val'])
    test_set = GarbageDataset(test_set,  is_subset=False, transform=data_transforms['val'])
    
    print("Loading data...")
    print(f'Train set size: {len(train_set)}')
    print(f'Val set size: {len(val_set)}')
    print(f'Test set size: {len(test_set)}')
    
    dataloaders_dict = {
        'train': DataLoader(train_set, batch_size = batch_size, shuffle=True, num_workers=4, drop_last=True),
        'val': DataLoader(val_set, batch_size = batch_size, shuffle=False, num_workers=4, drop_last=True)
    }
    
    test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=4, drop_last=True)
    
#     print("Loading Datasets and Initializing DataLoaders...")
    return test_dataloader, dataloaders_dict

test_dataloader, dataloaders_dict = get_dataloaders(input_size, train_set, val_set, test_set)

Loading data...
Train set size: 3187
Val set size: 1062
Test set size: 1062


In [19]:
class MultiModalGarbageModel(torch.nn.Module):
    def __init__(self, num_classes, text_module, vision_module, text_module_name, vision_module_name,
                 out_features_combined, dropout_p=None):
        super(MultiModalGarbageModel, self).__init__()
        self.text_module = text_module
        self.vision_module = vision_module
        self.text_module_name = text_module_name
        self.vision_module_name = vision_module_name
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(out_features_combined, int(out_features_combined/2)),
            nn.ReLU(),
            nn.Linear(int(out_features_combined/2), num_classes),
        )

    def forward(self, vision_data, text_data, attention_mask):
        text_out, vision_out = 0, 0
        
        # get output from vision model
        if self.vision_module_name == "inception":
            self.vision_module.aux_logits = False
            vision_out = self.vision_module(vision_data)
        else:
            vision_out = self.vision_module(vision_data)

        # get output from text model    
        if self.text_module_name == "bert-base-uncased":
            out = self.text_module(text_data, attention_mask)
            text_out = out[0]
        else:
            text_out = self.text_module(text_data, attention_mask)
            
        combined = torch.cat((vision_out, text_out), dim=1)
        combined = combined.view(combined.size(0), -1)
        logits = self.linear_relu_stack(combined)
#         combined = self.fc(combined)
        
        return logits

In [20]:
model = MultiModalGarbageModel(num_classes, language_model, vision_model, language_model_name,
                                          vision_model_name, out_features)

In [21]:
def train_model(model, dataloaders, criterion, optimizer, epochs=25, is_inception=False, result_dict=None):
    since = time.time()
    
    val_acc_history = list()
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    for epoch in range(1, epochs+1):
#         print(f'Epoch {epoch+1}/{epochs}')
#         print("-" * 10)
        
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            
            #Iterate over the data # image_file, label, input_ids, attention_mask, file_name
            for inputs, labels, in_ids, att_mask in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                in_ids = in_ids.to(device)
                att_mask = att_mask.to(device)
                
                optimizer.zero_grad() # to zero the parameter gradients
                
                # forward, track history if only in train mode
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs, in_ids, att_mask)
                    loss = criterion(outputs, labels)
#                     if is_inception and phase == "train":
#                         outputs, aux_outputs = model(inputs)
#                         loss1 = criterion(outputs, labels)
#                         loss2 = criterion(aux_outputs, labels)
#                         loss = loss1 + 0.4*loss2
#                     else:
#                         outputs = model(inputs)
#                         loss = criterion(outputs, labels)
                    
                    _, preds = torch.max(outputs, 1)
                    
                    # backward, optimize only if in train mode
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                        
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            
#             print(f'{phase} Loss: {epoch_loss} Accuracy: {epoch_acc}')
            print('-' * 59)
            print('| Epoch {:3d}/{:3d} | {} Loss: {:8.3f} | {} Accuracy {:8.3f} |'.format(
                epoch, epochs, phase, epoch_loss, phase, epoch_acc))
            print('-' * 59)
            
            if result_dict is not None:
                append_value(result_dict, "Epoch", epoch)
                append_value(result_dict, phase+" Accuracy", epoch_acc)
                append_value(result_dict, phase+" Loss", epoch_loss)
            
            # deepcopy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        print()
        
    time_elapsed = time.time() - since
    print('Training Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Accuracy: {:.04f}'.format(best_acc))
    
    model.load_state_dict(best_model_wts)
    
    return model, val_acc_history

In [22]:
# send model to device: gpu or cpu
model = model.to(device)

params_to_update = model.parameters()
print("Parameters to learn:")
if feature_extract:
    params_to_update = list()
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)
else:
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            print("\t", name)
            
optimizer_ft = optim.SGD(params_to_update, lr=learning_rate, momentum=0.9)

Parameters to learn:
	 text_module.bert.embeddings.word_embeddings.weight
	 text_module.bert.embeddings.position_embeddings.weight
	 text_module.bert.embeddings.token_type_embeddings.weight
	 text_module.bert.embeddings.LayerNorm.weight
	 text_module.bert.embeddings.LayerNorm.bias
	 text_module.bert.encoder.layer.0.attention.self.query.weight
	 text_module.bert.encoder.layer.0.attention.self.query.bias
	 text_module.bert.encoder.layer.0.attention.self.key.weight
	 text_module.bert.encoder.layer.0.attention.self.key.bias
	 text_module.bert.encoder.layer.0.attention.self.value.weight
	 text_module.bert.encoder.layer.0.attention.self.value.bias
	 text_module.bert.encoder.layer.0.attention.output.dense.weight
	 text_module.bert.encoder.layer.0.attention.output.dense.bias
	 text_module.bert.encoder.layer.0.attention.output.LayerNorm.weight
	 text_module.bert.encoder.layer.0.attention.output.LayerNorm.bias
	 text_module.bert.encoder.layer.0.intermediate.dense.weight
	 text_module.bert.encode

In [23]:
result_dict = {}
criterion = nn.CrossEntropyLoss()
model, hist = train_model(model, dataloaders_dict, criterion, optimizer_ft, epochs = 100, result_dict=result_dict,)

-----------------------------------------------------------
| Epoch   1/100 | train Loss:    1.256 | train Accuracy    0.425 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch   1/100 | val Loss:    1.155 | val Accuracy    0.494 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch   2/100 | train Loss:    1.112 | train Accuracy    0.543 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch   2/100 | val Loss:    1.007 | val Accuracy    0.582 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch   3/100 | train Loss:    0.989 | train Accuracy    0.601 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch   3

-----------------------------------------------------------
| Epoch  23/100 | val Loss:    0.753 | val Accuracy    0.689 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  24/100 | train Loss:    0.788 | train Accuracy    0.683 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  24/100 | val Loss:    0.745 | val Accuracy    0.692 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  25/100 | train Loss:    0.785 | train Accuracy    0.693 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  25/100 | val Loss:    0.745 | val Accuracy    0.699 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  26/10

-----------------------------------------------------------
| Epoch  46/100 | train Loss:    0.744 | train Accuracy    0.711 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  46/100 | val Loss:    0.746 | val Accuracy    0.702 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  47/100 | train Loss:    0.722 | train Accuracy    0.707 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  47/100 | val Loss:    0.725 | val Accuracy    0.718 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  48/100 | train Loss:    0.742 | train Accuracy    0.704 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  48

-----------------------------------------------------------
| Epoch  68/100 | val Loss:    0.712 | val Accuracy    0.718 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  69/100 | train Loss:    0.687 | train Accuracy    0.733 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  69/100 | val Loss:    0.710 | val Accuracy    0.716 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  70/100 | train Loss:    0.695 | train Accuracy    0.728 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  70/100 | val Loss:    0.723 | val Accuracy    0.722 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  71/10

-----------------------------------------------------------
| Epoch  91/100 | train Loss:    0.623 | train Accuracy    0.753 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  91/100 | val Loss:    0.689 | val Accuracy    0.729 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  92/100 | train Loss:    0.634 | train Accuracy    0.749 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  92/100 | val Loss:    0.693 | val Accuracy    0.724 |
-----------------------------------------------------------

-----------------------------------------------------------
| Epoch  93/100 | train Loss:    0.638 | train Accuracy    0.741 |
-----------------------------------------------------------
-----------------------------------------------------------
| Epoch  93

In [24]:
# for x in range(1, 1062):
#     if 1062%x == 0:
#         print(f'rem: {1062%x} --> {x}')

In [25]:
import numpy as np
import pandas as pd

In [26]:
model_copy = model

In [27]:
# CSV_NAME = "100epochs_lr_0.01.csv"
# PATH = "100_epochs_lr_0.01.pth"

In [28]:
torch.save(model.state_dict(), PATH)

In [29]:
result_dict2 = result_dict

In [30]:
result_dict2['Epoch'] = list(dict.fromkeys(result_dict2['Epoch']))

In [31]:
result_dict2['train Accuracy'] = list(acc.item() for acc in result_dict2['train Accuracy'])

In [32]:
result_dict2['val Accuracy'] = list(acc.item() for acc in result_dict2['val Accuracy'])

In [33]:
df = pd.DataFrame.from_dict(result_dict2)

In [34]:
df

Unnamed: 0,Epoch,train Accuracy,train Loss,val Accuracy,val Loss
0,1,0.425479,1.256091,0.494350,1.155301
1,2,0.542516,1.111588,0.581921,1.006653
2,3,0.600565,0.989055,0.679849,0.861571
3,4,0.617195,0.946398,0.660075,0.869395
4,5,0.632256,0.908349,0.669492,0.840908
...,...,...,...,...,...
95,96,0.761531,0.613430,0.724105,0.703347
96,97,0.765924,0.609029,0.721281,0.697134
97,98,0.765610,0.609600,0.712806,0.696553
98,99,0.754942,0.620260,0.731638,0.700342


In [35]:
df.to_csv(CSV_NAME)

In [36]:
model = MultiModalGarbageModel(num_classes, language_model, vision_model, language_model_name,
                                          vision_model_name, out_features)
model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [None]:
#Imports for confusion matrix.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import pandas as pd

In [2]:
def evaluate(test_loader):

    #actual vs predicted label list for confusion matrix
    actual_label = []
    predict_label = []

    #For incorrect lists.
    incorrect_samples = []
    incorrect_labels = []
    incorrect_pred_labels = []
    incorrect_filenames = []

    # Test loop
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad(): # inputs, labels, in_ids, att_mask
        for batch in test_loader:
            image_input, labels, input_ids, attention_mask = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            image_input = image_input.to(device)
            #add the actual label for confusion matrix
            actual_label.extend(labels.cpu().numpy())
            
            outputs = model(image_input, input_ids, attention_mask=attention_mask)
            logits = outputs
            _, predicted = torch.max(logits, 1)
            #add predicted label for confusion matrix
            predict_label.extend(labels.cpu().numpy())
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            #For each batch, store a mask of incorrect indices, then append the info to the lists
            incorrect_index = ((predicted == labels.view_as(predicted)) == False).view(-1)
            incorrect_samples.append(image_input[incorrect_index].cpu().numpy())
            incorrect_labels.append(labels[incorrect_index].cpu().numpy())
            incorrect_pred_labels.append(predicted[incorrect_index].cpu().numpy())

            #Get filenames using batch decode
            decoded_names = tokenizer.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            incorrect_filenames_by_batch = []

            #Store in incorrect_filenames based on the incorrect_index mask
            for i in range(len(incorrect_index)):
                if incorrect_index[i]:
                    incorrect_filenames_by_batch.append(decoded_names[i])
            incorrect_filenames.append(incorrect_filenames_by_batch)

    #Show the confusion matrix at the end.
    conf_matrix = confusion_matrix(actual_label, predict_label)
    ConfusionMatrixDisplay(conf_matrix).plot()

    #For misclassified labels
    fig = plt.figure(figsize=(8, 10))

    #Loop through first entry in each batch [idx][0]
    for idx in np.arange(5):
        ax = fig.add_subplot(3, 2, idx+1, xticks=[], yticks=[])
        misclassified_data = incorrect_samples[idx][0]
        misclassified_data = misclassified_data/2 + 0.5
        msc_image = misclassified_data
        msc_image = np.clip(msc_image, 0, 1)
        msc_image = msc_image.T
        plt.imshow(np.squeeze(msc_image))
        plt.title('actual_label: {0} predicted_label: {1}\n object name: {2} '
                  .format(incorrect_labels[idx][0], incorrect_pred_labels[idx][0], incorrect_filenames[idx][0]), wrap=True)
        
    plt.show()
            
    return correct/total

In [38]:
test = evaluate(test_dataloader)
test

0.7433712121212122

In [39]:
# model_copy.state_dict()

In [40]:
#lr = 0.01 - test acc = 0.6875
#lr = 0.001 - test acc = 0.82197
#lr = 0.0001 - test acc = 0.80208
#lr = 0.00001 - test acc = 0.58049

#lr = 0.001 - test acc = 0.81345 (batch_size = 32)
#lr = 0.001 - test acc = 0.6981 (batch_size = 8)

#lr = 0.001 - test acc = 0.74337(mobilenet-bert)