In [1]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, transforms
from torchvision.models import inception_v3, Inception_V3_Weights
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
import matplotlib.pyplot as plt
import time
import os
import copy
import random
from PIL import Image
from torch.utils.data import (TensorDataset, 
                              Dataset, 
                              Subset,
                              random_split,
                              DataLoader,
                              RandomSampler, 
                              SequentialSampler, 
                              )
from transformers import BertTokenizer, BertForSequenceClassification
from models import initialize_vision_model, initialize_language_model
from GarbageUtils import GarbageDataset, split_dataset, GarbageImageFolder



device = torch.device("cuda" if torch.cuda.is_available else "cpu")

In [2]:
print(f'Device: {device}')
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

torch.cuda.empty_cache()

Device: cuda
PyTorch Version:  1.13.1+cu117
Torchvision Version:  0.14.1+cu117


In [3]:
data_dir = "./data"
vision_model_name = "inception"
language_model_name = "bert-base-uncased"
num_classes = 4
batch_size = 64
epochs = 5
feature_extract = True

out_features = 2052

In [4]:
if language_model_name == "bert-base-uncased":
    out_features = 2052

In [5]:
vision_model, input_size, vision_out_features = initialize_vision_model(vision_model_name, num_classes, feature_extract, 
                                                                        use_pretrained=True)
language_model, tokenizer = initialize_language_model(language_model_name, num_classes, multimodal=True)


Initializing InceptionV3 with weights=Inception_V3_Weights.DEFAULT...
Input size = 299


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Initializing Bert-Base-Uncased...


In [6]:
vision_model

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [8]:
def validate_image(path):
    try:
        im = Image.open(path)
        return True
    except:
        return False

In [9]:
image_dataset = GarbageImageFolder(data_dir, is_valid_file=validate_image)

In [10]:
image_dataset

Dataset GarbageImageFolder
    Number of datapoints: 5312
    Root location: ./data

In [11]:
classes =  image_dataset.classes
num_classes = len(classes)
print(classes)
print(f'Num of Classes: {num_classes}')

['black', 'blue', 'green', 'other']
Num of Classes: 4


In [12]:
image_dataset.class_to_idx

{'black': 0, 'blue': 1, 'green': 2, 'other': 3}

In [13]:
a = slice(-3, -1)
image_dataset[0:6]

[(<PIL.Image.Image image mode=RGB size=800x800 at 0x1ADB446AAC0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x1ADB446A5B0>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x1ADB446A940>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x1ADB446AA90>, 0),
 (<PIL.Image.Image image mode=RGB size=800x800 at 0x1ADB446AB20>, 0),
 (<PIL.Image.Image image mode=RGB size=1734x1301 at 0x1ADB446A880>, 0)]

In [14]:
image_dataset[a]

[(<PIL.Image.Image image mode=RGB size=1155x1600 at 0x1ADB446A8E0>, 3),
 (<PIL.Image.Image image mode=RGB size=2615x3044 at 0x1ADB446AC10>, 3)]

In [15]:
train_set, val_set, test_set = split_dataset(image_dataset.imgs, test_size=0.2)

In [16]:
def get_dataloaders(input_size, train_set, val_set, test_set):
    from torchvision import datasets, transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    
    train_set = GarbageDataset(train_set, is_subset=False, transform=data_transforms['train'])
    val_set = GarbageDataset(val_set,  is_subset=False, transform=data_transforms['val'])
    test_set = GarbageDataset(test_set,  is_subset=False, transform=data_transforms['val'])
    
    print("Loading data...")
    print(f'Train set size: {len(train_set)}')
    print(f'Val set size: {len(val_set)}')
    print(f'Test set size: {len(test_set)}')
    
    dataloaders_dict = {
        'train': DataLoader(train_set, batch_size = batch_size, shuffle=True, num_workers=4, drop_last=True),
        'val': DataLoader(val_set, batch_size = batch_size, shuffle=False, num_workers=4, drop_last=True)
    }
    
    test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=4, drop_last=True)
    
#     print("Loading Datasets and Initializing DataLoaders...")
    return test_dataloader, dataloaders_dict

test_dataloader, dataloaders_dict = get_dataloaders(input_size, train_set, val_set, test_set)

Loading data...
Train set size: 3187
Val set size: 1062
Test set size: 1062


In [19]:
class MultiModalGarbageModel(torch.nn.Module):
    def __init__(self, num_classes, text_module, vision_module, text_module_name, vision_module_name,
                 out_features_combined, dropout_p=None):
        super(MultiModalGarbageModel, self).__init__()
        self.text_module = text_module
        self.vision_module = vision_module
        self.text_module_name = text_module_name
        self.vision_module_name = vision_module_name
        self.fc = torch.nn.Linear(out_features_combined, num_classes)
#         self.dropout = torch.nn.Dropout(dropout_p)

    def forward(self, vision_data, text_data, attention_mask):
        text_out, vision_out = 0, 0
        
        # get output from vision model
        if self.vision_module_name == "inception":
            self.vision_module.aux_logits = False
            vision_out = self.vision_module(vision_data)

        # get output from text model    
        if self.text_module_name == "bert-base-uncased":
            out = self.text_module(text_data, attention_mask)
            text_out = out[0]
        else:
            text_out = self.text_module(text_data, attention_mask)
            
        combined = torch.cat((vision_out, text_out), dim=1)
        combined = combined.view(combined.size(0), -1)
        combined_out = self.fc(combined)
        
        return combined_out

In [20]:
model = MultiModalGarbageModel(num_classes, language_model, vision_model, language_model_name,
                                          vision_model_name, out_features)

In [21]:
def train_model(model, dataloaders, criterion, optimizer, epochs=25, is_inception=False):
    since = time.time()
    
    val_acc_history = list()
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    for epoch in range(epochs):
        print(f'Epoch {epoch}/{epochs - 1}')
        print("-" * 10)
        
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            
            #Iterate over the data # image_file, label, input_ids, attention_mask, file_name
            for inputs, labels, in_ids, att_mask in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                in_ids = in_ids.to(device)
                att_mask = att_mask.to(device)
                
#                 all_inputs = {
#                     "input_images": inputs,
#                     "in_ids": in_ids,
#                     "att_mask": att_mask,
#                     "labels": labels,
#                 }
                
                optimizer.zero_grad() # to zero the parameter gradients
                
                # forward, track history if only in train mode
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs, in_ids, att_mask)
                    loss = criterion(outputs, labels)
#                     if is_inception and phase == "train":
#                         outputs, aux_outputs = model(inputs)
#                         loss1 = criterion(outputs, labels)
#                         loss2 = criterion(aux_outputs, labels)
#                         loss = loss1 + 0.4*loss2
#                     else:
#                         outputs = model(inputs)
#                         loss = criterion(outputs, labels)
                    
                    _, preds = torch.max(outputs, 1)
                    
                    # backward, optimize only if in train mode
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                        
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            
            print(f'{phase} Loss: {epoch_loss} Accuracy: {epoch_acc}')
            
            # deepcopy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        print()
        
    time_elapsed = time.time() - since
    print('Training Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Acuuracy: {:.04f}'.format(best_acc))
    
    model.load_state_dict(best_model_wts)
    
    return model, val_acc_history

In [22]:
# def train_model(model, dataloaders, criterion, optimizer, epochs=25, is_inception=False):
#     since = time.time()
    
#     val_acc_history = list()
    
#     best_model_wts = copy.deepcopy(model.state_dict())
#     best_acc = 0.0
    
#     for epoch in range(epochs):
#         print(f'Epoch {epoch}/{epochs - 1}')
#         print("-" * 10)
        
#         for phase in ["train", "val"]:
#             if phase == "train":
#                 model.train()
#             else:
#                 model.eval()
#             running_loss = 0.0
#             running_corrects = 0
            
#             #Iterate over the data # image_file, label, input_ids, attention_mask, file_name
#             for inputs, labels, _, _ in dataloaders[phase]:
#                 inputs = inputs.to(device)
#                 labels = labels.to(device)
                
#                 optimizer.zero_grad() # to zero the parameter gradients
                
#                 # forward, track history if only in train mode
#                 with torch.set_grad_enabled(phase == "train"):
#                     if is_inception and phase == "train":
#                         outputs, aux_outputs = model(inputs)
#                         loss1 = criterion(outputs, labels)
#                         loss2 = criterion(aux_outputs, labels)
#                         loss = loss1 + 0.4*loss2
#                     else:
#                         outputs = model(inputs)
#                         loss = criterion(outputs, labels)
                    
#                     _, preds = torch.max(outputs, 1)
                    
#                     # backward, optimize only if in train mode
#                     if phase == "train":
#                         loss.backward()
#                         optimizer.step()
                        
#                 # statistics
#                 running_loss += loss.item() * inputs.size(0)
#                 running_corrects += torch.sum(preds == labels.data)
                
#             epoch_loss = running_loss / len(dataloaders[phase].dataset)
#             epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            
#             print(f'{phase} Loss: {epoch_loss} Accuracy: {epoch_acc}')
            
#             # deepcopy the model
#             if phase == 'val' and epoch_acc > best_acc:
#                 best_acc = epoch_acc
#                 best_model_wts = copy.deepcopy(model.state_dict())
#             if phase == 'val':
#                 val_acc_history.append(epoch_acc)
#         print()
        
#     time_elapsed = time.time() - since
#     print('Training Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
#     print('Best Validation Acuuracy: {:.04f}'.format(best_acc))
    
#     model.load_state_dict(best_model_wts)
    
#     return model, val_acc_history

In [24]:
# send model to device: gpu or cpu
model = model.to(device)

params_to_update = model.parameters()
print("Parameters to learn:")
if feature_extract:
    params_to_update = list()
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)
else:
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            print("\t", name)
            
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

Parameters to learn:
	 text_module.bert.embeddings.word_embeddings.weight
	 text_module.bert.embeddings.position_embeddings.weight
	 text_module.bert.embeddings.token_type_embeddings.weight
	 text_module.bert.embeddings.LayerNorm.weight
	 text_module.bert.embeddings.LayerNorm.bias
	 text_module.bert.encoder.layer.0.attention.self.query.weight
	 text_module.bert.encoder.layer.0.attention.self.query.bias
	 text_module.bert.encoder.layer.0.attention.self.key.weight
	 text_module.bert.encoder.layer.0.attention.self.key.bias
	 text_module.bert.encoder.layer.0.attention.self.value.weight
	 text_module.bert.encoder.layer.0.attention.self.value.bias
	 text_module.bert.encoder.layer.0.attention.output.dense.weight
	 text_module.bert.encoder.layer.0.attention.output.dense.bias
	 text_module.bert.encoder.layer.0.attention.output.LayerNorm.weight
	 text_module.bert.encoder.layer.0.attention.output.LayerNorm.bias
	 text_module.bert.encoder.layer.0.intermediate.dense.weight
	 text_module.bert.encode

In [27]:
criterion = nn.CrossEntropyLoss()
model, hist = train_model(model, dataloaders_dict, criterion, optimizer_ft, epochs = epochs,)

Epoch 0/4
----------
train Loss: 1.23067207922934 Accuracy: 0.4424223407593348
val Loss: 1.0744725099840182 Accuracy: 0.5423728813559322

Epoch 1/4
----------
train Loss: 1.0817225982598986 Accuracy: 0.5472230938186382
val Loss: 0.978732132417783 Accuracy: 0.6054613935969868

Epoch 2/4
----------
train Loss: 1.0127376895134899 Accuracy: 0.5889551302165045
val Loss: 0.9199248922746733 Accuracy: 0.6177024482109228

Epoch 3/4
----------
train Loss: 0.9506085111236812 Accuracy: 0.615625980545968
val Loss: 0.8956581625785756 Accuracy: 0.6224105461393596

Epoch 4/4
----------
train Loss: 0.936298864703961 Accuracy: 0.6262943206777534
val Loss: 0.867582985685819 Accuracy: 0.6271186440677966

Training Complete in 27m 34s
Best Validation Acuuracy: 0.6271


In [None]:
# for x in range(1, 1062):
#     if 1062%x == 0:
#         print(f'rem: {1062%x} --> {x}')