In [22]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, transforms
from torchvision.models import inception_v3, Inception_V3_Weights
import matplotlib.pyplot as plt
import time
import os
import copy
import random
from PIL import Image
from torch.utils.data import (TensorDataset, 
                              Dataset, 
                              Subset,
                              random_split, 
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler, 
                              DataLoader)
from transformers import BertTokenizer, BertForSequenceClassification
# from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from models import initialize_vision_model
from utils import GarbageDataset, split_dataset, GarbageFolder



# Select the GPU only when CUDA is actually usable.
# `torch.cuda.is_available` must be *called*: the bare function object is always
# truthy, which would pick "cuda" even on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
# Report the selected compute device and the library versions in use,
# so the recorded outputs can be tied to a specific environment.
print(f'Device: {device}')
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

Device: cuda
PyTorch Version:  1.13.1+cu117
Torchvision Version:  0.14.1+cu117


In [24]:
# Notebook-wide configuration.
data_dir = "./data"          # root folder handed to datasets.ImageFolder
model_name = "inception"     # passed to initialize_vision_model; also toggles the aux-loss path in training
num_classes = 4              # initial value; recomputed from the dataset's class list in a later cell
batch_size_train = 64        # batch size of the training DataLoader
batch_size_val = 64          # batch size of the validation and test DataLoaders
epochs = 5                   # number of training epochs passed to train_model
feature_extract = True       # passed to initialize_vision_model; when True only parameters with requires_grad are optimised

In [25]:
def validate_image(path):
    """Return True if PIL can open ``path`` as an image, False otherwise.

    Used as the ``is_valid_file`` callback of ``datasets.ImageFolder`` so that
    unreadable or non-image files are skipped instead of crashing the loader.

    Args:
        path: Filesystem path of the candidate file.

    Returns:
        bool: True when ``Image.open`` succeeds, False when it raises.
    """
    try:
        # The context manager closes the underlying file handle right away;
        # without it every scanned file would leave an open handle behind.
        with Image.open(path):
            return True
    except Exception:
        # Any ordinary failure means "not a usable image". KeyboardInterrupt and
        # SystemExit are deliberately not caught so the scan can be aborted.
        return False

In [26]:
# Build the image dataset from data_dir; validate_image is used as the per-file
# filter so files PIL cannot open are left out.
image_dataset = datasets.ImageFolder(data_dir, is_valid_file=validate_image)

In [27]:
image_dataset

Dataset ImageFolder
    Number of datapoints: 5312
    Root location: ./data

In [28]:
# Derive the class list from the dataset and overwrite the configured
# num_classes with the actual number of classes found.
classes =  image_dataset.classes
num_classes = len(classes)
print(classes)
print(f'Num of Classes: {num_classes}')

['black', 'blue', 'green', 'other']
Num of Classes: 4


In [29]:
image_dataset.class_to_idx

{'black': 0, 'blue': 1, 'green': 2, 'other': 3}

In [30]:
# Wrap the full image dataset in the project's GarbageDataset with text_data=True.
# NOTE(review): presumably this exposes per-sample text (`filenames`) and `labels`,
# which get_text_data reads later — confirm against utils.GarbageDataset.
language_dataset = GarbageDataset(image_dataset, text_data=True)
# Display the dataset object held by the wrapper.
language_dataset.garbage_data

Dataset ImageFolder
    Number of datapoints: 5312
    Root location: ./data

In [31]:
# Partition the image dataset into train/val/test subsets (project helper).
# NOTE(review): no seed is passed here, so the split may differ between runs — confirm in utils.split_dataset.
train_set, val_set, test_set = split_dataset(image_dataset, test_size=0.2)

Train size: 3188
Val size: 1062
Test size: 1062


In [32]:
type(train_set)

torch.utils.data.dataset.Subset

In [33]:
def get_vision_dataloaders(input_size, train_set, val_set, test_set,
                           train_batch_size=None, eval_batch_size=None, num_workers=4):
    """Wrap the three splits with image transforms and build their DataLoaders.

    Args:
        input_size: Side length (pixels) the model expects; used for cropping/resizing.
        train_set: Training split; gets random-crop + horizontal-flip augmentation.
        val_set: Validation split; gets deterministic resize + centre crop.
        test_set: Test split; uses the same deterministic transform as validation.
        train_batch_size: Batch size of the training loader. Defaults to the
            notebook-level ``batch_size_train``.
        eval_batch_size: Batch size of the validation and test loaders. Defaults
            to the notebook-level ``batch_size_val``.
        num_workers: Worker processes per DataLoader.

    Returns:
        tuple: ``(test_dataloader, dataloaders_dict)`` where ``dataloaders_dict``
        has the keys ``'train'`` and ``'val'``.
    """
    print("Loading Datasets and Initializing DataLoaders...")

    # Fall back to the notebook configuration when no explicit size is given.
    if train_batch_size is None:
        train_batch_size = batch_size_train
    if eval_batch_size is None:
        eval_batch_size = batch_size_val

    # ImageNet channel statistics, shared by the training and evaluation pipelines.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    train_set = GarbageDataset(train_set, data_transforms['train'])
    val_set = GarbageDataset(val_set, data_transforms['val'])
    test_set = GarbageDataset(test_set, data_transforms['val'])

    # Only the training data is shuffled; evaluation metrics do not depend on
    # sample order, and a fixed order keeps val/test runs reproducible.
    dataloaders_dict = {
        'train': DataLoader(train_set, batch_size=train_batch_size, shuffle=True,
                            num_workers=num_workers),
        'val': DataLoader(val_set, batch_size=eval_batch_size, shuffle=False,
                          num_workers=num_workers),
    }
    test_dataloader = DataLoader(test_set, batch_size=eval_batch_size, shuffle=False,
                                 num_workers=num_workers)

    return test_dataloader, dataloaders_dict

In [34]:
def get_text_data(train_set, val_set, test_set):
    """Collect the text lines and labels that belong to each split.

    Each argument must expose an ``indices`` attribute; those indices are used
    to look up ``filenames`` and ``labels`` on the notebook-level
    ``language_dataset``.

    Returns:
        tuple: three dicts (train, val, test), each with the keys ``"lines"``
        and ``"labels"``.
    """
    train_idx, val_idx, test_idx = (part.indices for part in (train_set, val_set, test_set))
    corpus = language_dataset

    def _gather(positions):
        # Pull the text and label of every selected sample, preserving order.
        return {
            "lines": [corpus.filenames[pos] for pos in positions],
            "labels": [corpus.labels[pos] for pos in positions],
        }

    print("Before...")
    print(f'Train set size: {len(train_set)}')
    print(f'Val set size: {len(val_set)}')
    print(f'Test set size: {len(test_set)}')

    # From here on the names refer to the gathered text dictionaries.
    train_set = _gather(train_idx)
    val_set = _gather(val_idx)
    test_set = _gather(test_idx)

    print("Loading language data...")
    print(f'Train set size: {len(train_set["lines"])}')
    print(f'Val set size: {len(val_set["lines"])}')
    print(f'Test set size: {len(test_set["lines"])}')

    return train_set, val_set, test_set


# Build the text/label dictionaries for the same train/val/test indices as the image splits.
train_text, val_text, test_text = get_text_data(train_set, val_set, test_set)

Before...
Train set size: 3188
Val set size: 1062
Test set size: 1062
Loading language data...
Train set size: 3188
Val set size: 1062
Test set size: 1062


In [35]:
train_text["lines"][:10]

['cardboard product label',
 'apple',
 'paper catalogue',
 'plastic chocolate wrapper',
 'broken network ethernet switch',
 'medicine',
 'used pasta glass container',
 'empty shrimp bag',
 'cardboardـboxـofـtylenolـtablets',
 'candy wrapper']

In [36]:
class MultiModalGarbageModel(torch.nn.Module):
    """Classifier that fuses text features and image features by concatenation.

    The two sub-modules each produce a feature vector per sample; the vectors
    are concatenated, projected by ``fusion``, and classified by ``fc``.

    Args:
        num_classes: Number of output classes (size of the logits).
        loss_fn: Loss callable, stored as ``self.criterion`` for use by callers.
        text_module: Module mapping the text input to text features.
        image_module: Module mapping the image input to image features.
        text_feat_dim: Size of the feature vector produced by ``text_module``.
        image_feat_dim: Size of the feature vector produced by ``image_module``.
        fusion_output_size: Width of the fused hidden representation.
        dropout_p: Dropout probability applied to the fused representation.
    """

    def __init__(self, num_classes, loss_fn, text_module, image_module,
                 text_feat_dim, image_feat_dim, fusion_output_size, dropout_p):
        super().__init__()
        self.text_module = text_module
        self.image_module = image_module
        self.fusion = torch.nn.Linear((text_feat_dim + image_feat_dim), fusion_output_size)
        self.fc = torch.nn.Linear(fusion_output_size, num_classes)
        self.criterion = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)

    def init_weights(self):
        """Re-initialise the fusion and classifier layers (uniform weights, zero bias).

        Only the layers owned by this class are touched; the text and image
        sub-modules keep whatever weights they were constructed with.
        """
        initrange = 0.5
        # The class has no `embedding` layer; its own trainable layers are `fusion` and `fc`.
        for layer in (self.fusion, self.fc):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.zero_()

    def forward(self, text, offsets):
        """Compute class logits from a text batch and an image batch.

        Args:
            text: Input forwarded to ``text_module``.
            offsets: Input forwarded to ``image_module``. The parameter name is
                kept unchanged for interface compatibility; it carries the
                image batch, not EmbeddingBag offsets.

        Returns:
            Tensor of unnormalised class scores with ``num_classes`` entries per sample.
        """
        image = offsets
        text_features = self.text_module(text)
        image_features = self.image_module(image)
        # assumes both feature tensors are (batch, feat_dim) so the concatenated
        # width equals text_feat_dim + image_feat_dim, as `fusion` expects
        combined = torch.cat([text_features, image_features], dim=1)
        fused = self.dropout(torch.relu(self.fusion(combined)))
        return self.fc(fused)

In [37]:
# Build the vision backbone via the project helper; it also returns the input
# image size that the dataloader transforms must use.
model, input_size = initialize_vision_model(model_name, num_classes, feature_extract, use_pretrained=True)

Initializing InceptionV3 with weights=Inception_V3_Weights.DEFAULT...
Input size = 299
out_features = 4


In [38]:
print(model)

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [39]:
def train_model(model, dataloaders, criterion, optimizer, epochs=25, is_inception=False, device=None):
    """Train ``model`` and return it loaded with the best-validation weights.

    Every epoch runs a training pass followed by a validation pass. The weights
    giving the highest validation accuracy are remembered and restored at the end.

    Args:
        model: Network to train; expected to already live on the target device.
        dataloaders: Mapping with ``"train"`` and ``"val"`` DataLoaders that
            yield ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates the trainable parameters.
        epochs: Number of epochs to run.
        is_inception: When True, the model is expected to return
            ``(outputs, aux_outputs)`` in training mode; the auxiliary loss is
            added with weight 0.4.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so the
            function does not depend on a notebook-level global.

    Returns:
        tuple: ``(model, val_acc_history)`` where ``val_acc_history`` holds one
        validation accuracy (0-dim tensor) per epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(epochs):
        print(f'Epoch {epoch}/{epochs - 1}')
        print("-" * 10)

        for phase in ("train", "val"):
            is_train = phase == "train"
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Gradients only exist in the training phase, so only clear them there.
                if is_train:
                    optimizer.zero_grad()

                # Track history only while training.
                with torch.set_grad_enabled(is_train):
                    if is_inception and is_train:
                        # Two heads in training mode: main logits plus the
                        # auxiliary logits, whose loss is down-weighted.
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Scale the batch loss by the batch size so the epoch average
                # stays correct when the last batch is smaller.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects.double() / num_samples

            print(f'{phase} Loss: {epoch_loss} Accuracy: {epoch_acc}')

            if phase == 'val':
                # Snapshot the weights whenever validation accuracy improves.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                val_acc_history.append(epoch_acc)
        print()

    time_elapsed = time.time() - since
    print('Training Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Accuracy: {:.04f}'.format(float(best_acc)))

    # Hand back the best-performing weights, not the ones from the last epoch.
    model.load_state_dict(best_model_wts)

    return model, val_acc_history

In [40]:
# Create the DataLoaders using the input size reported by initialize_vision_model.
test_dataloader, dataloaders_dict = get_vision_dataloaders(input_size, train_set, val_set, test_set)

Loading Datasets and Initializing DataLoaders...


In [41]:
# send model to device: gpu or cpu
# Move the model onto the selected device (GPU or CPU).
model = model.to(device)

# Report every parameter that will receive gradient updates. In feature-extract
# mode only those parameters are handed to the optimizer; otherwise the
# optimizer gets the model's full parameter iterator.
print("Parameters to learn:")
params_to_update = list() if feature_extract else model.parameters()
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # frozen parameter: neither listed nor optimised
    if feature_extract:
        params_to_update.append(param)
    print("\t", name)

optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

Parameters to learn:
	 AuxLogits.fc.weight
	 AuxLogits.fc.bias
	 fc.weight
	 fc.bias


In [None]:
# Cross-entropy loss over the class logits.
criterion = nn.CrossEntropyLoss()

# Train and validate; the auxiliary-loss path is enabled only when the
# configured model is "inception". `hist` receives the per-epoch validation accuracies.
model, hist = train_model(model, dataloaders_dict, criterion, optimizer_ft, epochs = epochs, 
                             is_inception = (model_name=="inception"))

Epoch 0/4
----------
train Loss: 1.7914608100725986 Accuracy: 0.45294855708908405
val Loss: 1.1710606894250644 Accuracy: 0.5056497175141242

Epoch 1/4
----------


In [None]:
len(train_set)

In [None]:
len(val_set)

In [None]:
len(train_set)/69

In [None]:
len(val_set)/1

In [None]:
3193/31

In [None]:
# Scratch check: print every proper divisor of 3193.
# NOTE(review): 3193 is hard-coded and does not match the train size printed
# above (3188) — presumably left over from an earlier split; confirm or remove.
for x in range(1, 3193):
    if 3193 % x:
        continue  # non-zero remainder: x is not a divisor
    print(f'rem: {3193%x} --> {x}')

In [None]:
# Scratch check: print every proper divisor of 1064.
# NOTE(review): 1064 is hard-coded and does not match the val size printed
# above (1062) — presumably left over from an earlier split; confirm or remove.
for x in range(1, 1064):
    if 1064 % x:
        continue  # non-zero remainder: x is not a divisor
    print(f'rem: {1064%x} --> {x}')

In [None]:
from torchvision.datasets import default_loader