In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np

In [2]:
from dataset import ImageDataset, MultiModalDataset
from baseline_model import MultiModalModel

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import models, transforms

In [4]:
base_path = "/Users/santhosh.mohan/Downloads/DSCVAssessment/assignments/food_item_tag"

In [5]:
image_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}


### Training a image baseline model

In [6]:
def train_epoch(model, dataloader, loss_func, optim, device = 'cpu'):
    model.train()
    epoch_loss = 0
    total_size = 0
    for data, label in tqdm(dataloader):
        data = data.to(device)
        label = label.to(device)
        optim.zero_grad()
        with torch.set_grad_enabled(True):
            output = model(data)
            loss = loss_func(output, label)
            loss.backward()
            optim.step()
        epoch_loss += loss.item() * data.size(0)
        total_size += data.size(0)
    epoch_loss = epoch_loss / total_size
    print(f"Training Loss - {epoch_loss}")

def validate_epoch(model, dataloader, loss_func, device = 'cpu'):
    model.eval()
    epoch_loss = 0
    total_size = 0
    for data, label in tqdm(dataloader):
#         print("loading data")
        data = data.to(device)
        label = label.to(device)
        with torch.set_grad_enabled(False):
            output = model(data)
            loss = loss_func(output, label)
        epoch_loss += loss.item() * data.size(0)
        total_size += data.size(0)
    epoch_loss = epoch_loss / total_size
    print(f"Validation Loss - {epoch_loss}")

In [7]:
def train_image_model(device='cpu', epochs = 10):
    model = models.alexnet(pretrained=True)
    model.classifier[6] = nn.Linear(4096,48)
    model = model.to(device)
    loss_func = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[40,70, 90], gamma=0.15)
    training_dataset = ImageDataset(f"{base_path}/data/training_data.csv", f"{base_path}/imgs", image_transforms["train"])
    validation_dataset = ImageDataset(f"{base_path}/data/validation_data.csv", f"{base_path}/imgs", image_transforms["val"])


    training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=512,
                                                 shuffle=True, num_workers=0)
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=512,
                                                 shuffle=True, num_workers=0)
    for epoch in range(epochs):
        print(f"Epoch - {epoch+1}")
        train_epoch(model, training_dataloader, loss_func, optimizer, device)
        validate_epoch(model, validation_dataloader, loss_func, device)
        scheduler.step()
    return model

In [7]:
#model = train_image_model(epochs = 20)
#torch.save(model.state_dict(), "alexnet_model.pth")


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch - 1


100%|██████████| 16/16 [08:37<00:00, 32.36s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6627796832743854


100%|██████████| 4/4 [01:11<00:00, 17.98s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.46666581016472475
Epoch - 2


100%|██████████| 16/16 [08:19<00:00, 31.24s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.356268780942033


100%|██████████| 4/4 [01:10<00:00, 17.55s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.24686460186960957
Epoch - 3


100%|██████████| 16/16 [08:10<00:00, 30.67s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.24851744261190126


100%|██████████| 4/4 [01:09<00:00, 17.28s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.23318670434598948
Epoch - 4


100%|██████████| 16/16 [08:07<00:00, 30.47s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.23658489543130318


100%|██████████| 4/4 [01:09<00:00, 17.35s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.22195390247649527
Epoch - 5


100%|██████████| 16/16 [08:06<00:00, 30.41s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.22851377796147848


100%|██████████| 4/4 [01:10<00:00, 17.59s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.21693752944525976
Epoch - 6


100%|██████████| 16/16 [08:07<00:00, 30.44s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2232887710880212


100%|██████████| 4/4 [01:09<00:00, 17.35s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.21293167776221972
Epoch - 7


100%|██████████| 16/16 [08:04<00:00, 30.30s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.21992609373536373


100%|██████████| 4/4 [01:09<00:00, 17.39s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.20986032287687456
Epoch - 8


100%|██████████| 16/16 [10:13<00:00, 38.32s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2169149152951829


100%|██████████| 4/4 [01:11<00:00, 17.89s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2072060989424868
Epoch - 9


100%|██████████| 16/16 [08:18<00:00, 31.18s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.21370809626894835


100%|██████████| 4/4 [01:11<00:00, 17.99s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.20491184684051214
Epoch - 10


100%|██████████| 16/16 [08:18<00:00, 31.18s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2118384467074718


100%|██████████| 4/4 [01:10<00:00, 17.72s/it]

Validation Loss - 0.20288317450081003





### MultiModal Model

Since it is a baseline model, i am just using basing hashing for text features, also training image model separately

In [11]:
def train_multimodal_epoch(image_model, model, dataloader, loss_func, optim, device = 'cpu'):
    image_model.eval()
    model.train()
    epoch_loss = 0
    total_size = 0
    for image, text, label in tqdm(dataloader):
        image = image.to(device)
        text = text.to(device)
        label = label.to(device)
        optim.zero_grad()
        with torch.set_grad_enabled(False):
            image_features = image_model(image)
        
        with torch.set_grad_enabled(True):
            output = model(image_features, text)
            loss = loss_func(output, label)
            loss.backward()
            optim.step()
        epoch_loss += loss.item() * label.size(0)
        total_size += label.size(0)
    epoch_loss = epoch_loss / total_size
    print(f"Training Loss - {epoch_loss}")

def validate_multimodal_epoch(image_model, model, dataloader, loss_func, device = 'cpu'):
    image_model.eval()
    model.eval()
    epoch_loss = 0
    total_size = 0
    for image, text, label in tqdm(dataloader):
        image = image.to(device)
        text = text.to(device)
        label = label.to(device)     
        with torch.set_grad_enabled(False):
            image_features = image_model(image)
            output = model(image_features, text)
            loss = loss_func(output, label)
        epoch_loss += loss.item() * label.size(0)
        total_size += label.size(0)
    epoch_loss = epoch_loss / total_size
    print(f"Validation Loss - {epoch_loss}")

In [14]:
def train_multimodal_model(image_model, device='cpu',epochs = 10, resume_model = None):
    image_model = image_model.to(device)
    model = MultiModalModel(48, 61, 48)
    if resume_model:
        model.load_state_dict(resume_model)
    model = model.to(device)
    loss_func = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[40,70, 90], gamma=0.15)
    hashing = HashingVectorizer(n_features=20)
    df = pd.read_csv(f"{base_path}/data/training_data.csv")
    hashing.fit(df['name'])
    
    training_dataset = MultiModalDataset(f"{base_path}/data/training_data.csv", f"{base_path}/imgs", image_transforms["train"], hashing)
    validation_dataset = MultiModalDataset(f"{base_path}/data/validation_data.csv", f"{base_path}/imgs", image_transforms["val"], hashing)


    training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=512,
                                                 shuffle=True, num_workers=0)
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=512,
                                                 shuffle=True, num_workers=0)
    for epoch in range(epochs):
        print(f"Epoch - {epoch+1}")
        train_multimodal_epoch(image_model, model, training_dataloader, loss_func, optimizer, device)
        validate_multimodal_epoch(image_model, model, validation_dataloader, loss_func, device)
        scheduler.step()
    return model

In [13]:
img_model = models.alexnet(pretrained=True)
img_model.classifier[6] = nn.Linear(4096,48)
img_model.load_state_dict(torch.load("alexnet_model.pth"))
mul_model = train_multimodal_model(img_model, device='cpu',epochs = 10)


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch - 1


100%|██████████| 16/16 [04:47<00:00, 17.95s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6950360305638437


100%|██████████| 4/4 [01:13<00:00, 18.33s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6944368604269918
Epoch - 2


100%|██████████| 16/16 [05:04<00:00, 19.05s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.693379757616042


100%|██████████| 4/4 [01:13<00:00, 18.41s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6924056994663769
Epoch - 3


100%|██████████| 16/16 [04:46<00:00, 17.94s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6913678603568538


100%|██████████| 4/4 [01:13<00:00, 18.42s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6903084927914203
Epoch - 4


100%|██████████| 16/16 [04:51<00:00, 18.24s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.689373010467359


100%|██████████| 4/4 [01:11<00:00, 17.89s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6882869828980743
Epoch - 5


100%|██████████| 16/16 [04:41<00:00, 17.58s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6874613776891936


100%|██████████| 4/4 [01:11<00:00, 17.90s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6863417603483402
Epoch - 6


100%|██████████| 16/16 [04:39<00:00, 17.47s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6856024993832945


100%|██████████| 4/4 [01:11<00:00, 17.86s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6844421388715234
Epoch - 7


100%|██████████| 16/16 [04:41<00:00, 17.57s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6837722994232863


100%|██████████| 4/4 [01:16<00:00, 19.10s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6825617533634085
Epoch - 8


100%|██████████| 16/16 [04:46<00:00, 17.88s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6819646657696734


100%|██████████| 4/4 [01:20<00:00, 20.00s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6806964089757839
Epoch - 9


100%|██████████| 16/16 [04:48<00:00, 18.04s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6801467658149244


100%|██████████| 4/4 [01:14<00:00, 18.71s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.6788387785108787
Epoch - 10


100%|██████████| 16/16 [04:50<00:00, 18.15s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.6783427005847646


100%|██████████| 4/4 [01:19<00:00, 19.83s/it]

Validation Loss - 0.6769292623200996





In [15]:
torch.save(mul_model.state_dict(), "multimodal_model.pth")

In [19]:
model_state = torch.load("multimodal_model.pth")
mul_model = train_multimodal_model(img_model, device='cpu',epochs = 20,resume_model = model_state)


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch - 1


100%|██████████| 16/16 [05:00<00:00, 18.76s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.5145476912940407


100%|██████████| 4/4 [01:14<00:00, 18.70s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.49441881351497136
Epoch - 2


100%|██████████| 16/16 [04:55<00:00, 18.47s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.485993129421816


100%|██████████| 4/4 [01:19<00:00, 19.83s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.4573482977062394
Epoch - 3


100%|██████████| 16/16 [04:50<00:00, 18.13s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.4464530535154143


100%|██████████| 4/4 [01:15<00:00, 18.93s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.4124749640531097
Epoch - 4


100%|██████████| 16/16 [04:48<00:00, 18.06s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.400425701487874


100%|██████████| 4/4 [01:12<00:00, 18.01s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.3645025554366731
Epoch - 5


100%|██████████| 16/16 [04:45<00:00, 17.83s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.35297844756184243


100%|██████████| 4/4 [01:12<00:00, 18.03s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.3193858686853507
Epoch - 6


100%|██████████| 16/16 [04:42<00:00, 17.66s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.31047187184858627


100%|██████████| 4/4 [01:11<00:00, 17.84s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2828193954917546
Epoch - 7


100%|██████████| 16/16 [04:55<00:00, 18.45s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2782449026162297


100%|██████████| 4/4 [01:16<00:00, 19.01s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2573624418392067
Epoch - 8


100%|██████████| 16/16 [04:46<00:00, 17.88s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.25550123410561704


100%|██████████| 4/4 [01:13<00:00, 18.26s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.24188983880833229
Epoch - 9


100%|██████████| 16/16 [04:44<00:00, 17.80s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.24213184024011397


100%|██████████| 4/4 [01:13<00:00, 18.41s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2331401333766324
Epoch - 10


100%|██████████| 16/16 [04:53<00:00, 18.35s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2340275078956608


100%|██████████| 4/4 [01:10<00:00, 17.75s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2282502123364387
Epoch - 11


100%|██████████| 16/16 [04:37<00:00, 17.34s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.22959486723641323


100%|██████████| 4/4 [01:10<00:00, 17.75s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2253973614437523
Epoch - 12


100%|██████████| 16/16 [04:39<00:00, 17.44s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2266238707275578


100%|██████████| 4/4 [01:10<00:00, 17.74s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2235883618662886
Epoch - 13


100%|██████████| 16/16 [04:48<00:00, 18.01s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2244089657948928


100%|██████████| 4/4 [01:14<00:00, 18.71s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.22234849296639508
Epoch - 14


100%|██████████| 16/16 [04:53<00:00, 18.35s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.22319879189457847


100%|██████████| 4/4 [01:15<00:00, 18.95s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.22142990293476578
Epoch - 15


100%|██████████| 16/16 [04:45<00:00, 17.83s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2221170849552266


100%|██████████| 4/4 [01:15<00:00, 18.97s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2207065292445597
Epoch - 16


100%|██████████| 16/16 [04:43<00:00, 17.75s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.22125029252085956


100%|██████████| 4/4 [01:11<00:00, 17.76s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.22013161882471416
Epoch - 17


100%|██████████| 16/16 [04:42<00:00, 17.66s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.2205841438542618


100%|██████████| 4/4 [01:11<00:00, 17.78s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.21964947977936164
Epoch - 18


100%|██████████| 16/16 [04:40<00:00, 17.52s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.22018883529691644


100%|██████████| 4/4 [01:11<00:00, 17.82s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2192537302654207
Epoch - 19


100%|██████████| 16/16 [04:41<00:00, 17.60s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.21986340414983002


100%|██████████| 4/4 [01:12<00:00, 18.00s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

Validation Loss - 0.2189101742143318
Epoch - 20


100%|██████████| 16/16 [04:43<00:00, 17.70s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Training Loss - 0.21959111066183978


100%|██████████| 4/4 [01:12<00:00, 18.07s/it]

Validation Loss - 0.21862764035834864





In [20]:
torch.save(mul_model.state_dict(), "multimodal_model.pth")

In [23]:
hashing = HashingVectorizer(n_features=20)
df = pd.read_csv(f"{base_path}/data/training_data.csv")
hashing.fit(df['name'])

validation_dataset = MultiModalDataset(f"{base_path}/data/validation_data.csv", f"{base_path}/imgs", image_transforms["val"], hashing)

validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=512,
                                             shuffle=True, num_workers=0)


def validate_multimodal_epoch(image_model, model, dataloader, device = 'cpu'):
    image_model.eval()
    model.eval()
    PRED = []
    for image, text, label in tqdm(dataloader):
        image = image.to(device)
        text = text.to(device)
        label = label.to(device)     
        with torch.set_grad_enabled(False):
            image_features = image_model(image)
            output = model(image_features, text)
            print(output.shape)
            print(output.type)
            print(output)
            break
validate_multimodal_epoch(img_model, mul_model, validation_dataloader)

  0%|          | 0/4 [00:18<?, ?it/s]

torch.Size([512, 48])
<built-in method type of Tensor object at 0x7fae5131bcb0>
tensor([[-3.7887, -5.0942, -4.1025,  ..., -5.4269, -3.6412, -4.0881],
        [-2.7067, -3.6115, -2.9300,  ..., -3.8476, -2.5861, -2.8831],
        [-3.4922, -4.6908, -3.7853,  ..., -5.0030, -3.3606, -3.7574],
        ...,
        [-2.5565, -3.4077, -2.7526,  ..., -3.6117, -2.4274, -2.7098],
        [-3.6490, -4.9045, -3.9514,  ..., -5.2258, -3.5110, -3.9338],
        [-3.6611, -4.9119, -3.9616,  ..., -5.2395, -3.5195, -3.9461]])



