In [81]:
from os.path import join
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torchvision
import transformers
import torchvision

# Checkpoint identifier shared by the tokenizer and the text encoder.
text_model_name = 'bert-base-uncased'

# Image branch: ResNet-50 fetched through torch.hub with pretrained weights.
image_model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True)

# Text branch: tokenizer and encoder are loaded from the same checkpoint so
# that the vocabulary matches the embedding table.
tokenizer = transformers.BertTokenizer.from_pretrained(
    text_model_name,
    do_lower_case=True,
)
text_model = transformers.BertModel.from_pretrained(text_model_name)

Using cache found in /home/akkyma/.cache/torch/hub/pytorch_vision_master


In [99]:
# Open one example image from the data folder for the interactive checks in
# the following cells. The file handle stays open; nothing here closes it.
im = Image.open("data/img/01236.png")

In [118]:
# Preprocessing for the interactive image check: scale the picture to 64x64
# and convert it to a tensor.
transform = torchvision.transforms.Compose(
    transforms=[
        torchvision.transforms.Resize(size=[64, 64]),
        torchvision.transforms.ToTensor(),
    ]
)

In [133]:
# Preprocess the example image and prepend a batch dimension of size 1.
# `unsqueeze(0)` replaces the deprecated non-in-place `Tensor.resize`, which
# produced the same shape but emits a deprecation warning.
arr = transform(im).unsqueeze(0)

In [135]:
# Run the image model on the single-image batch and display the output shape.
image_model(arr).shape

torch.Size([1, 1000])

In [138]:
# Smoke test of the text branch: tokenize one short string into tensors and
# pass the tokenizer's output to the text model as keyword arguments.
text = "ananis"
text_input = tokenizer(text, return_tensors='pt')
out = text_model(**text_input)

In [195]:
class EnsembleModel(torch.nn.Module):
    """Binary classifier on top of concatenated text and image features.

    The notebook-level ``text_model`` and ``image_model`` are used as frozen
    feature extractors; only the fully connected head defined here is
    trainable. The head ends in a sigmoid, so the output lies in (0, 1).
    """

    def __init__(self):
        super(EnsembleModel, self).__init__()
        # Both backbones are taken from the notebook's global namespace.
        self.text_model = text_model
        self.image_model = image_model
        self.make_model_non_trainable(self.text_model)
        self.make_model_non_trainable(self.image_model)
        # Turning off requires_grad does not disable dropout or stop
        # batch-norm layers from updating their running statistics, so the
        # frozen backbones are also switched to eval mode.
        # NOTE: a later `.train()` call on this module re-enables training
        # mode on the backbones as well.
        self.text_model.eval()
        self.image_model.eval()

        # 1768 must equal the combined width of the two feature vectors
        # concatenated in forward() (presumably 768 from the text model plus
        # 1000 from the image model -- confirm if either backbone changes).
        self.fc1 = torch.nn.Linear(1768, 512)
        self.act1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(512, 512)
        self.act2 = torch.nn.ReLU()
        self.fc_pred = torch.nn.Linear(512, 1)
        self.act_pred = torch.nn.Sigmoid()

    @staticmethod
    def make_model_non_trainable(model):
        """Freeze ``model`` in place by disabling gradients on every parameter."""
        for param in model.parameters():
            param.requires_grad = False

    def print_parameters(self):
        """Print total, trainable and non-trainable parameter counts of this module."""
        # Count this instance's own parameters; the previous version read a
        # global named `model`, which is unrelated to `self`.
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(f"Total_params:          {total_params}\n"
              f"Trainable_params:      {trainable_params}\n"
              f"Non-trainable_params:  {total_params-trainable_params}")

    def forward(self, x):
        """Compute the prediction for one sample mapping.

        Args:
            x: mapping with key ``'text'`` (keyword arguments for the text
                model) and key ``'image'`` (input for the image model).
                Other keys are ignored.

        Returns:
            Sigmoid output of the head; the last dimension has size 1.
        """
        text_input = x['text']
        image_input = x['image']
        # Element 1 of the text model's output is used as the text feature
        # (presumably the pooled sentence representation -- confirm).
        x1 = self.text_model(**text_input)[1]
        x2 = self.image_model(image_input)
        x = torch.cat([x1, x2], dim=-1)
        x = self.act1(self.fc1(x))
        x = self.act2(self.fc2(x))
        output = self.act_pred(self.fc_pred(x))

        return output

In [193]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of (text, image, label) samples listed in a JSON-lines file.

    After the ``id`` column becomes the index, columns are read by position:
    column 0 is the image path relative to ``folder``, column 1 the label and
    column 2 the text.
    """

    def __init__(self, folder, file):
        """Read the sample table.

        Args:
            folder: directory that contains ``file`` and the image paths.
            file: name of the JSON-lines file inside ``folder``.
        """
        self.folder = folder
        self.image_transform = torchvision.transforms.Compose([
            # Force three channels so every image yields the same tensor layout.
            lambda x: x.convert('RGB'),
            torchvision.transforms.Resize([64,64]),
            torchvision.transforms.ToTensor(),
        ])
        self.data = pd.read_json(join(folder, file), lines=True).set_index('id')

    def __len__(self):
        """Return the number of samples (rows of the table)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load and preprocess the sample at position ``idx``.

        Returns:
            dict with ``'text'`` (tokenizer output as tensors), ``'image'``
            (tensor of shape (1, 3, 64, 64)) and ``'label'`` (raw value from
            the table).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_filename = join(self.folder, self.data.iloc[idx, 0])
        # The transform's RGB conversion reads the pixel data, so the file
        # can be closed as soon as the transform has run.
        with Image.open(image_filename) as image_file:
            image = self.image_transform(image_file)
        # Prepend a batch dimension of size 1 (replaces the deprecated
        # non-in-place Tensor.resize).
        image = image.unsqueeze(0)

        text = self.data.iloc[idx, 2]
        # Uses the notebook-level tokenizer.
        text = tokenizer(text, return_tensors='pt')

        label = self.data.iloc[idx, 1]

        return {'text': text, 'image': image, 'label': label}
    
# Build the training and development splits from the JSON-lines files in "data".
train_dataset = Dataset(folder="data", file="train.jsonl")
dev_dataset = Dataset(folder="data", file="dev.jsonl")

In [237]:
# Sanity check of BCELoss: a prediction of 0.5 against a target of 0 gives
# ln(2) ~= 0.6931. Prediction and target are both built with shape (1, 1);
# BCELoss requires identical shapes, and the deprecated non-in-place
# Tensor.resize is no longer needed.
torch.nn.BCELoss()(torch.tensor([[0.5]]), torch.tensor([[0.0]]))

tensor(0.6931)

In [244]:
# Train the classification head for one epoch, one sample per optimizer step.
model = EnsembleModel()

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(1):
    print(f"Epoch {epoch}:")
    for i, sample in enumerate(train_dataset):
        # Call the module itself rather than .forward so that hooks run.
        y_pred = model(sample)
        # BCELoss requires the target to have exactly the prediction's shape,
        # so the scalar label is expanded to match y_pred (same dtype/device).
        y_true = torch.full_like(y_pred, float(sample['label']))

        loss = criterion(y_pred, y_true)
        if i % 100 == 0:
            print(f"Iteration {i}: {loss.item()}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch 0:
Iteration 0: 0.6792412996292114
Iteration 100: 0.2918693423271179
Iteration 200: 0.36071810126304626


KeyboardInterrupt: 

In [175]:
# Load the first training sample once and reuse it for both the prediction
# and the label tensor, instead of reading and preprocessing it twice.
first_sample = train_dataset[0]
# Call the module itself rather than .forward so that hooks run.
t1 = model(first_sample)
t2 = torch.ByteTensor([first_sample['label']])

