In [118]:
import torch
from transformers import BertTokenizer, BertModel
# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Tokenizer and encoder must come from the same checkpoint so that the
# vocabulary ids line up with the embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_model = BertModel.from_pretrained("bert-base-uncased").to(device)

# BERT is used as a fixed feature extractor: freeze every weight and keep the
# module in eval mode so dropout stays disabled while embeddings are computed.
text_model.requires_grad_(False)
text_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [93]:
import torch 
import torchvision
import torch.nn as nn
# ImageNet-pretrained ResNet-18 used purely as a frozen image encoder.
img_model = torchvision.models.resnet18(weights='DEFAULT')

# Drop the classification head so the network returns the pooled features
# that feed `fc` (512 values per image, matching the 512 + 768 input width of
# the regression head). The previous
# `nn.Sequential(*list(img_model.fc.children())[:-3])` was an empty Sequential,
# i.e. an identity, because `fc` is a single nn.Linear with no child modules;
# nn.Identity states that intent directly.
img_model.fc = nn.Identity()

# Freeze all weights and switch to eval mode. Without eval(), every forward
# pass would keep updating the BatchNorm running statistics even though the
# parameters themselves are frozen, so the "fixed" features would drift.
img_model.requires_grad_(False)
img_model = img_model.to(device)
img_model.eval()

In [58]:
import torch.nn as nn
import torch.nn.functional as F

class RegressionModel(nn.Module):
    """Three-layer MLP that maps a feature vector to a single scalar.

    With the default arguments the network is ``1280 -> 512 -> 64 -> 1``,
    where 1280 = 512 + 768 is the width of the concatenated image and text
    embeddings built in the training loop.

    Args:
        in_features: Width of the input feature vector.
        hidden_features: Width of the first hidden layer.
        bottleneck_features: Width of the second hidden layer.
    """

    def __init__(self, in_features=512 + 768, hidden_features=512, bottleneck_features=64):
        super().__init__()
        # Attribute names are kept as fc1/fc2/fc3 so existing state_dicts load.
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, bottleneck_features)
        self.fc3 = nn.Linear(bottleneck_features, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of predictions for ``(batch, in_features)`` input."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output layer: this is an unbounded regression.
        return self.fc3(x)

In [60]:
# Disabled experiment: the entire cell body is a bare string literal, so none
# of the code below is executed. It walks CNN.jsonl, collects consecutive
# records whose `upload_time` equals the current entry of `time_region`
# ('2 months' .. '11 months'), and appends the mean view count of each run to
# `mean_views` when a non-matching record ends the run.
# NOTE(review): as written, the last run's mean is never appended unless a
# non-matching record follows it, the record that ends a run is not re-checked
# against the next bucket, and `idx` can advance past the end of `time_region`
# -- confirm and fix before re-enabling.
'''
import json

with open('./CNN.jsonl', 'r') as json_file:
    json_list = list(json_file)
views = []
titles = []
lengths = []
images = []
times = []
time_region = ['{} months'.format(x) for x in range(2,12)]
mean_views = []

idx = 0
flag = False
for json_str in json_list:
    result = json.loads(json_str)
    if result['upload_time'] == time_region[idx]:
        views.append(float(result['views']))
        titles.append(result['title'])
        lengths.append(result['length'])
        images.append(result['image'])
        times.append(result['upload_time'])
        flag = True
    else:
        if len(views)!=0:
            mean_views.append(sum(views)/len(views))
        views = []
        titles = []
        lengths = []
        images = []
        times = []
        if flag:
            idx+=1
            flag = False
'''

In [5]:
import json

# Read the JSON-lines dump: one JSON object per line. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open('./CNN.jsonl', 'r', encoding='utf-8') as json_file:
    # Blank lines (e.g. a trailing newline at end of file) are dropped here
    # because json.loads would raise on them.
    json_list = [line for line in json_file if line.strip()]

# Parallel, index-aligned columns: entry i of every list comes from the same record.
views = []
titles = []
lengths = []
images = []
times = []

for json_str in json_list:
    result = json.loads(json_str)
    # Keep only records that have an image, since the model needs both the
    # image and the title. `is not None` is the idiomatic identity check for
    # a JSON null.
    if result['image'] is not None:
        views.append(float(result['views']))
        titles.append(result['title'])
        lengths.append(result['length'])
        images.append(result['image'])
        times.append(result['upload_time'])

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import requests

class ImageDataset(Dataset):
    """Dataset that downloads one image per index and returns it as a tensor.

    Each item is fetched over HTTP, converted to RGB, resized so that its
    shorter side is 224 pixels (aspect ratio preserved), centre-cropped to
    224x224 and converted to a float tensor by ``ToTensor``.

    Args:
        image_urls: Sequence of image URLs.
        timeout: Seconds to wait for the server before giving up on a request.

    Raises:
        requests.HTTPError: from ``__getitem__`` when the server answers with
            an error status.
    """

    def __init__(self, image_urls, timeout=30):
        self.image_urls = image_urls
        self.timeout = timeout
        # Built once instead of on every __getitem__ call. Resize(224) scales
        # the shorter edge to 224 and keeps the aspect ratio. The previous code
        # passed (width * r, height * r) computed from PIL's (width, height)
        # `img.size`, but Resize interprets a pair as (height, width), which
        # swapped the axes and distorted every non-square image.
        # NOTE(review): torchvision documents ImageNet mean/std normalisation
        # as part of the preprocessing for its pretrained ResNet weights; it is
        # not applied here -- confirm whether that is intentional.
        self.transform = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.image_urls)

    def __getitem__(self, idx):
        # The context manager releases the connection even if decoding fails;
        # the timeout keeps a stalled server from hanging the data loader.
        with requests.get(self.image_urls[idx], stream=True, timeout=self.timeout) as response:
            response.raise_for_status()
            # convert('RGB') forces the image data to be read before the
            # response is closed, and guarantees 3 channels for greyscale,
            # palette and RGBA images so that batches can be stacked.
            img = Image.open(response.raw).convert('RGB')
        return self.transform(img)

class TextDataset(Dataset):
    """Dataset that tokenizes one text per index.

    Args:
        texts: Sequence of strings.
        tokenizer: Callable used to encode a text. When ``None`` (the default)
            the module-level ``tokenizer`` is looked up at access time, which
            is the original behaviour.
        max_length: Length every encoding is padded or truncated to.
    """

    def __init__(self, texts, tokenizer=None, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Fall back to the notebook-global tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        # truncation=True guarantees a fixed length: padding alone only
        # lengthens short inputs, so an over-long text would otherwise yield a
        # longer tensor that cannot be stacked with the rest of the batch.
        encoded_input = encode(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return encoded_input

class TargetDataset(Dataset):
    """Regression targets: view counts divided by the largest view count.

    Args:
        views: Iterable of numeric view counts.

    Attributes:
        views: 1-D float32 tensor of normalised targets.
        max_views: The divisor's source value (0 for empty input), kept so
            predictions can be mapped back to raw view counts.
    """

    def __init__(self, views):
        views = list(views)
        # default=0 lets an empty input produce an empty dataset instead of
        # raising ValueError from max().
        self.max_views = max(views, default=0)
        # Guard against division by zero when every count is 0.
        scale = self.max_views if self.max_views != 0 else 1
        self.views = torch.tensor([view / scale for view in views], dtype=torch.float32)

    def __len__(self):
        return len(self.views)

    def __getitem__(self, idx):
        return self.views[idx]

In [141]:
# Mini-batch size used by the training DataLoader.
batch_size = 16

# One dataset per model input / target. `images`, `titles` and `views` were
# filled in lock-step by the loading cell, so index i refers to the same
# record in all three.
target_dataset = TargetDataset(views)
text_dataset = TextDataset(titles)
img_dataset = ImageDataset(images)
class ConcatDataset(Dataset):
    """Zip several index-aligned datasets into one.

    Item ``i`` is a tuple holding item ``i`` of every wrapped dataset, in the
    order the datasets were passed. The length is that of the shortest
    wrapped dataset.
    """

    def __init__(self, *datasets):
        self.datasets = datasets

    def __len__(self):
        # Limited by the shortest member so every index is valid for all of them.
        return min(map(len, self.datasets))

    def __getitem__(self, i):
        items = []
        for dataset in self.datasets:
            items.append(dataset[i])
        return tuple(items)
# Each batch is an (images, encoded_titles, targets) triple, reshuffled every epoch.
train_loader = DataLoader(
    dataset=ConcatDataset(img_dataset, text_dataset, target_dataset),
    batch_size=batch_size,
    shuffle=True,
)

In [146]:
from tqdm import tqdm
# Optimisation hyper-parameters.
num_epochs = 10
lr = 0.01

# Only the regression head is given to the optimizer.
regress_model = RegressionModel()
regress_model = regress_model.to(device)

# Mean-squared error between predicted and normalised view counts. The name
# `criterian` (sic) is kept because the training cell refers to it.
criterian = torch.nn.MSELoss()
optimizer = torch.optim.SGD(
    regress_model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=5e-4,
)

In [147]:
# Only the regression head is trained. The BERT and ResNet encoders are frozen
# feature extractors, so they are kept in eval mode (fixed BatchNorm
# statistics, dropout off) and run without autograd bookkeeping.
text_model.eval()
img_model.eval()
regress_model.train()

for epoch in range(num_epochs):
    print(f'Epoch {epoch}/{num_epochs - 1}')
    print('-' * 10)
    running_loss = 0.0
    num_samples = 0

    # The context manager closes the progress bar even when the loop is
    # interrupted, so no stale bar is left behind.
    with tqdm(train_loader, position=0, leave=True) as progress:
        for imgs, encoded_inputs, targets in progress:
            # Each title is tokenized on its own with return_tensors='pt', so
            # the collated tensors carry a singleton dimension at dim 1.
            # Squeeze exactly that dimension: a bare torch.squeeze would also
            # drop the batch dimension whenever a batch holds a single sample.
            input_ids = encoded_inputs['input_ids'].squeeze(1).to(device)
            attention_mask = encoded_inputs['attention_mask'].squeeze(1).to(device)
            targets = targets.to(device)
            imgs = imgs.to(device)

            with torch.no_grad():
                text_embeddings = text_model(
                    input_ids=input_ids, attention_mask=attention_mask
                )['pooler_output']
                img_embeddings = img_model(imgs)
            regress_input = torch.cat((img_embeddings, text_embeddings), dim=1)

            optimizer.zero_grad()
            # [:, 0] flattens the (batch, 1) predictions to match `targets`.
            outputs = regress_model(regress_input)[:, 0]
            loss = criterian(outputs, targets)
            loss.backward()
            optimizer.step()

            # MSELoss averages over the batch; re-weight by the batch size so
            # a smaller final batch does not skew the epoch mean.
            batch_len = regress_input.size(0)
            running_loss += loss.item() * batch_len
            num_samples += batch_len

    # Average over the samples actually processed in this epoch.
    print(f'Loss: {running_loss / max(num_samples, 1):.4f}')

    # Disabled step-wise learning-rate decay. `milestone` and `lr_decay_gamma`
    # are not defined in the visible cells, so define them before re-enabling.
    #if epoch in milestone:
    #    l = optimizer.param_groups[0]["lr"]
    #    optimizer.param_groups[0]["lr"]*=lr_decay_gamma
    #    print('Learning rate changes from {} into {}'.format(l, optimizer.param_groups[0]["lr"]))

Epoch 0/9
----------


  6%|▌         | 17/278 [00:14<03:40,  1.18it/s]
100%|██████████| 278/278 [03:07<00:00,  1.48it/s]


Loss: 0.0072
Epoch 1/9
----------


 45%|████▌     | 126/278 [01:26<01:45,  1.44it/s]

KeyboardInterrupt: 