In [None]:
# Colab-only step: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
!pip install info-nce-pytorch

Collecting info-nce-pytorch
  Downloading info_nce_pytorch-0.1.4-py3-none-any.whl (4.8 kB)
Installing collected packages: info-nce-pytorch
Successfully installed info-nce-pytorch-0.1.4


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from info_nce import InfoNCE, info_nce
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, RobertaModel, AutoImageProcessor, ViTModel
from transformers.image_utils import load_image

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

In [None]:
# Preprocessors for the two pretrained checkpoints used in this notebook:
# an image processor for the ViT checkpoint and a tokenizer for "roberta-base".
image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)
tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base"
)

In [None]:
import pandas as pd

# The file has a .csv extension but was being read with read_parquet, which
# raises if the content is really CSV. Sniff the leading bytes instead of
# trusting the extension: Parquet files start with the magic bytes b"PAR1".
# A genuine Parquet file is still loaded exactly as before; anything else is
# parsed as CSV.
TRAIN_DATA_PATH = '/content/drive/MyDrive/train_full.csv'
with open(TRAIN_DATA_PATH, 'rb') as fh:
    is_parquet = fh.read(4) == b'PAR1'
train_df = pd.read_parquet(TRAIN_DATA_PATH) if is_parquet else pd.read_csv(TRAIN_DATA_PATH)

In [None]:
# Flatten train_df into three parallel lists: text, like count, and media URL.
# A row contributes one entry per media kind (image, video thumbnail, GIF
# thumbnail) whose status column reports the link as accessible, so a row
# can appear zero to three times, always in image -> video -> gif order.
accessible_status = 'Link exists and is accessible'
media_columns = (
    ('image_url_status', 'image_url'),
    ('video_url_status', 'video_thumbnail_url'),
    ('gif_url_status', 'gif_thumbnail_url'),
)

train_sentences, train_labels, train_images = [], [], []

for row_label, record in train_df.iterrows():
    for status_column, url_column in media_columns:
        if record[status_column] == accessible_status:
            train_sentences.append(record['text'])
            train_labels.append(record['likes'])
            train_images.append(record[url_column])

In [None]:
# The two names were swapped: ViT is the vision model (it consumes pixel
# values) and RoBERTa is the language model (it consumes token ids). Bind
# each encoder to the modality it can actually process.
image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
text_encoder = RobertaModel.from_pretrained("roberta-base")
class CustomRegressionDataset(Dataset):
    """Map-style dataset pairing tokenized texts with float regression targets.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str``.
        labels: Indexable collection of targets; each item is passed through ``float``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is driven by the texts; labels are assumed parallel.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one sample as a dict of ``input_ids``, ``attention_mask`` and ``labels``."""
        raw_text = str(self.texts[idx])
        target_value = float(self.labels[idx])

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # Each encoded tensor is flattened to 1-D before it is returned.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target_value, dtype=torch.float32)
        return sample

class SiameseNetwork(nn.Module):
    """Two-tower wrapper that embeds an image batch and a text batch.

    Args:
        image_encoder: Module called with the image input.
        text_encoder: Module called with the text input.

    ``forward`` returns ``(image_emb, text_emb)``. Encoders that already
    return a tensor are passed through unchanged. Encoders that return a
    structured output (as Hugging Face models do) are reduced to one tensor
    so the result can be fed straight into a tensor-based loss.
    """

    def __init__(self, image_encoder, text_encoder):
        super().__init__()
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder

    @staticmethod
    def _to_embedding(encoder_output):
        """Reduce an encoder's output to a single embedding tensor.

        Order of preference: the output itself if it is a tensor, then its
        ``pooler_output``, then the first position of ``last_hidden_state``.
        Anything else is returned untouched.
        """
        if isinstance(encoder_output, torch.Tensor):
            return encoder_output
        pooled = getattr(encoder_output, "pooler_output", None)
        if pooled is not None:
            return pooled
        hidden = getattr(encoder_output, "last_hidden_state", None)
        if hidden is not None:
            # No pooling head available: use the first token's hidden state.
            return hidden[:, 0]
        return encoder_output

    def forward(self, image, text):
        """Encode both inputs and return ``(image_emb, text_emb)``."""
        image_emb = self._to_embedding(self.image_encoder(image))
        text_emb = self._to_embedding(self.text_encoder(text))
        return image_emb, text_emb

siamese_net = SiameseNetwork(image_encoder, text_encoder)

# Run on the GPU when the runtime has one; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
siamese_net.to(device)

infoNCE_loss = InfoNCE()

# NOTE(review): 1e-3 is far above the learning rates usually used to
# fine-tune pretrained transformers; kept as authored, please confirm.
optimizer = optim.Adam(siamese_net.parameters(), lr=0.001)

num_epochs = 10


def _collate_image_text(pairs):
    """Turn a list of ``(image_url, text)`` pairs into one model-ready batch.

    Images are downloaded and run through ``image_processor``; texts are
    tokenized with ``tokenizer`` and padded to the longest text in the batch.
    Pairs whose image cannot be fetched or decoded are dropped. Returns
    ``None`` when fewer than two pairs survive, because InfoNCE without
    explicit negatives uses the other samples in the batch as negatives.
    """
    images, texts = [], []
    for url, text in pairs:
        try:
            image = load_image(url, timeout=10)
        except (OSError, ValueError, TypeError):
            # Network failure, undecodable image, or a missing/non-string URL.
            continue
        images.append(image)
        texts.append(str(text))
    if len(images) < 2:
        return None
    pixel_values = image_processor(images=images, return_tensors="pt")["pixel_values"]
    # NOTE(review): only input_ids are forwarded because SiameseNetwork.forward
    # takes a single positional input per encoder, so padding tokens are not
    # masked out. Extend forward if attention masks are needed.
    input_ids = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")["input_ids"]
    return {"images": pixel_values, "texts": input_ids}


def _as_embedding(encoder_output):
    """Return a tensor embedding whether the encoder gave a tensor or a pooled output object."""
    if isinstance(encoder_output, torch.Tensor):
        return encoder_output
    return encoder_output.pooler_output


# The DataFrame itself is not a dataset of images/texts. Iterate over the
# (image URL, sentence) pairs collected earlier and build tensors per batch.
data_loader = DataLoader(
    list(zip(train_images, train_sentences)),
    batch_size=32,
    shuffle=True,
    collate_fn=_collate_image_text,
)

# from_pretrained returns models in eval mode; switch dropout back on.
siamese_net.train()
for epoch in range(num_epochs):
    for batch in data_loader:
        if batch is None:
            # Not enough loadable images in this batch to form negatives.
            continue
        images, texts = batch['images'].to(device), batch['texts'].to(device)

        image_embeddings, text_embeddings = siamese_net(images, texts)
        image_embeddings = _as_embedding(image_embeddings)
        text_embeddings = _as_embedding(text_embeddings)

        loss = infoNCE_loss(image_embeddings, text_embeddings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")