In [None]:
import os
import sys
# Proxy configuration for outbound HTTP(S) requests made by this process;
# `localhost` is excluded from proxying.
# NOTE(review): set before the library imports below, presumably so that
# model downloads go through the proxy — confirm the address is still valid.
os.environ['no_proxy']='localhost'
os.environ['http_proxy']='http://10.8.0.169:3128'
os.environ['https_proxy']='http://10.8.0.169:3128'

import re
import io
import glob
import base64
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from torchvision.transforms import Compose, Resize, ToTensor, Normalize

from transformers import AutoProcessor, AutoModel


def get_group(filename):
    """Return the ``grid_<N>x<M>`` tag found in *filename*, or ``'orig'`` if absent."""
    found = re.search(r'grid_\d+x\d+', filename)
    return found.group() if found else 'orig'

def get_id(filename):
    """Return the ``row<R>_col<C>`` patch tag found in *filename*.

    Row and column indices may have any number of digits, matching the
    ``grid_<N>x<M>`` pattern used by ``get_group`` (the previous single-digit
    pattern mis-parsed indices of 10 and above).

    Returns ``'orig'`` when the name carries no patch tag.
    """
    found = re.search(r'row\d+_col\d+', filename)
    return found.group() if found is not None else 'orig'

# Load the SigLIP processor and model from one checkpoint name and move the
# model to the GPU when one is available (CPU otherwise).
_checkpoint = "google/siglip-so400m-patch14-384"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained(_checkpoint)
model = AutoModel.from_pretrained(_checkpoint).to(device)

def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        base64_string: Base64 text (or bytes) holding an encoded image file.
        target_size: When positive, the image is shrunk in place with
            ``Image.thumbnail`` so that neither side exceeds this many
            pixels. The default of -1 leaves the size untouched.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode.
    """
    raw_bytes = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(raw_bytes))
    # Normalise every non-RGB mode (grayscale, CMYK, LA, ...), not only
    # RGBA/P, so downstream code always sees three channels.
    if image.mode != 'RGB':
        image = image.convert('RGB')
    if target_size > 0:
        image.thumbnail((target_size, target_size))
    return image

class MultiLabelDataset(Dataset):
    """Image/question pairs gathered from every TSV file matching a glob pattern.

    Each TSV is read with tab separators and must provide an ``image`` column
    (base64-encoded image) and a ``question`` column. Three bookkeeping
    columns are added per file: ``file`` (source path), ``group`` (grid tag
    from the file name) and ``group_patch`` (``<group>_<patch tag>``).

    Args:
        file_pattern: Glob pattern selecting the TSV files to load.
        processor: Processor used to turn an (image, question) pair into
            model inputs.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """

    def __init__(self, file_pattern, processor):
        # glob returns files in arbitrary order; sort so the row order (and
        # therefore the order of any saved scores) is reproducible.
        tsv_paths = sorted(glob.glob(file_pattern))
        if not tsv_paths:
            raise ValueError(f"No TSV files match pattern: {file_pattern!r}")

        frames = []
        for path in tsv_paths:
            stem = os.path.basename(path).replace('.tsv', '')
            group = get_group(stem)
            frame = pd.read_csv(path, sep='\t')
            frame['file'] = path
            frame['group'] = group
            # Both parts are constant for a file, so no per-row apply is needed.
            frame['group_patch'] = f"{group}_{get_id(stem)}"
            frames.append(frame)
        self.df = pd.concat(frames, axis=0, ignore_index=True)

        # Keep the processor that was passed in instead of relying on a global.
        self.processor = processor

        # Not used by __getitem__ (the processor does its own resizing);
        # kept so the attribute remains available.
        size = processor.image_processor.size["height"]
        self.transform = Compose([
            Resize((size, size)),
            ToTensor(),
        ])

    def __getitem__(self, idx):
        """Return the processor outputs for row ``idx`` without the batch axis."""
        image = decode_base64_to_image(self.df.at[idx, 'image'])
        question = self.df.at[idx, 'question']
        # truncation=True keeps every sample at the same token length so the
        # default collate function can stack over-long questions too.
        inputs = self.processor(
            text=question,
            images=image,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

    def __len__(self):
        """Return the total number of rows across all loaded files."""
        return len(self.df)

# Score every (image, question) row of the RealWorldQA TSV files with the
# loaded model and write the table, plus a `scores` column, to res_siglip.csv.
file_pattern = '/home/srikapan/LMUData/RealWorldQA*.tsv'
train_dataset = MultiLabelDataset(file_pattern, processor)

batch_size = 32
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep row order so scores line up with train_dataset.df
    num_workers=16,
)

print('start inferencing')
scores = []
with torch.no_grad():
    progress = tqdm(dataloader, desc="Processing Batches", unit="batch")
    for idx, batch in enumerate(progress):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits_per_image = outputs.logits_per_image
        probs = torch.sigmoid(logits_per_image).cpu()
        # Entry [i, i] pairs image i with its own question.
        scores += probs.diagonal().tolist()
train_dataset.df.loc[:, 'scores'] = scores
train_dataset.df.to_csv('res_siglip.csv', index=False)

In [None]:
import os
import sys
# Proxy configuration for outbound HTTP(S) requests made by this process;
# `localhost` is excluded from proxying.
# NOTE(review): set before the library imports below, presumably so that
# model downloads go through the proxy — confirm the address is still valid.
os.environ['no_proxy']='localhost'
os.environ['http_proxy']='http://10.8.0.169:3128'
os.environ['https_proxy']='http://10.8.0.169:3128'

import re
import io
import glob
import base64
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from torchvision.transforms import Compose, Resize, ToTensor, Normalize

from transformers import CLIPProcessor, CLIPModel


def get_group(filename):
    """Extract the ``grid_<N>x<M>`` tag from *filename*; ``'orig'`` when there is none."""
    hit = re.search(r'grid_\d+x\d+', filename)
    if hit is None:
        return 'orig'
    return hit.group()

def get_id(filename):
    """Return the ``row<R>_col<C>`` patch tag found in *filename*.

    Multi-digit row/column indices are accepted, in line with the
    ``grid_<N>x<M>`` pattern of ``get_group``; the earlier single-digit
    pattern truncated or missed indices of 10 and above.

    Returns ``'orig'`` when the name carries no patch tag.
    """
    hit = re.search(r'row\d+_col\d+', filename)
    if hit is None:
        return 'orig'
    return hit.group()

# Load the CLIP processor and model from one checkpoint name; the model is
# placed on the GPU when one is available (CPU otherwise) via device_map.
_checkpoint = "openai/clip-vit-base-patch32"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = CLIPProcessor.from_pretrained(_checkpoint)
# The optional attn_implementation="flash_attention_2" argument is not passed.
model = CLIPModel.from_pretrained(_checkpoint, device_map=device)

def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        base64_string: Base64 text (or bytes) holding an encoded image file.
        target_size: When positive, the image is shrunk in place with
            ``Image.thumbnail`` so that neither side exceeds this many
            pixels. The default of -1 leaves the size untouched.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode.
    """
    payload = io.BytesIO(base64.b64decode(base64_string))
    image = Image.open(payload)
    # Convert any non-RGB mode (grayscale, CMYK, LA, ...), not only RGBA/P,
    # so callers always receive a three-channel image.
    if image.mode != 'RGB':
        image = image.convert('RGB')
    if target_size > 0:
        image.thumbnail((target_size, target_size))
    return image

class MultiLabelDataset(Dataset):
    """Image/question pairs gathered from every TSV file matching a glob pattern.

    Each TSV is read with tab separators and must provide an ``image`` column
    (base64-encoded image) and a ``question`` column. Three bookkeeping
    columns are added per file: ``file`` (source path), ``group`` (grid tag
    from the file name) and ``group_patch`` (``<group>_<patch tag>``).

    Args:
        file_pattern: Glob pattern selecting the TSV files to load.
        processor: Processor used to turn an (image, question) pair into
            model inputs.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """

    def __init__(self, file_pattern, processor):
        # glob returns files in arbitrary order; sort so the row order (and
        # therefore the order of any saved scores) is reproducible.
        tsv_paths = sorted(glob.glob(file_pattern))
        if not tsv_paths:
            raise ValueError(f"No TSV files match pattern: {file_pattern!r}")

        frames = []
        for path in tsv_paths:
            stem = os.path.basename(path).replace('.tsv', '')
            group = get_group(stem)
            frame = pd.read_csv(path, sep='\t')
            frame['file'] = path
            frame['group'] = group
            # Both parts are constant for a file, so no per-row apply is needed.
            frame['group_patch'] = f"{group}_{get_id(stem)}"
            frames.append(frame)
        self.df = pd.concat(frames, axis=0, ignore_index=True)

        # Keep the processor that was passed in instead of relying on a global.
        self.processor = processor

        # Not used by __getitem__ (the processor does its own resizing);
        # kept so the attribute remains available.
        size = processor.image_processor.crop_size["width"]
        self.transform = Compose([
            Resize((size, size)),
            ToTensor(),
        ])

    def __getitem__(self, idx):
        """Return the processor outputs for row ``idx`` without the batch axis."""
        image = decode_base64_to_image(self.df.at[idx, 'image'])
        question = self.df.at[idx, 'question']
        # truncation=True keeps every sample at the same token length so the
        # default collate function can stack over-long questions too.
        inputs = self.processor(
            text=question,
            images=image,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

    def __len__(self):
        """Return the total number of rows across all loaded files."""
        return len(self.df)

# Score every (image, question) row of the RealWorldQA TSV files with the
# loaded model and write the table plus score columns to res_clip.csv.
file_pattern = '/home/srikapan/LMUData/RealWorldQA*.tsv'
train_dataset = MultiLabelDataset(file_pattern, processor)

batch_size = 32
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep row order so scores line up with train_dataset.df
    num_workers=16,
)

print('start inferencing')
scores = []
logits = []
with torch.no_grad():
    for idx, batch in enumerate(tqdm(dataloader, desc="Processing Batches", unit="batch")):
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        # Entry [i, i] of logits_per_image pairs image i with its own question.
        pair_logits = outputs.logits_per_image.diagonal()
        scores.extend(torch.sigmoid(pair_logits).cpu().tolist())
        logits.extend(pair_logits.cpu().tolist())
# NOTE(review): CLIP-style logits are temperature-scaled similarities trained
# with a softmax objective, so their sigmoid tends to saturate near 1.0.
# `scores` is kept unchanged for compatibility; the raw `logits` column keeps
# the ranking information.
train_dataset.df['scores'] = scores
train_dataset.df['logits'] = logits
train_dataset.df.to_csv('res_clip.csv', index=False)

In [2]:
import os
import sys
# Proxy configuration for outbound HTTP(S) requests made by this process;
# `localhost` is excluded from proxying.
# NOTE(review): set before the library imports below, presumably so that
# model downloads go through the proxy — confirm the address is still valid.
os.environ['no_proxy']='localhost'
os.environ['http_proxy']='http://10.8.0.169:3128'
os.environ['https_proxy']='http://10.8.0.169:3128'

import re
import io
import glob
import base64
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from torchvision.transforms import Compose, Resize, ToTensor, Normalize

from transformers import AltCLIPModel, AltCLIPProcessor


def get_group(filename):
    """Return the first ``grid_<N>x<M>`` tag in *filename*, defaulting to ``'orig'``."""
    tags = re.findall(r'grid_\d+x\d+', filename)
    return tags[0] if tags else 'orig'

def get_id(filename):
    """Return the ``row<R>_col<C>`` patch tag found in *filename*.

    Indices of any width are matched (``\\d+``), consistent with the grid
    pattern in ``get_group``; the former single-digit pattern failed on
    rows or columns numbered 10 and above.

    Returns ``'orig'`` when the name carries no patch tag.
    """
    found = re.search(r'row\d+_col\d+', filename)
    return 'orig' if found is None else found.group()

# Load the AltCLIP processor and model from one checkpoint name; the model is
# placed on the GPU when one is available (CPU otherwise) via device_map.
_checkpoint = "BAAI/AltCLIP"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AltCLIPProcessor.from_pretrained(_checkpoint)
model = AltCLIPModel.from_pretrained(_checkpoint, device_map=device)

def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        base64_string: Base64 text (or bytes) holding an encoded image file.
        target_size: When positive, the image is shrunk in place with
            ``Image.thumbnail`` so that neither side exceeds this many
            pixels. The default of -1 leaves the size untouched.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode.
    """
    decoded = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(decoded))
    # Any mode other than RGB (not just RGBA/P) is converted, so grayscale
    # or CMYK inputs also come out with three channels.
    if image.mode != 'RGB':
        image = image.convert('RGB')
    if target_size > 0:
        image.thumbnail((target_size, target_size))
    return image

class MultiLabelDataset(Dataset):
    """Image/question pairs gathered from every TSV file matching a glob pattern.

    Each TSV is read with tab separators and must provide an ``image`` column
    (base64-encoded image) and a ``question`` column. Three bookkeeping
    columns are added per file: ``file`` (source path), ``group`` (grid tag
    from the file name) and ``group_patch`` (``<group>_<patch tag>``).

    Args:
        file_pattern: Glob pattern selecting the TSV files to load.
        processor: Processor used to turn an (image, question) pair into
            model inputs.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """

    def __init__(self, file_pattern, processor):
        # glob returns files in arbitrary order; sort so the row order (and
        # therefore the order of any saved scores) is reproducible.
        tsv_paths = sorted(glob.glob(file_pattern))
        if not tsv_paths:
            raise ValueError(f"No TSV files match pattern: {file_pattern!r}")

        frames = []
        for path in tsv_paths:
            stem = os.path.basename(path).replace('.tsv', '')
            group = get_group(stem)
            frame = pd.read_csv(path, sep='\t')
            frame['file'] = path
            frame['group'] = group
            # Both parts are constant for a file, so no per-row apply is needed.
            frame['group_patch'] = f"{group}_{get_id(stem)}"
            frames.append(frame)
        self.df = pd.concat(frames, axis=0, ignore_index=True)

        # Keep the processor that was passed in instead of relying on a global.
        self.processor = processor

        # Not used by __getitem__ (the processor does its own resizing);
        # kept so the attribute remains available.
        size = processor.image_processor.crop_size["width"]
        self.transform = Compose([
            Resize((size, size)),
            ToTensor(),
        ])

    def __getitem__(self, idx):
        """Return the processor outputs for row ``idx`` without the batch axis."""
        image = decode_base64_to_image(self.df.at[idx, 'image'])
        question = self.df.at[idx, 'question']
        # truncation=True keeps every sample at the same token length so the
        # default collate function can stack over-long questions too.
        inputs = self.processor(
            text=question,
            images=image,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

    def __len__(self):
        """Return the total number of rows across all loaded files."""
        return len(self.df)

# Score every (image, question) row of the RealWorldQA TSV files with the
# loaded model and write the table plus score columns to res_alt.csv.
file_pattern = '/home/srikapan/LMUData/RealWorldQA*.tsv'
train_dataset = MultiLabelDataset(file_pattern, processor)

batch_size = 32
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep row order so scores line up with train_dataset.df
    num_workers=16,
)

print('start inferencing')
scores = []
logits = []
with torch.no_grad():
    for idx, batch in enumerate(tqdm(dataloader, desc="Processing Batches", unit="batch")):
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        # Entry [i, i] of logits_per_image pairs image i with its own question.
        pair_logits = outputs.logits_per_image.diagonal()
        scores.extend(torch.sigmoid(pair_logits).cpu().tolist())
        logits.extend(pair_logits.cpu().tolist())
# NOTE(review): CLIP-style logits are temperature-scaled similarities trained
# with a softmax objective, so their sigmoid tends to saturate near 1.0.
# `scores` is kept unchanged for compatibility; the raw `logits` column keeps
# the ranking information.
train_dataset.df['scores'] = scores
train_dataset.df['logits'] = logits
train_dataset.df.to_csv('res_alt.csv', index=False)

preprocessor_config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

start inferencing




Processing Batches:   0%|          | 0/431 [00:00<?, ?batch/s]

In [1]:
import os
import sys
# Proxy configuration for outbound HTTP(S) requests made by this process;
# `localhost` is excluded from proxying.
# NOTE(review): set before the library imports below, presumably so that
# model downloads go through the proxy — confirm the address is still valid.
os.environ['no_proxy']='localhost'
os.environ['http_proxy']='http://10.8.0.169:3128'
os.environ['https_proxy']='http://10.8.0.169:3128'

import re
import io
import glob
import base64
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from torchvision.transforms import Compose, Resize, ToTensor, Normalize

from transformers import AlignProcessor, AlignModel


def get_group(filename):
    """Look up the ``grid_<N>x<M>`` tag in *filename*; return ``'orig'`` if not present."""
    grid_tag = re.compile(r'grid_\d+x\d+').search(filename)
    if grid_tag:
        return grid_tag.group()
    return 'orig'

def get_id(filename):
    """Return the ``row<R>_col<C>`` patch tag found in *filename*.

    Row and column numbers may span several digits, as the grid pattern in
    ``get_group`` already allows; the old single-digit pattern broke for
    grids with 10 or more rows/columns.

    Returns ``'orig'`` when the name carries no patch tag.
    """
    patch_tag = re.search(r'row\d+_col\d+', filename)
    if patch_tag is not None:
        return patch_tag.group()
    return 'orig'

# Load the ALIGN processor and model from one checkpoint name; the model is
# placed on the GPU when one is available (CPU otherwise) via device_map.
_checkpoint = "kakaobrain/align-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AlignProcessor.from_pretrained(_checkpoint)
model = AlignModel.from_pretrained(_checkpoint, device_map=device)


def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        base64_string: Base64 text (or bytes) holding an encoded image file.
        target_size: When positive, the image is shrunk in place with
            ``Image.thumbnail`` so that neither side exceeds this many
            pixels. The default of -1 leaves the size untouched.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode.
    """
    buffer = io.BytesIO(base64.b64decode(base64_string))
    image = Image.open(buffer)
    # Force RGB for every other mode (the old RGBA/P-only check let
    # grayscale, LA and CMYK images through unconverted).
    if image.mode != 'RGB':
        image = image.convert('RGB')
    if target_size > 0:
        image.thumbnail((target_size, target_size))
    return image

class MultiLabelDataset(Dataset):
    """Image/question pairs gathered from every TSV file matching a glob pattern.

    Each TSV is read with tab separators and must provide an ``image`` column
    (base64-encoded image) and a ``question`` column. Three bookkeeping
    columns are added per file: ``file`` (source path), ``group`` (grid tag
    from the file name) and ``group_patch`` (``<group>_<patch tag>``).

    Args:
        file_pattern: Glob pattern selecting the TSV files to load.
        processor: Processor used to turn an (image, question) pair into
            model inputs.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """

    def __init__(self, file_pattern, processor):
        # glob returns files in arbitrary order; sort so the row order (and
        # therefore the order of any saved scores) is reproducible.
        tsv_paths = sorted(glob.glob(file_pattern))
        if not tsv_paths:
            raise ValueError(f"No TSV files match pattern: {file_pattern!r}")

        frames = []
        for path in tsv_paths:
            stem = os.path.basename(path).replace('.tsv', '')
            group = get_group(stem)
            frame = pd.read_csv(path, sep='\t')
            frame['file'] = path
            frame['group'] = group
            # Both parts are constant for a file, so no per-row apply is needed.
            frame['group_patch'] = f"{group}_{get_id(stem)}"
            frames.append(frame)
        self.df = pd.concat(frames, axis=0, ignore_index=True)

        # Keep the processor that was passed in instead of relying on a global.
        self.processor = processor

        # Not used by __getitem__ (the processor does its own resizing);
        # kept so the attribute remains available.
        size = processor.image_processor.crop_size["width"]
        self.transform = Compose([
            Resize((size, size)),
            ToTensor(),
        ])

    def __getitem__(self, idx):
        """Return the processor outputs for row ``idx`` without the batch axis."""
        image = decode_base64_to_image(self.df.at[idx, 'image'])
        question = self.df.at[idx, 'question']
        # truncation=True keeps every sample at the same token length so the
        # default collate function can stack over-long questions too.
        inputs = self.processor(
            text=question,
            images=image,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

    def __len__(self):
        """Return the total number of rows across all loaded files."""
        return len(self.df)

# Score every (image, question) row of the RealWorldQA TSV files with the
# loaded model and write the table plus score columns to res_align.csv.
file_pattern = '/home/srikapan/LMUData/RealWorldQA*.tsv'
train_dataset = MultiLabelDataset(file_pattern, processor)

batch_size = 32
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep row order so scores line up with train_dataset.df
    num_workers=16,
)

print('start inferencing')
scores = []
logits = []
with torch.no_grad():
    for idx, batch in enumerate(tqdm(dataloader, desc="Processing Batches", unit="batch")):
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        # Entry [i, i] of logits_per_image pairs image i with its own question.
        pair_logits = outputs.logits_per_image.diagonal()
        scores.extend(torch.sigmoid(pair_logits).cpu().tolist())
        logits.extend(pair_logits.cpu().tolist())
# NOTE(review): ALIGN logits are temperature-scaled similarities trained with
# a softmax-style contrastive objective, so their sigmoid tends to saturate
# near 1.0. `scores` is kept unchanged for compatibility; the raw `logits`
# column keeps the ranking information.
train_dataset.df['scores'] = scores
train_dataset.df['logits'] = logits
train_dataset.df.to_csv('res_align.csv', index=False)

preprocessor_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/690M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/690M [00:00<?, ?B/s]

start inferencing




Processing Batches:   0%|          | 0/431 [00:00<?, ?batch/s]