In [5]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import ViltProcessor, ViltModel
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Custom dataset
class TimeSeriesDataset(Dataset):
    """Map-style dataset yielding ``(encoding, target)`` pairs for regression.

    Each item runs ``processor`` on one image/text pair and pairs the result
    with the matching entry of ``targets`` converted to a float tensor.

    Args:
        images: Indexable collection of images; its length defines the
            dataset length.
        texts: Indexable collection of texts, aligned with ``images``.
        targets: Indexable collection of numeric targets, aligned with
            ``images``.
        processor: Callable invoked as ``processor(text=..., images=...,
            padding=..., truncation=..., max_length=..., return_tensors="pt")``
            and returning a mapping of tensors with a leading batch axis.
        max_length: Fixed token length every text is padded/truncated to, so
            that items can be stacked into a batch. The default of 40 is
            assumed to match the text position limit of the ViLT checkpoint
            used with this dataset -- TODO confirm when using another model.
    """

    def __init__(self, images, texts, targets, processor, max_length=40):
        self.images = images
        self.texts = texts
        self.targets = targets
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (taken from ``images``)."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(encoding, target)`` for the sample at position ``idx``."""
        # Pad/truncate every text to the same length: without it, texts that
        # tokenize to different lengths produce tensors that cannot be
        # stacked when a DataLoader batches several items together.
        encoding = self.processor(text=self.texts[idx],
                                  images=self.images[idx],
                                  padding="max_length",
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors="pt")
        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also remove any other axis of size 1.
        for key in list(encoding.keys()):
            encoding[key] = encoding[key].squeeze(0)
        # NOTE(review): images whose processed sizes differ would presumably
        # still fail to stack in the default collate -- confirm all images
        # share one size, or batch them with a custom collate_fn.
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return encoding, target

In [7]:

# Regression model
class ViltForRegression(nn.Module):
    """Wraps a base model with a single-output linear regression head.

    Args:
        base_model: Module exposing ``config.hidden_size`` and returning an
            object with a ``pooler_output`` attribute when called.
    """

    def __init__(self, base_model):
        super().__init__()
        self.vilt = base_model
        # One scalar prediction per pooled feature vector.
        self.regressor = nn.Linear(self.vilt.config.hidden_size, 1)

    def forward(self, pixel_values, input_ids, attention_mask,
                token_type_ids=None, pixel_mask=None):
        """Return the regression output computed from the pooled features.

        Args:
            pixel_values: Image tensor forwarded to the base model.
            input_ids: Token id tensor forwarded to the base model.
            attention_mask: Text attention mask forwarded to the base model.
            token_type_ids: Optional; forwarded to the base model only when
                given.
            pixel_mask: Optional; forwarded to the base model only when given
                (useful when images in a batch were padded to a common size).

        Returns:
            The output of the linear head applied to ``pooler_output``; its
            last dimension has size 1.
        """
        # Forward the optional inputs only when supplied so the call made to
        # the base model is unchanged for existing three-argument callers.
        optional_inputs = {}
        if token_type_ids is not None:
            optional_inputs["token_type_ids"] = token_type_ids
        if pixel_mask is not None:
            optional_inputs["pixel_mask"] = pixel_mask

        outputs = self.vilt(pixel_values=pixel_values,
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            **optional_inputs)
        return self.regressor(outputs.pooler_output)

In [8]:

# Seed torch so the regression head's initial weights and the DataLoader
# shuffle order are repeatable across runs (same value as the split's
# random_state below).
torch.manual_seed(42)

# Load the processor and the model
processor = ViltProcessor.from_pretrained('dandelin/vilt-b32-mlm')
base_model = ViltModel.from_pretrained('dandelin/vilt-b32-mlm')
model = ViltForRegression(base_model)

# Place the model on its device before the optimizer is built from its
# parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare the data (images, texts and targets are assumed to have been
# prepared already; this cell raises NameError if they have not)
train_images, val_images, train_texts, val_texts, train_targets, val_targets = train_test_split(
    images, texts, targets, test_size=0.2, random_state=42
)

train_dataset = TimeSeriesDataset(train_images, train_texts, train_targets, processor)
val_dataset = TimeSeriesDataset(val_images, val_texts, val_targets, processor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 10
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    num_train_samples = 0
    # The per-batch targets get their own name so the notebook-level
    # `targets` collection is not overwritten by a batch tensor.
    for encoding, batch_targets in train_loader:
        pixel_values = encoding['pixel_values'].to(device)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing size-1 output axis; a bare
        # squeeze() would also collapse the batch axis of a one-sample batch,
        # leaving a 0-d tensor that no longer matches the targets' shape.
        outputs = model(pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask).squeeze(-1)

        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        # Weight each batch's mean loss by its size so a smaller final batch
        # does not skew the epoch average.
        batch_size = batch_targets.size(0)
        total_loss += loss.item() * batch_size
        num_train_samples += batch_size

    avg_train_loss = total_loss / num_train_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0.0
    num_val_samples = 0
    with torch.no_grad():
        for encoding, batch_targets in val_loader:
            pixel_values = encoding['pixel_values'].to(device)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            batch_targets = batch_targets.to(device)

            outputs = model(pixel_values=pixel_values,
                            input_ids=input_ids,
                            attention_mask=attention_mask).squeeze(-1)

            loss = criterion(outputs, batch_targets)
            batch_size = batch_targets.size(0)
            total_val_loss += loss.item() * batch_size
            num_val_samples += batch_size

    avg_val_loss = total_val_loss / num_val_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}")

# Evaluation
# Assumes test_loader exists
model.eval()
all_preds = []
all_targets = []
with torch.no_grad():
    # The per-batch targets get their own name so the notebook-level
    # `targets` collection is not overwritten by a batch tensor.
    for encoding, batch_targets in test_loader:
        pixel_values = encoding['pixel_values'].to(device)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # squeeze(-1) keeps the batch axis: with a bare squeeze() a
        # one-sample batch becomes a 0-d array, and list.extend() below
        # raises TypeError because a 0-d array is not iterable.
        outputs = model(pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask).squeeze(-1)

        all_preds.extend(outputs.cpu().numpy())
        # Targets are only collected for the metrics, so they are not moved
        # to the model's device first.
        all_targets.extend(batch_targets.cpu().numpy())

mse = mean_squared_error(all_targets, all_preds)
mae = mean_absolute_error(all_targets, all_preds)
print(f"Test MSE: {mse}, Test MAE: {mae}")

# Prediction helper
def predict(image, text, model, processor, device):
    """Predict a single value for one image/text pair.

    Args:
        image: Image passed to ``processor`` as ``images``.
        text: Text passed to ``processor`` as ``text``.
        model: Callable model accepting ``pixel_values``, ``input_ids`` and
            ``attention_mask`` keyword arguments; it is switched to eval mode.
        processor: Callable returning a mapping that contains those three
            tensors when called with ``return_tensors="pt"``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The model output reduced to a Python number via ``.item()``.
    """
    encoding = processor(text=text, images=image, return_tensors="pt")
    # Only these three entries are forwarded to the model.
    model_inputs = {
        name: encoding[name].to(device)
        for name in ("pixel_values", "input_ids", "attention_mask")
    }

    model.eval()
    with torch.no_grad():
        raw_output = model(**model_inputs)
    return raw_output.squeeze().item()

# Example prediction
# Open the file in a `with` block so its handle is closed as soon as the RGB
# copy has been made, instead of staying open until garbage collection.
with Image.open('path_to_new_image.jpg') as image_file:
    new_image = image_file.convert('RGB')
new_text = "时序数据图像"
prediction = predict(new_image, new_text, model, processor, device)
print(f"预测值: {prediction}")

NameError: name 'images' is not defined

In [None]:

num_epochs = 10
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    num_train_samples = 0
    # The per-batch targets get their own name so the notebook-level
    # `targets` collection is not overwritten by a batch tensor.
    for encoding, batch_targets in train_loader:
        pixel_values = encoding['pixel_values'].to(device)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing size-1 output axis; a bare
        # squeeze() would also collapse the batch axis of a one-sample batch,
        # leaving a 0-d tensor that no longer matches the targets' shape.
        outputs = model(pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask).squeeze(-1)

        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        # Weight each batch's mean loss by its size so a smaller final batch
        # does not skew the epoch average.
        batch_size = batch_targets.size(0)
        total_loss += loss.item() * batch_size
        num_train_samples += batch_size

    avg_train_loss = total_loss / num_train_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0.0
    num_val_samples = 0
    with torch.no_grad():
        for encoding, batch_targets in val_loader:
            pixel_values = encoding['pixel_values'].to(device)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            batch_targets = batch_targets.to(device)

            outputs = model(pixel_values=pixel_values,
                            input_ids=input_ids,
                            attention_mask=attention_mask).squeeze(-1)

            loss = criterion(outputs, batch_targets)
            batch_size = batch_targets.size(0)
            total_val_loss += loss.item() * batch_size
            num_val_samples += batch_size

    avg_val_loss = total_val_loss / num_val_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}")


NameError: name 'model' is not defined

In [None]:
# Evaluation
# Assumes test_loader exists
model.eval()
all_preds = []
all_targets = []
with torch.no_grad():
    # The per-batch targets get their own name so the notebook-level
    # `targets` collection is not overwritten by a batch tensor.
    for encoding, batch_targets in test_loader:
        pixel_values = encoding['pixel_values'].to(device)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # squeeze(-1) keeps the batch axis: with a bare squeeze() a
        # one-sample batch becomes a 0-d array, and list.extend() below
        # raises TypeError because a 0-d array is not iterable.
        outputs = model(pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask).squeeze(-1)

        all_preds.extend(outputs.cpu().numpy())
        # Targets are only collected for the metrics, so they are not moved
        # to the model's device first.
        all_targets.extend(batch_targets.cpu().numpy())

mse = mean_squared_error(all_targets, all_preds)
mae = mean_absolute_error(all_targets, all_preds)
print(f"Test MSE: {mse}, Test MAE: {mae}")

# Prediction helper
# NOTE(review): an earlier cell of this notebook already defines `predict`
# with the same behavior; consider keeping a single definition.
def predict(image, text, model, processor, device):
    """Predict a single value for one image/text pair.

    Args:
        image: Image passed to ``processor`` as ``images``.
        text: Text passed to ``processor`` as ``text``.
        model: Callable model accepting ``pixel_values``, ``input_ids`` and
            ``attention_mask`` keyword arguments; it is switched to eval mode.
        processor: Callable returning a mapping that contains those three
            tensors when called with ``return_tensors="pt"``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The model output reduced to a Python number via ``.item()``.
    """
    encoding = processor(text=text, images=image, return_tensors="pt")
    # Only these three entries are forwarded to the model.
    model_inputs = {
        name: encoding[name].to(device)
        for name in ("pixel_values", "input_ids", "attention_mask")
    }

    model.eval()
    with torch.no_grad():
        raw_output = model(**model_inputs)
    return raw_output.squeeze().item()

# Example prediction
# Open the file in a `with` block so its handle is closed as soon as the RGB
# copy has been made, instead of staying open until garbage collection.
with Image.open('path_to_new_image.jpg') as image_file:
    new_image = image_file.convert('RGB')
new_text = "时序数据图像"
prediction = predict(new_image, new_text, model, processor, device)
print(f"预测值: {prediction}")