In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import h5py
from tqdm import tqdm

class CLIPEmbeddingDataset(Dataset):
    def __init__(self, h5_file, tokenizer, max_length=512):
        self.h5_file = h5_file
        self.tokenizer = tokenizer
        self.max_length = max_length
        with h5py.File(self.h5_file, 'r') as hf:
            self.length = len(hf['image_embeddings'])

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        with h5py.File(self.h5_file, 'r') as hf:
            clip_embedding = hf['image_embeddings'][idx]
            text = hf['text'][idx]
            
            clip_embedding = torch.tensor(clip_embedding, dtype=torch.float32)
            
            if isinstance(text, bytes):
                text = text.decode('utf-8')

        encoded_text = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'clip_embedding': clip_embedding,
            'input_ids': encoded_text['input_ids'].squeeze(),
            'attention_mask': encoded_text['attention_mask'].squeeze()
        }

class ProjectionLayer(nn.Module):
    def __init__(self, clip_dim=512, phi_dim=2048):
        super().__init__()
        self.projection = nn.Linear(clip_dim, phi_dim)

    def forward(self, x):
        return self.projection(x)

def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    h5_file_path = 'clip_embeddings_150k.h5'
    
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
    tokenizer.pad_token = tokenizer.eos_token
    
    phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16).to(device)
    phi_embed = phi_model.model.embed_tokens

    dataset = CLIPEmbeddingDataset(h5_file_path, tokenizer)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

    projection_layer = ProjectionLayer().to(device)
    optimizer = torch.optim.AdamW(projection_layer.parameters(), lr=1e-4)
    cos_sim = nn.CosineSimilarity(dim=1)

    num_epochs = 5
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            clip_embeddings = batch['clip_embedding'].to(device)
            input_ids = batch['input_ids'].to(device)
            
            with torch.no_grad():
                phi_embeddings = phi_embed(input_ids)[:, 0, :]
            
            projected_embeddings = projection_layer(clip_embeddings)
            
            loss = 1 - cos_sim(projected_embeddings, phi_embeddings).mean()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {total_loss/len(dataloader):.4f}")
        print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    torch.save(projection_layer.state_dict(), 'trained_projection_layer.pth')
    print("Projection layer training complete!")

if __name__ == "__main__":
    main()

Epoch 1/5: 100%|██████████| 4929/4929 [00:54<00:00, 90.35it/s] 


Epoch 1/5, Avg Loss: 0.0209
GPU Memory: 2.97 GB


Epoch 2/5: 100%|██████████| 4929/4929 [00:52<00:00, 94.42it/s] 


Epoch 2/5, Avg Loss: -0.0000
GPU Memory: 2.97 GB


Epoch 3/5: 100%|██████████| 4929/4929 [00:54<00:00, 91.19it/s] 


Epoch 3/5, Avg Loss: -0.0002
GPU Memory: 2.97 GB


Epoch 4/5: 100%|██████████| 4929/4929 [00:52<00:00, 93.27it/s] 


Epoch 4/5, Avg Loss: -0.0002
GPU Memory: 2.97 GB


Epoch 5/5: 100%|██████████| 4929/4929 [00:50<00:00, 96.82it/s] 

Epoch 5/5, Avg Loss: -0.0002
GPU Memory: 2.97 GB
Projection layer training complete!





In [17]:
import torch
import torch.nn as nn

# Define the ProjectionLayer class
class ProjectionLayer(nn.Module):
    def __init__(self, clip_dim=512, phi_dim=2048):  # Adjust phi_dim if necessary
        super().__init__()
        self.projection = nn.Linear(clip_dim, phi_dim)

    def forward(self, x):
        return self.projection(x)

# Create an instance of the ProjectionLayer
projection_layer = ProjectionLayer()

# Load the state dict
state_dict = torch.load('trained_projection_layer.pth', weights_only=True)
projection_layer.load_state_dict(state_dict)

# Test it with a dummy input
dummy_clip_embedding = torch.randn(1, 512)  # Assuming CLIP embeddings are 512-dimensional
projected = projection_layer(dummy_clip_embedding)

print("Projection layer loaded successfully")
print(f"Input shape: {dummy_clip_embedding.shape}")
print(f"Output shape: {projected.shape}")

Projection layer loaded successfully
Input shape: torch.Size([1, 512])
Output shape: torch.Size([1, 2048])


In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer

phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

print("Phi model loaded successfully")

Phi model loaded successfully


In [19]:
import torch.nn as nn

class SimpleMultiModalPhi(nn.Module):
    def __init__(self, phi_model, projection_layer):
        super().__init__()
        self.phi = phi_model
        self.projection = projection_layer
    
    def forward(self, text_input, image_embedding=None):
        if image_embedding is not None:
            projected_image = self.projection(image_embedding)
            text_embeds = self.phi.get_input_embeddings()(text_input)
            combined_input = torch.cat([projected_image.unsqueeze(1), text_embeds], dim=1)
            output = self.phi(inputs_embeds=combined_input)
        else:
            output = self.phi(text_input)
        return output

combined_model = SimpleMultiModalPhi(phi_model, projection_layer)
print("Combined model created successfully")

Combined model created successfully


In [20]:
# Test with text-only input
text = "Hello, world!"
text_input = tokenizer(text, return_tensors="pt")['input_ids']
output = combined_model(text_input)
print("Text-only output shape:", output.logits.shape)

# Test with text + image input
dummy_image_embedding = torch.randn(1, 512)
output = combined_model(text_input, dummy_image_embedding)
print("Text + Image output shape:", output.logits.shape)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Text-only output shape: torch.Size([1, 4, 51200])
Text + Image output shape: torch.Size([1, 5, 51200])


In [None]:
from peft import LoraConfig, get_peft_model

def add_qlora_adapter(model):
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["query_key_value"],  # Adjust based on Phi model's architecture
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    return get_peft_model(model, config)

# Apply QLoRA to your combined model
qlora_model = add_qlora_adapter(combined_model)

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader

# Load LLaVA-Instruct-150K dataset
dataset = load_dataset("liuhaotian/LLaVA-Instruct-150K")

# Implement a custom dataset class to handle both text and image embeddings
class MultiModalDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, clip_embeddings, tokenizer):
        self.dataset = dataset
        self.clip_embeddings = clip_embeddings
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = f"Human: {item['conversations'][0]['value']}\nAssistant: {item['conversations'][1]['value']}"
        inputs = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        image_embedding = self.clip_embeddings[idx]
        
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "image_embedding": torch.tensor(image_embedding),
            "labels": inputs["input_ids"].squeeze()
        }

# Create dataset and dataloader
train_dataset = MultiModalDataset(dataset["train"], clip_embeddings, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Training loop
optimizer = torch.optim.AdamW(qlora_model.parameters(), lr=1e-5)

for epoch in range(3):  # Adjust number of epochs as needed
    for batch in tqdm(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        image_embeddings = batch["image_embedding"].to(device)
        labels = batch["labels"].to(device)

        outputs = qlora_model(input_ids=input_ids, attention_mask=attention_mask, image_embedding=image_embeddings, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} completed")

# Save the fine-tuned model
qlora_model.save_pretrained("path_to_save_model")