CREATE SEPARATE 10K IMAGE TRAIN DATA

CREATE JSON FILE

In [None]:
import csv
import json

def csv_to_json(csv_file, json_file, include_output=False):
    """Convert a CSV of image prompts into a chat-style JSON file.

    Each CSV row becomes one ``{"messages": [...]}`` record holding a single
    user turn: an image entry (``image_path`` column) followed by a text
    entry (``user_input`` column).  The list of records is written to
    ``json_file`` as UTF-8 JSON with 4-space indentation and non-ASCII
    characters left unescaped.

    Args:
        csv_file: Path of the input CSV.  It must have a header row that
            contains ``image_path`` and ``user_input`` (and ``output`` when
            ``include_output`` is true).  Any other column is ignored.
        json_file: Path of the JSON file to create or overwrite.
        include_output: When true, append an assistant turn whose text comes
            from the ``output`` column.  Defaults to False (user turn only).

    Raises:
        KeyError: If a required column is missing from the CSV header.
    """
    json_data = []

    # 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM
    # (common for spreadsheet exports); with plain 'utf-8' the BOM would be
    # glued onto the first header name and break the column lookup.
    with open(csv_file, newline='', encoding='utf-8-sig') as file:
        for row in csv.DictReader(file):
            turns = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": row['image_path']},
                        {"type": "text", "text": row['user_input']},
                    ],
                }
            ]
            if include_output:
                turns.append(
                    {
                        "role": "assistant",
                        "content": [
                            {"type": "text", "text": row['output']},
                        ],
                    }
                )
            json_data.append({"messages": turns})

    with open(json_file, 'w', encoding='utf-8') as jsonfile:
        json.dump(json_data, jsonfile, ensure_ascii=False, indent=4)

# Input CSV and the JSON file generated from it (both relative to the
# current working directory).
# NOTE(review): train() further down loads "output.json", not this file —
# confirm which JSON each step is meant to consume.
csv_file_path = 'test_data.csv'
json_file_path = 'output_test.json'

# Run the conversion as soon as this cell executes; an existing file at
# json_file_path is overwritten.
csv_to_json(csv_file_path, json_file_path)

print(f"JSON data has been written to {json_file_path}")


In [None]:
%pip install accelerate==0.34.0 \
attrs==24.2.0 \
av==13.0.0 \
certifi==2024.8.30 \
charset-normalizer==3.3.2 \
einops==0.8.0 \
fancycompleter==0.9.1 \
filelock==3.15.4 \
flash-attn==2.6.3 \
fsspec==2024.9.0 \
huggingface-hub==0.24.6 \
idna==3.8 \
Jinja2==3.1.4 \
MarkupSafe==2.1.5 \
mpmath==1.3.0 \
networkx==3.3 \
numpy==2.1.1 \
nvidia-cublas-cu12==12.1.3.1 \
nvidia-cuda-cupti-cu12==12.1.105 \
nvidia-cuda-nvrtc-cu12==12.1.105 \
nvidia-cuda-runtime-cu12==12.1.105 \
nvidia-cudnn-cu12==9.1.0.70 \
nvidia-cufft-cu12==11.0.2.54 \
nvidia-curand-cu12==10.3.2.106 \
nvidia-cusolver-cu12==11.4.5.107 \
nvidia-cusparse-cu12==12.1.0.106 \
nvidia-nccl-cu12==2.20.5 \
nvidia-nvjitlink-cu12==12.6.68 \
nvidia-nvtx-cu12==12.1.105 \
packaging==24.1 \
pdbpp==0.10.3 \
pillow==10.4.0 \
psutil==6.0.0 \
Pygments==2.18.0 \
pyrepl==0.9.0 \
PyYAML==6.0.2 \
regex==2024.7.24 \
requests==2.32.3 \
safetensors==0.4.5 \
socksio==1.0.0 \
sympy==1.13.2 \
tokenizers==0.19.1 \
torch==2.4.1 \
torchvision==0.19.1 \
tqdm==4.66.5 \
triton==3.0.0 \
typing_extensions==4.12.2 \
urllib3==2.2.2 \
wmctrl==0.5 \
git+https://github.com/huggingface/transformers@98f5463fee0009538eb1ca9f7e466c6145e3a731


In [None]:
# NOTE(review): this installs transformers at commit 21fac7ab..., while the
# pinned install cell above requests commit 98f5463f...; whichever cell runs
# last decides the installed version — confirm which commit is intended.
pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate


In [None]:
import torch
import json
import datetime
import os
from PIL import Image

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from torch.utils.data import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from functools import partial
from util.vision_util import process_vision_info
from util.logutil import init_logger, get_logger

# Per-run output directory, named after the current local time
# (YYYYmmddHHMMSS).  It is handed to the logger setup below and is where
# train() saves the model, the processor and the chat template.
output_dir = f'./train_output/{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}/'
# init_logger / get_logger come from the project-local util.logutil module;
# presumably init_logger configures logging under output_dir — TODO confirm.
init_logger(output_dir)
logger = get_logger()

# Device that train() passes to collate_fn, which moves each batch onto it.
device = "cuda:0"

class ToyDataSet(Dataset):
    """Dataset over a JSON file holding a list of chat records.

    Each record is expected to be a dict with a ``messages`` list.  Items
    are returned as the stored dict with an extra ``images`` key listing the
    image paths found in the user turns.
    """

    def __init__(self, data_path):
        """Load all records from the JSON file at ``data_path`` into memory."""
        super().__init__()
        # Decode explicitly as UTF-8: JSON written with ensure_ascii=False
        # (as csv_to_json does) contains raw non-ASCII characters, which the
        # platform default encoding may not be able to read.
        with open(data_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` with its image paths collected under ``images``.

        Returns:
            The record dict (mutated in place to carry ``images``), or None
            when an image path is empty / not an existing file or the record
            is malformed.  collate_fn drops None items.
        """
        item = self.data[idx]
        images = []
        try:
            # Collect the image paths referenced by user turns.
            for message in item.get('messages', []):
                if message.get('role') != 'user':
                    continue
                for content in message.get('content', []):
                    if content.get('type') != 'image':
                        continue
                    image_path = content.get('image')
                    if not image_path or not os.path.isfile(image_path):
                        logger.warning(f"Image file {image_path} not found. Skipping this item.")
                        return None
                    images.append(image_path)
        except (KeyError, AttributeError, TypeError) as e:
            # Only .get() lookups are used above, so a malformed record shows
            # up as AttributeError (non-dict entry) or TypeError (null list)
            # rather than KeyError; skip it instead of crashing the loader.
            logger.error(f"Malformed data item at index {idx}: {e!r}. Skipping this item.")
            return None

        item['images'] = images
        return item

def find_assistant_content_sublist_indexes(l):
    """Locate assistant segments in a list of token ids.

    A segment starts where id 151644 is immediately followed by id 77091 and
    ends at the first id 151645 found at least two positions after the start
    (presumably the Qwen2 tokenizer ids for ``<|im_start|>``, ``assistant``
    and ``<|im_end|>`` — confirm against the tokenizer in use).

    Returns:
        A list of ``(start, end)`` index pairs, in order of appearance.  A
        start that has no end marker after it yields no pair.
    """
    start_marker, role_marker, end_marker = 151644, 77091, 151645
    total = len(l)
    spans = []

    for start in range(total - 1):
        if l[start] != start_marker or l[start + 1] != role_marker:
            continue
        # First end marker after the two-token header, if there is one.
        end = next(
            (pos for pos in range(start + 2, total) if l[pos] == end_marker),
            None,
        )
        if end is not None:
            spans.append((start, end))

    return spans

def collate_fn(batch, processor, device):
    """Turn a list of dataset items into model inputs and training labels.

    Items that are None (skipped by the dataset) are dropped.  The remaining
    conversations are rendered with the processor's chat template, their
    vision inputs are extracted with ``process_vision_info``, and everything
    is tokenized/padded into one batch that is moved to ``device``.

    Labels are a copy of ``input_ids`` in which every position is set to
    -100 except the assistant segments found by
    ``find_assistant_content_sublist_indexes``: for each ``(start, end)``
    pair, positions ``start + 2`` through ``end`` inclusive keep their token
    id, i.e. the two-token segment header is masked and the end marker is
    kept.

    Args:
        batch: Items produced by the dataset; each is a dict with a
            ``messages`` entry, or None.
        processor: Processor providing ``apply_chat_template`` and the
            tokenizing/padding ``__call__``.
        device: Device the inputs and labels are placed on.

    Returns:
        ``(inputs, labels)``, or ``(None, None)`` when the batch is empty
        after filtering or any processing step fails (the error is logged).
    """
    batch = [item for item in batch if item is not None]  # Remove None items

    if not batch:
        logger.warning("Empty batch encountered after filtering out None items. Skipping this batch.")
        return None, None

    try:
        messages = [m['messages'] for m in batch]
        texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) for msg in messages]

        # process_vision_info is the single source of image/video inputs.
        # The images are not pre-loaded here: doing so only decoded every
        # file a second time and the result was discarded.
        try:
            image_inputs, video_inputs = process_vision_info(messages)
        except Exception as e:
            logger.error(f"Error processing vision info: {e}")
            return None, None

        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        inputs = inputs.to(device)

        input_ids_lists = inputs['input_ids'].tolist()
        assert len(messages) == len(input_ids_lists)

        labels_list = []
        for ids_list in input_ids_lists:
            # -100 marks positions that must not contribute to the loss.
            label_ids = [-100] * len(ids_list)
            for begin, end in find_assistant_content_sublist_indexes(ids_list):
                label_ids[begin + 2:end + 1] = ids_list[begin + 2:end + 1]
            labels_list.append(label_ids)

        # Create the labels on the same device as the inputs so the pair is
        # consistent when handed to the model.
        labels_ids = torch.tensor(labels_list, dtype=torch.int64, device=device)

        return inputs, labels_ids

    except Exception as e:
        # Best effort: a bad batch is logged and skipped by the training loop.
        logger.error(f"Error processing batch: {e}")
        return None, None

def write_chat_template(processor, output_dir):
    """Save the processor's chat template as ``chat_template.json``.

    The file is written into ``output_dir`` as a UTF-8 JSON object with the
    single key ``chat_template`` (2-space indent, sorted keys, trailing
    newline), and the destination path is logged.
    """
    target_path = os.path.join(output_dir, "chat_template.json")
    payload = {"chat_template": processor.chat_template}
    # Serialize before opening the file so a serialization failure does not
    # leave a truncated file behind.
    serialized = f"{json.dumps(payload, indent=2, sort_keys=True)}\n"
    with open(target_path, "w", encoding="utf-8") as writer:
        writer.write(serialized)
        logger.info(f"Chat template saved in {target_path}")

def train():
    """Fine-tune Qwen2-VL-2B-Instruct on ``output.json`` and save the result.

    Loads the model in bfloat16 with flash-attention 2 and the matching
    processor, iterates the dataset in batches of 3 for 2 epochs, and
    updates the weights with AdamW (lr 1e-5) using gradient accumulation
    over ``NUM_ACCUMULATION_STEPS`` batches.  Batches that fail to collate
    or raise during the forward/backward pass are logged and skipped.
    Finally the model, the processor and the chat template are written to
    the module-level ``output_dir``.
    """
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map="auto"
    )

    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=256*28*28, max_pixels=512*28*28, padding_side="right")

    train_loader = DataLoader(
        ToyDataSet("output.json"),
        batch_size=3,
        collate_fn=partial(collate_fn, processor=processor, device=device),
        drop_last=True  # Ensure that incomplete batches are dropped
    )

    model.train()
    epochs = 2
    optimizer = AdamW(model.parameters(), lr=1e-5)
    NUM_ACCUMULATION_STEPS = 2
    for epoch in range(epochs):
        accumulated_avg_loss = 0
        steps = 0
        # Number of successfully back-propagated batches whose gradients have
        # not been applied yet.  Counting these (instead of testing
        # `steps % N == 0`) avoids stepping after the very first batch and
        # keeps the window aligned when a batch is skipped or fails.
        pending_batches = 0
        for batch in train_loader:
            if batch is None:
                continue  # Skip processing if batch is None

            try:
                inputs, labels = batch
                if inputs is None or labels is None:
                    logger.warning("Skipping batch due to None inputs or labels.")
                    continue

                outputs = model(**inputs, labels=labels)

                # Scale so the summed gradients equal the average over the
                # accumulation window.
                loss = outputs.loss / NUM_ACCUMULATION_STEPS
                accumulated_avg_loss += loss.item()
                loss.backward()
                pending_batches += 1

                if pending_batches == NUM_ACCUMULATION_STEPS:
                    logger.info(f"Batch {steps} of epoch {epoch + 1}/{epochs}, average training loss of previous {NUM_ACCUMULATION_STEPS} batches: {accumulated_avg_loss}")
                    accumulated_avg_loss = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    pending_batches = 0

                # Free GPU memory
                torch.cuda.empty_cache()

            except Exception as e:
                # Best effort: log the failure and move on to the next batch.
                logger.error(f"Error processing batch {steps}: {e}")

            steps += 1

        # Apply gradients left over from an incomplete accumulation window so
        # they neither leak into the next epoch nor get dropped at the end.
        if pending_batches:
            logger.info(f"End of epoch {epoch + 1}/{epochs}: applying gradients of the last {pending_batches} batch(es), accumulated scaled loss: {accumulated_avg_loss}")
            optimizer.step()
            optimizer.zero_grad()

    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)
    write_chat_template(processor, output_dir)

# Start training only when this code runs as the main program (a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    train()


In [None]:
import zipfile
import os

def zip_folder(folder_name, output_name):
    """Pack every file under ``folder_name`` into a deflate-compressed zip.

    The directory tree is walked recursively and each file is stored under
    its path relative to ``folder_name``.  Only files are added, so empty
    directories do not appear in the archive.  An existing ``output_name``
    is overwritten.
    """
    # Lazily yields the path of each file in the tree, in os.walk order.
    file_paths = (
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(folder_name)
        for filename in filenames
    )
    with zipfile.ZipFile(output_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for path in file_paths:
            archive.write(path, arcname=os.path.relpath(path, folder_name))

# Archive one specific training run.  The timestamped folder name is
# hard-coded, so it has to be changed to the output directory of the run
# that should be packaged.
zip_folder('./train_output/20240915191425', 'train_output.zip')
