In [1]:
pip install jsonlines


Collecting jsonlines
  Using cached jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Using cached jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import jsonlines
import numpy as np
from PIL import Image
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW

In [8]:
# `device` must exist before the model is moved onto it; use the GPU when
# PyTorch can see one and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "vikhyatk/moondream2"
# trust_remote_code=True runs the modelling code shipped with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)


PhiForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


: 

In [4]:
# Root folder of the local DOCCI download. Set the DOCCI_DIR environment
# variable to use a copy stored elsewhere; when it is unset the original
# location is used, so the three paths below keep their previous values.
DOCCI_DIR = os.environ.get("DOCCI_DIR", r"C:\Users\shrey\Downloads\mini_project\docci")

DESCRIPTIONS_PATH = os.path.join(DOCCI_DIR, "docci_descriptions.jsonlines")
METADATA_PATH = os.path.join(DOCCI_DIR, "docci_metadata.jsonlines")
IMAGES_DIR = os.path.join(DOCCI_DIR, "images")

In [5]:
class DOCCIDataset(Dataset):
    """Map-style dataset pairing images with text read from two jsonlines files.

    Every record of ``descriptions_path`` becomes one sample. Records of
    ``metadata_path`` are indexed by their ``"example_id"`` so they can be
    looked up per sample.

    Args:
        descriptions_path: jsonlines file whose records provide the
            ``"image_file"``, ``"description"`` and ``"example_id"`` keys that
            ``__getitem__`` reads.
        metadata_path: jsonlines file whose records carry an ``"example_id"``
            key.
        images_dir: Directory that each ``"image_file"`` value is joined onto.
        tokenizer: Stored on the instance; this class does not call it.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, descriptions_path, metadata_path, images_dir, tokenizer, transform=None):
        self.images_dir = images_dir
        self.transform = transform
        self.tokenizer = tokenizer

        # One sample per description record, kept in file order.
        with jsonlines.open(descriptions_path, 'r') as reader:
            self.data = [record for record in reader]

        # Metadata records keyed by example id for constant-time lookup.
        with jsonlines.open(metadata_path, 'r') as reader:
            self.metadata = {line["example_id"]: line for line in reader}

    def __len__(self):
        """Return the number of description records loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'image', 'question', 'answer'}`` for the sample at ``idx``.

        Raises:
            KeyError: If the sample's example id is missing from the metadata,
                or its metadata record has no ``'answer'`` field.
        """
        sample = self.data[idx]
        image_path = os.path.join(self.images_dir, sample["image_file"])

        # Image.open keeps the underlying file open; convert() returns a fully
        # loaded copy, so the handle is released as soon as the copy exists.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        # The description text is used as the "question".
        question = sample['description']
        example_id = sample['example_id']

        # NOTE(review): assumes each metadata record has an 'answer' field;
        # confirm against the actual metadata schema.
        try:
            answer = self.metadata[example_id]['answer']
        except KeyError as exc:
            raise KeyError(
                f"no 'answer' available in metadata for example_id {example_id!r}"
            ) from exc

        return {'image': image, 'question': question, 'answer': answer}


In [6]:
from torchvision import transforms

# Preprocessing applied to every image, in order:
#   1. resize to a fixed 224x224 (adjust as needed),
#   2. convert to a tensor,
#   3. normalise each channel with the mean/std below (these look like the
#      standard ImageNet statistics).
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
# Build the dataset over the DOCCI files, then serve it in shuffled batches of 8.
train_dataset = DOCCIDataset(
    descriptions_path=DESCRIPTIONS_PATH,
    metadata_path=METADATA_PATH,
    images_dir=IMAGES_DIR,
    tokenizer=tokenizer,
    transform=transform,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

In [None]:
def evaluate_model(model, dataset, tokenizer, num_samples=100):
    """Score the model's answers on the first ``num_samples`` items of ``dataset``.

    Each item must be a single sample: a mapping with ``'image'``,
    ``'question'`` and ``'answer'`` entries, where the question and answer are
    strings. The model is queried through ``encode_image`` and
    ``answer_question``.

    Inference runs without gradient tracking. A model that reports
    ``training == True`` is switched to eval mode for the duration of the call
    and put back into train mode afterwards.

    Args:
        model: Object exposing ``encode_image`` and ``answer_question``.
        dataset: Iterable of samples.
        tokenizer: Forwarded to ``model.answer_question``.
        num_samples: Maximum number of samples to evaluate.

    Returns:
        ``(accuracy, avg_bleu)``. Accuracy is the fraction of predictions equal
        to the reference after stripping whitespace and lower-casing. BLEU is
        the mean sentence-level score over whitespace-split tokens. Both are 0
        when no sample was evaluated.
    """
    correct_predictions = 0
    total_predictions = 0
    bleu_scores = []

    # Only toggle modes when the model is actually in training mode, so
    # duck-typed models without a `training` attribute keep working.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        with torch.no_grad():
            for i, sample in enumerate(dataset):
                if i >= num_samples:
                    break

                question = sample['question']
                true_answer = sample['answer']

                # Predict answer using the model
                predicted_answer = model.answer_question(
                    model.encode_image(sample['image']),
                    question,
                    tokenizer=tokenizer,
                    num_beams=4,
                    no_repeat_ngram_size=5,
                    early_stopping=True
                )

                # Exact match, ignoring case and surrounding whitespace
                if predicted_answer.strip().lower() == true_answer.strip().lower():
                    correct_predictions += 1
                total_predictions += 1

                # Sentence-level BLEU over whitespace-split tokens
                bleu_scores.append(sentence_bleu([true_answer.split()], predicted_answer.split()))
    finally:
        if was_training:
            model.train()

    # Average metrics; both are guarded so an empty evaluation yields 0 rather
    # than a division error or a NaN from averaging an empty list.
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Average BLEU Score: {avg_bleu:.2f}")
    return accuracy, avg_bleu


In [None]:
# evaluate_model reads one sample per iteration (string question/answer), so it
# is given the dataset itself; iterating train_loader would instead yield
# collated batches of 8 samples.
baseline_accuracy, baseline_bleu = evaluate_model(model, train_dataset, tokenizer)


In [None]:
# Fine-tuning loop: one optimizer step per batch, recording every batch loss in
# `training_losses` for later plotting.
training_losses = []
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2

# Inputs have to live on the same device as the model weights. The device is
# read from the model itself so this cell does not depend on any other variable.
model_device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    last_loss = None  # stays None when the loader yields no batch
    for batch in train_loader:
        images = batch['image'].to(model_device)
        questions = batch['question']
        # NOTE(review): the answers are read but never given to the model, so no
        # labels reach the forward pass - confirm how `outputs.loss` is produced.
        answers = batch['answer']

        # Encode images and tokenize questions
        encoded_images = model.encode_image(images)
        tokenized_questions = tokenizer(
            questions, padding=True, truncation=True, return_tensors="pt"
        ).to(model_device)

        # Forward pass
        # NOTE(review): assumes the model's forward accepts (encoded images,
        # tokenized text) positionally and returns an object with `.loss` -
        # TODO confirm against the moondream2 modelling code.
        outputs = model(encoded_images, tokenized_questions)
        loss = outputs.loss
        last_loss = loss.item()
        training_losses.append(last_loss)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Report the loss of the final batch; an empty loader has none to report.
    if last_loss is None:
        print(f"Epoch {epoch+1}/{num_epochs} completed with no batches")
    else:
        print(f"Epoch {epoch+1}/{num_epochs} completed with loss: {last_loss}")


In [None]:
# evaluate_model reads one sample per iteration (string question/answer), so it
# is given the dataset itself; iterating train_loader would instead yield
# collated batches of 8 samples.
fine_tuned_accuracy, fine_tuned_bleu = evaluate_model(model, train_dataset, tokenizer)


In [None]:
# Line chart of the per-step losses collected during fine-tuning.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(training_losses, label="Training Loss")
ax.set_title("Training Loss Over Time")
ax.set_xlabel("Training Steps")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Grouped bar chart comparing both metrics before and after fine-tuning.
metrics = ['Accuracy', 'BLEU Score']
baseline_scores = [baseline_accuracy, baseline_bleu]
fine_tuned_scores = [fine_tuned_accuracy, fine_tuned_bleu]

x = np.arange(len(metrics))  # one group per metric
width = 0.35                 # width of a single bar within a group

fig, ax = plt.subplots(figsize=(10, 5))
# Shift each series half a bar left/right so the pair is centred on its tick.
ax.bar(x - width/2, baseline_scores, width, label='Before Fine-Tuning')
ax.bar(x + width/2, fine_tuned_scores, width, label='After Fine-Tuning')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')  # the value axis was previously unlabelled
ax.set_title('Model Performance Before and After Fine-Tuning')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

plt.show()