In [10]:
!git clone https://github.com/rakibulnahin/nlp_project.git

Cloning into 'nlp_project'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 82 (delta 6), reused 80 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 6.11 MiB | 28.32 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [2]:
!rm -rf /kaggle/working/*

In [3]:
%%capture
!pip install -U transformers accelerate peft bitsandbytes datasets


In [4]:
import huggingface_hub
token = "token"
huggingface_hub.login(token)

In [5]:
import torch
from torch import nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import (
    CLIPVisionModel, CLIPImageProcessor,
    LlamaForCausalLM, LlamaTokenizer,
    AutoProcessor, TrainingArguments,
    Trainer, AutoTokenizer
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from PIL import Image
import pandas as pd
from transformers.utils import logging
import os
logging.set_verbosity_error()  # Suppress too much logging
from tqdm import tqdm


2025-06-17 01:17:46.336117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750123066.547514      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750123066.609128      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Augmentation

In [16]:
# ----- Augmentaion------
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),       # Randomly crop and resize to 224x224
    transforms.RandomHorizontalFlip(),      # Randomly flip the image horizontally
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1), # Randomly change brightness, etc.
    # Add more transforms as needed, e.g., transforms.RandomRotation, transforms.GaussianBlur
    transforms.RandomGrayscale(p=0.5),
    transforms.ToTensor(), # Convert PIL Image to Tensor (C, H, W)
    # The image_processor will handle the final normalization based on its model's requirements
    # If your image_processor *doesn't* handle normalization, you'd add it here:
    # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize(256),             # Resize the smaller edge to 256
    transforms.CenterCrop(224),         # Crop the center to 224x224
    transforms.ToTensor(),              # Convert PIL Image to Tensor
    # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # If not handled by processor
])

# ---- CONFIG ----
image_encoder_name = "openai/clip-vit-base-patch32"
llm_name = "meta-llama/Llama-2-7b-chat-hf"
confident_pth = "/kaggle/working/nlp_project/my_dataset/confident_feedback.csv"
scared_pth = "/kaggle/working/nlp_project/my_dataset/scared_feedback.csv"
image_root = "/kaggle/working/nlp_project/my_dataset"
max_length = 128
batch_size = 4
num_epochs = 10
lr = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vision Encoder

In [7]:
# ---- Load Vision Encoder ----
vision_encoder = CLIPVisionModel.from_pretrained(image_encoder_name).to(device)
vision_encoder.eval().requires_grad_(False)
vision_processor = CLIPImageProcessor.from_pretrained(image_encoder_name)

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

# Llama Tokenizer + Model

In [8]:
# ---- Load Tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(llm_name)
tokenizer.pad_token = tokenizer.eos_token

# ---- Load LLaMA Language Model with LoRA + 4-bit ----
language_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    load_in_4bit=True
)
language_model.gradient_checkpointing_enable()
language_model = prepare_model_for_kbit_training(language_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
language_model = get_peft_model(language_model, lora_config)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Dataset

In [17]:
# ---- Dataset ----
class ImageTextDataset(Dataset):
    def __init__(self, csv_path, emotion, image_processor, tokenizer, max_length=128, transformer=None):
        self.data = pd.read_csv(csv_path)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.emotion = emotion
        self.transformer = transformer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(image_root, row['image'])
        image = Image.open(image_path).convert("RGB")

        if self.transformer:
            image = self.transformer(image)

        image_tensor = self.image_processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)

        full_label = f'you look {self.emotion}. {row["feedback"]}'
        label_encoding = self.tokenizer(full_label, padding='max_length', truncation=True,
                                        max_length=self.max_length, return_tensors='pt')

        return {
            "pixel_values": image_tensor,
            "input_ids": label_encoding["input_ids"].squeeze(0),
            "attention_mask": label_encoding["attention_mask"].squeeze(0)
        }

def custom_collate(features):
    return {
        "pixel_values": torch.stack([f["pixel_values"] for f in features]),
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features])
    }

# ---- Load Data ----
scared_dataset = ImageTextDataset(scared_pth, "scared", vision_processor, tokenizer, max_length, train_transform)
confident_dataset = ImageTextDataset(confident_pth, "confident", vision_processor, tokenizer, max_length, train_transform)
dataset = ConcatDataset([scared_dataset, confident_dataset, scared_dataset, confident_dataset, scared_dataset, confident_dataset])
print(len(dataset))
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

165


# Training

In [18]:


# ---- Projector ----
projector = nn.Sequential(
    nn.Linear(vision_encoder.config.hidden_size, language_model.config.hidden_size),
    nn.Tanh()
).to(device)

optimizer = torch.optim.AdamW(list(language_model.parameters()) + list(projector.parameters()), lr=lr)

# ---- Training Loop ----
language_model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            vision_outputs = vision_encoder(pixel_values=pixel_values)
            image_embeds = vision_outputs.last_hidden_state.mean(dim=1)

        prefix_embeds = projector(image_embeds).unsqueeze(1)
        text_embeds = language_model.model.model.embed_tokens(input_ids)
        inputs_embeds = torch.cat([prefix_embeds, text_embeds], dim=1)

        # Adjust attention and labels
        prefix_attention = torch.ones((input_ids.size(0), 1), device=device)
        combined_attention_mask = torch.cat([prefix_attention, attention_mask], dim=1)

        labels = input_ids.clone()
        labels = torch.cat([torch.full((labels.size(0), 1), -100, dtype=torch.long, device=device), labels[:, :-1]], dim=1)

        # Truncate to max length
        inputs_embeds = inputs_embeds[:, :max_length, :]
        combined_attention_mask = combined_attention_mask[:, :max_length]
        labels = labels[:, :max_length]

        outputs = language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=combined_attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        pbar.set_postfix(loss=loss.item())

    print(f"✅ Epoch {epoch+1} complete | Avg Loss: {running_loss / len(train_loader):.4f}")


  return fn(*args, **kwargs)
Epoch 1/10: 100%|██████████| 42/42 [04:05<00:00,  5.86s/it, loss=1.62] 


✅ Epoch 1 complete | Avg Loss: 5.3453


Epoch 2/10: 100%|██████████| 42/42 [04:10<00:00,  5.96s/it, loss=0.664]


✅ Epoch 2 complete | Avg Loss: 0.8710


Epoch 3/10: 100%|██████████| 42/42 [04:12<00:00,  6.01s/it, loss=0.665]


✅ Epoch 3 complete | Avg Loss: 0.7818


Epoch 4/10: 100%|██████████| 42/42 [04:16<00:00,  6.10s/it, loss=0.675]


✅ Epoch 4 complete | Avg Loss: 0.6949


Epoch 5/10: 100%|██████████| 42/42 [04:16<00:00,  6.10s/it, loss=0.62] 


✅ Epoch 5 complete | Avg Loss: 0.5974


Epoch 6/10: 100%|██████████| 42/42 [04:14<00:00,  6.06s/it, loss=0.416]


✅ Epoch 6 complete | Avg Loss: 0.4999


Epoch 7/10: 100%|██████████| 42/42 [04:11<00:00,  5.99s/it, loss=0.331]


✅ Epoch 7 complete | Avg Loss: 0.4322


Epoch 8/10: 100%|██████████| 42/42 [04:11<00:00,  5.98s/it, loss=0.333]


✅ Epoch 8 complete | Avg Loss: 0.3817


Epoch 9/10: 100%|██████████| 42/42 [04:11<00:00,  5.98s/it, loss=0.369]


✅ Epoch 9 complete | Avg Loss: 0.3319


Epoch 10/10: 100%|██████████| 42/42 [04:11<00:00,  5.98s/it, loss=0.306]

✅ Epoch 10 complete | Avg Loss: 0.2804





# Generate Feedback

In [23]:
def generate_feedback(image_path, max_new_tokens=30):
    image = Image.open(image_path).convert("RGB")
    image = val_transform(image)
    pixel_value = vision_processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0).to(device)
    vision_encoder.eval()
    language_model.eval()

    with torch.no_grad():
        vision_feat = vision_encoder(pixel_value.unsqueeze(0)).last_hidden_state.mean(dim=1)
        prefix_embed = projector(vision_feat).unsqueeze(1)

        # --- FIX STARTS HERE ---
        # Prime the model with the expected starting phrase, e.g., "you look"
        # The model will then try to complete this based on the image embedding
        # You might need to experiment with what exact phrase works best.
        # "you look" implies it should follow with an emotion.
        # If you want it to directly start with feedback, you'd need to train it that way.
        prompt_text = "you look" # Or "The person looks" depending on your desired output
        input_ids = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
        # Add special tokens=False to avoid adding <s> again if you're concatenating.
        # If you want <s> at the very beginning of the whole sequence, ensure your training includes it,
        # otherwise, keep add_special_tokens=True for the first tokenized part.
        # For simplicity, let's assume we want "you look" to be the very start of the text part.

        token_embed = language_model.model.model.embed_tokens(input_ids)
        input_embed = torch.cat([prefix_embed, token_embed], dim=1)

        output_ids = language_model.generate(
            inputs_embeds=input_embed,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=1.0,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output, starting after the initial prompt you provided.
    # The output_ids will include the tokens for "you look" that you fed in.
    # So, you need to skip those when decoding or use a slice if you generated from the very beginning.
    # A simpler way is to just decode everything and then clean it up.
    # For now, let's decode the whole thing and strip the prompt, if it's included.
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # If the generation includes the prompt text, you might want to remove it from the beginning
    # For example, if output is "you look confident. Keep going!", and your prompt was "you look",
    # you might want to ensure the prompt part isn't duplicated if the model generates it.
    # However, for now, the model will likely continue from where "you look" leaves off.
    return decoded_output


# --- Example Usage ---
test_image = "/kaggle/working/nlp_project/my_dataset/scared/21.png"  # Replace with your image path
result = generate_feedback(test_image)
print("\n🧠 Generated Feedback:")
print(result)


🧠 Generated Feedback:
surely. you look scared. let us talk about something else.


In [24]:
import os


In [26]:
path = "/kaggle/working/nlp_project/my_dataset/confident"
images = os.listdir(path)
responses = []
for image in images:
    result = generate_feedback(path+"/"+image)
    responses.append(result)

In [27]:
responses

['hopefully. you look strong.',
 '✿',
 'You look concerned.Юный добры, горазда..',
 'Википедии',
 'you look scared, be brave.Љ',
 "lets do something that scares you, but doesn't make you any less confident.",
 "everyone looks worried right now., you are doing the best you can, let's trust in the process you're trying your best",
 'you look scared.and worried.now take a deep breath.',
 'you look scared.’ you do not need to be scared.',
 'nobody is perfect. you are doing a great job.',
 'nobody is perfect. you are doing great.',
 'gaben',
 'nobody is perfect. a good start counts, too.',
 'его. you look great.',
 'nobody is perfect. Your contribution is valuable.',
 "држа, relax, don't be nervous, just be yourself, confidence is being sure of yourself",
 'ätz, a balanced heart, you are not alone.Љ',
 'you look scared.',
 'државе',
 "you look worry if you can't relax, you don",
 "hopefully together.you look scared.let's keep going.",
 'lets how do you feel?',
 'gaben',
 'you look scared. Y

In [28]:
path = "/kaggle/working/nlp_project/my_dataset/scared"
images = os.listdir(path)
responses = []
for image in images:
    print(image)
    result = generate_feedback(path+"/"+image)
    responses.append(result)

7.png
5.png
14.png
9.png
13.png
6.png
28.png
17.png
2.png
1.png
4.png
26.png
20.png
3.png
15.png
24.png
18.png
10.png
21.png
29.png
22.png
16.png
8.png
25.png
19.png
27.png
23.png
12.png
11.png


In [29]:
responses

["you look scared. You look worried. It's okay to take a break. Take a moment to gather yourself.",
 'lets you in.injustice does not define you.when did you last laugh?',
 'hopefully your own efforts are valuable.Ще глобал за стелом.',
 'его. You look great.',
 'броја',
 'округу, just relax.com.',
 'everybody looks tired.it is okay to feel the way you feel',
 'nobody pretends perfection',
 'nobody is perfect. you are doing the best you can.',
 'ätz\nJune 27, 2021 · 10:44 AM',
 'савез',
 'ätz lang double quote',
 'You look scared.Learn more about yourself and your goals with a psychology visit.',
 'you look scared.you look concerned.you look worried.you look unsure.',
 'You look scared.գ  You look worried. Try to focus on your breath for a few moments and let go of the t',
 "You look scared. Cynical or sarcastic answers aren't the right ones. Keep it real.",
 'ätz lang double quarter long',
 'савез',
 'nobody is perfect. You are doing a great job.',
 'hopefully, trust yourself',
 'you l

In [None]:
# Save the LoRA-adapted LLaMA model
language_model.save_pretrained("/content/saved_model/vlm_lora_llama2")

# Save the projector layer separately
torch.save(projector.state_dict(), "/content/vlm_projector.pt")


In [None]:
from peft import PeftModel

# Load LLaMA base in 4-bit and apply LoRA weights
base_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_4bit=True,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, "/content/vlm_lora_llama2").eval()

# Load projector
projector = nn.Sequential(
    nn.Linear(vision_encoder.config.hidden_size, model.config.hidden_size),
    nn.Tanh()
).to(device)
projector.load_state_dict(torch.load("/content/vlm_projector.pt", map_location=device))
