In [2]:
ABL_NUM = 0
ABL = "abl"+str(ABL_NUM)
model_name = "Salesforce/blip2-opt-2.7b"
model_save_path = f"Finetuned_{ABL}"
print(f"Currently Running Ablation {ABL_NUM} on Model {model_name}")

Currently Running Ablation 0 on Model Salesforce/blip2-opt-2.7b


In [4]:

import os
import pickle
import requests
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
processor = AutoProcessor.from_pretrained(model_name)




In [3]:

model = AutoModelForVisualQuestionAnswering.from_pretrained(model_name,torch_dtype=torch.float16)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
torch.cuda.empty_cache()
torch.manual_seed(42)
# config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     lora_dropout=0.05,
#     bias="none",
#     target_modules=["q_proj", "k_proj"]
# )
# model = get_peft_model(model, config)
# model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<torch._C.Generator at 0x15551aa0b6b0>

In [4]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, dataset, processor, ablation):
        self.dataset = dataset
        self.processor = processor
        self.ablation = ablation

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        emotion = self.dataset[idx]['emotion']
        sentiment = self.dataset[idx]['sentiment']
        sarcasm = self.dataset[idx]['sarcasm']
        if self.ablation == 'abl0':
            question = f"What complaint is conveyed by the disappointed and frustrated user in the video?"
        elif self.ablation == 'abl1':  # Emotion only
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed?"
        elif self.ablation == 'abl2':  # Sentiment only
            question = f"What complaint is conveyed by the user in the video, considering the {sentiment} sentiment?"
        elif self.ablation == 'abl3':  # Sarcasm only
            question = f"What complaint is conveyed by the user in the video, noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl4':  # Emotion + Sentiment
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and the {sentiment} sentiment?"
        elif self.ablation == 'abl5':  # Sentiment + Sarcasm
            question = f"What complaint is conveyed by the user in the video, considering the {sentiment} sentiment and noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl6':  # Emotion + Sarcasm
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl7':  # All
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and the {sentiment} sentiment. Additionally, note if the complaint is being conveyed in a {sarcasm} sarcastic manner."
        else:
            raise ValueError("Invalid ablation type. Choose from abl1 to abl7.")
        answer = self.dataset[idx]['answer']
        image = self.dataset[idx]['image']
        if isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            image = Image.open(image).convert("RGB")
        encoding = self.processor( image, question, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        labels = self.processor.tokenizer.encode( answer, max_length=256, padding="max_length", truncation=True, return_tensors='pt')

        encoding["labels"] = labels
        for k, v in encoding.items():
            encoding[k] = v.squeeze()
        return encoding
dataset = load_dataset("cerelac2/consumer-complaint-vqa")
train_dataset = dataset['train']
split_datasets = train_dataset.train_test_split(test_size=0.05)
train_split = split_datasets['train']
valid_split = split_datasets['test']
train_dataset = VQADataset(dataset=train_split, processor=processor, ablation=ABL)
valid_dataset = VQADataset(dataset=valid_split, processor=processor, ablation=ABL)
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


In [10]:
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=False)
num_epochs = 4
patience = 10
min_eval_loss = float("inf")
early_stopping_hook = 0
tracking_information = []
scaler = torch.cuda.amp.GradScaler()
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Training epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_masked = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            epoch_loss += loss.item()
        scaler.scale(loss).backward()
        # scaler.step(optimizer)
        # scaler.update()
        optimizer.zero_grad()
    model.eval()
    eval_loss = 0
    for batch in tqdm(valid_dataloader, desc=f"Validating epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_masked = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            eval_loss += loss.item()
    tracking_information.append((epoch_loss / len(train_dataloader), eval_loss / len(valid_dataloader), optimizer.param_groups[0]["lr"]))
    print(f"Epoch {epoch+1} - Training loss: {epoch_loss / len(train_dataloader)} - Eval Loss: {eval_loss / len(valid_dataloader)} - LR: {optimizer.param_groups[0]['lr']}")
    scheduler.step()
    if eval_loss < min_eval_loss:
        model.save_pretrained(model_save_path)
        print(f"Saved model to {model_save_path}")
        min_eval_loss = eval_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered")
            break
    torch.cuda.empty_cache()

pickle.dump(tracking_information, open(f"tracking_information_{ABL}.pkl", "wb"))
print("Finetuning complete!")

Training epoch 1: 100%|██████████| 161/161 [01:30<00:00,  1.78it/s]
Validating epoch 1: 100%|██████████| 9/9 [00:03<00:00,  2.92it/s]


Epoch 1 - Training loss: 7.676483302382949 - Eval Loss: 7.620656119452582 - LR: 4e-05
Saved model to Finetuned_abl0


Training epoch 2: 100%|██████████| 161/161 [01:29<00:00,  1.79it/s]
Validating epoch 2: 100%|██████████| 9/9 [00:02<00:00,  3.21it/s]


Epoch 2 - Training loss: 7.673102171524711 - Eval Loss: 7.620656119452582 - LR: 3.6e-05


Training epoch 3: 100%|██████████| 161/161 [01:30<00:00,  1.78it/s]
Validating epoch 3: 100%|██████████| 9/9 [00:02<00:00,  3.18it/s]


Epoch 3 - Training loss: 7.660724995299156 - Eval Loss: 7.620656119452582 - LR: 3.24e-05


Training epoch 4: 100%|██████████| 161/161 [01:30<00:00,  1.78it/s]
Validating epoch 4: 100%|██████████| 9/9 [00:02<00:00,  3.25it/s]


Epoch 4 - Training loss: 7.6649066232006 - Eval Loss: 7.620656119452582 - LR: 2.9160000000000002e-05
Finetuning complete!


In [9]:
import csv
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import BlipForQuestionAnswering
dataset = load_dataset("cerelac2/consumer-complaint-vqa")

class VQATestDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, processor, ablation):
        self.dataset = dataset
        self.processor = processor
        self.ablation = ablation

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        video_id = self.dataset[idx]['ID']
        emotion = self.dataset[idx]['emotion']
        sentiment = self.dataset[idx]['sentiment']
        sarcasm = self.dataset[idx]['sarcasm']
        if self.ablation == 'abl0':
            question = f"What complaint is conveyed by the disappointed and frustrated user in the video?"
        elif self.ablation == 'abl1':  # Emotion only
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed?"
        elif self.ablation == 'abl2':  # Sentiment only
            question = f"What complaint is conveyed by the user in the video, considering the {sentiment} sentiment?"
        elif self.ablation == 'abl3':  # Sarcasm only
            question = f"What complaint is conveyed by the user in the video, noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl4':  # Emotion + Sentiment
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and the {sentiment} sentiment?"
        elif self.ablation == 'abl5':  # Sentiment + Sarcasm
            question = f"What complaint is conveyed by the user in the video, considering the {sentiment} sentiment and noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl6':  # Emotion + Sarcasm
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and noting if it is conveyed in a {sarcasm} sarcastic manner?"
        elif self.ablation == 'abl7':  # All
            question = f"What complaint is conveyed by the user in the video, considering the {emotion} expressed and the {sentiment} sentiment. Additionally, note if the complaint is being conveyed in a {sarcasm} sarcastic manner."
        else:
            raise ValueError("Invalid ablation type. Choose from abl1 to abl7.")

        answer = self.dataset[idx]['answer']
        image = self.dataset[idx]['image']
        
        # Load the image if it's a path
        if isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            image = Image.open(image).convert("RGB")
        
        encoding = self.processor(images=image, text=question, return_tensors="pt", padding="max_length", truncation=True,max_length=128)
        
        return encoding, answer, video_id

# Initialize dataset, model, and evaluation
test_dataset = VQATestDataset(dataset=dataset['test'], processor=processor, ablation=ABL)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)  # Set batch size according to your needs
finetuned_model =  AutoModelForVisualQuestionAnswering.from_pretrained(model_save_path)
finetuned_model.to("cuda")

predicted_answers = []
ground_truths = []
video_ids = []

print("Evaluating on Test Set:")
finetuned_model.eval()  # Set model to evaluation mode

with torch.no_grad():
    for batch in tqdm(test_loader):
        encodings, answers, ids = batch
        inputs = {k: v.squeeze(1).to(finetuned_model.device) for k, v in encodings.items()}
        outputs = finetuned_model.generate(**inputs,max_length=51)
        for i, output in enumerate(outputs):
            predicted_answer = processor.decode(output, skip_special_tokens=True)
            predicted_answers.append(predicted_answer)
            ground_truths.append(answers[i])
            video_ids.append(ids[i])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating on Test Set:


  0%|          | 0/16 [00:00<?, ?it/s]


ValueError: Input length of input_ids is 0, but `max_length` is set to -78. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [6]:
csv_filename = f"blip2_{ABL}_all_results.csv"

with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["video_id", "predicted_answer", "ground_truth"])
    
    for video_id, pred, gt in zip(video_ids, predicted_answers, ground_truths):
        writer.writerow([video_id, pred, gt])

print(f"Results saved to {csv_filename}")

Results saved to blip2_abl0_all_results.csv


In [7]:
import csv
import pandas as pd
import evaluate
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from textstat import textstat
from nltk.translate.meteor_score import meteor_score
from moverscore import get_idf_dict, word_mover_score
import torch
import numpy as np
input_csv = f"blip2_{ABL}_all_results.csv"
data = pd.read_csv(input_csv)
predicted_answers = data["predicted_answer"].tolist()
ground_truths = data["ground_truth"].tolist()
video_ids = data["video_id"].tolist()

# Load the models and metrics
rouge_metric = evaluate.load("rouge")
bert_score_metric = evaluate.load("bertscore")
meteor_metric = evaluate.load("meteor")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define helper functions
def calculate_bleu(pred, ref):
    smoothing = SmoothingFunction().method1
    return [
        sentence_bleu([ref], pred, weights=(1, 0, 0, 0), smoothing_function=smoothing),
        sentence_bleu([ref], pred, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing),
        sentence_bleu([ref], pred, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing)
    ]

def calculate_perplexity(text):
    inputs = gpt2_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt2_model(**inputs, labels=inputs["input_ids"])
        log_likelihood = outputs.loss.item()
    return torch.exp(torch.tensor(log_likelihood))

def calculate_jaccard(pred, ref):
    pred_set = set(pred.split())
    ref_set = set(ref.split())
    intersection = len(pred_set.intersection(ref_set))
    union = len(pred_set.union(ref_set))
    return intersection / union if union != 0 else 0

# Calculate individual metrics
idf_dict_hyp = get_idf_dict(predicted_answers)
idf_dict_ref = get_idf_dict(ground_truths)
mover_scores = word_mover_score(ground_truths, predicted_answers, idf_dict_ref, idf_dict_hyp, stop_words=[], n_gram=1, remove_subwords=True)

rouge_scores = rouge_metric.compute(predictions=predicted_answers, references=ground_truths)
bleu_scores = [calculate_bleu(pred, ref) for pred, ref in zip(predicted_answers, ground_truths)]
bert_scores = bert_score_metric.compute(predictions=predicted_answers, references=ground_truths, lang="en")
flesch_scores = [textstat.flesch_reading_ease(text) for text in predicted_answers]
coleman_liau_scores = [textstat.coleman_liau_index(text) for text in predicted_answers]
perplexity_scores = [calculate_perplexity(pred) for pred in predicted_answers]
meteor_scores = [meteor_score([ref.split()], pred.split()) for pred, ref in zip(predicted_answers, ground_truths)]
hamming_distances = [sum(el1 != el2 for el1, el2 in zip(pred, ref)) for pred, ref in zip(predicted_answers, ground_truths)]
jaccard_similarities = [calculate_jaccard(pred, ref) for pred, ref in zip(predicted_answers, ground_truths)]

metrics_summary = {
    "ROUGE-1": np.mean(rouge_scores["rouge1"]),
    "ROUGE-2": np.mean(rouge_scores["rouge2"]),
    "ROUGE-L": np.mean(rouge_scores["rougeL"]),
    "BLEU-1": np.mean([score[0] for score in bleu_scores]),
    "BLEU-2": np.mean([score[1] for score in bleu_scores]),
    "BLEU-3": np.mean([score[2] for score in bleu_scores]),
    "BERT Score": np.mean(bert_scores['f1']),
    "FLESCH Readability Ease Score": np.mean(flesch_scores),
    "Coleman-Liau Readability Score": np.mean(coleman_liau_scores),
    "Perplexity Score": np.mean(perplexity_scores),
    "Meteor Score": np.mean(meteor_scores),
    "Mover Score": np.mean(mover_scores),
    "Hamming Distance": np.mean(hamming_distances),
    "Jaccard Similarity": np.mean(jaccard_similarities)
}

summary_csv = f"metrics_summary_blip2_{ABL}.csv"
with open(summary_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Metric", "Score"])
    for metric, score in metrics_summary.items():
        writer.writerow([metric, score])
print(f"Metrics summary saved to {summary_csv}")
qualitative_data = []
for idx in range(len(predicted_answers)):
    qualitative_data.append({
        "video_id": video_ids[idx],
        "predicted_answer": predicted_answers[idx],
        "ground_truth": ground_truths[idx],
        "ROUGE-1": rouge_scores["rouge1"][idx] if isinstance(rouge_scores["rouge1"], list) else rouge_scores["rouge1"],
        "ROUGE-2": rouge_scores["rouge2"][idx] if isinstance(rouge_scores["rouge2"], list) else rouge_scores["rouge2"],
        "ROUGE-L": rouge_scores["rougeL"][idx] if isinstance(rouge_scores["rougeL"], list) else rouge_scores["rougeL"],
        "BLEU-1": bleu_scores[idx][0],
        "BLEU-2": bleu_scores[idx][1],
        "BLEU-3": bleu_scores[idx][2],
        "BERT Score": bert_scores['f1'][idx] if isinstance(bert_scores['f1'], list) else bert_scores['f1'],
        "FLESCH Readability Ease Score": flesch_scores[idx],
        "Coleman-Liau Readability Score": coleman_liau_scores[idx],
        "Perplexity Score": perplexity_scores[idx],
        "Meteor Score": meteor_scores[idx],
        "Mover Score": mover_scores[idx],
        "Hamming Distance": hamming_distances[idx],
        "Jaccard Similarity": jaccard_similarities[idx]
    })

sorted_qualitative_data = sorted(qualitative_data, key=lambda x: x["Mover Score"], reverse=True)
qualitative_csv = f"qualitative_analysis_blip2_{ABL}.csv"
with open(qualitative_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=sorted_qualitative_data[0].keys())
    writer.writeheader()
    writer.writerows(sorted_qualitative_data)

print(f"Qualitative analysis saved to {qualitative_csv}")


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sarmistha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sarmistha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/sarmistha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
hugging

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import torch
import torch.nn.Functional as F
import csv
import os
from tqdm.notebook import tqdm


In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf").to("cuda")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Sample prompt for chat format
prompt = "You are a legal assistant in India, answer the query. I have a tech comany which stores user data without permission, which indian cyber law and sections applicable to it will be applicable on me?. Give them in details, also provide any previous cases on these."
# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Use .to("cuda") for GPU inference


In [15]:
# Generate a response from the model
output = model.generate(
    **inputs,
    max_new_tokens=1500,       # Limits the length of the response
    temperature=0.7,          # Controls creativity (higher = more random)
    top_p=0.9,                # Limits sampling to the most probable tokens
    do_sample=True            # Enables sampling (for more varied output)
)

# Decode and print the output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


You are a legal assistant in India, answer the query. I have a tech comany which stores user data without permission, which indian cyber law and sections applicable to it will be applicable on me?. Give them in details, also provide any previous cases on these.

My friend, you are in a bit of a sticky situation here. As a legal assistant in India, I must inform you that storing user data without permission is a violation of Indian cyber law and can lead to serious legal consequences.

Under the Indian IT Act of 2000, Section 43(A) states that a body corporate, which is responsible for the violation of privacy of any person, shall be punishable with imprisonment for a term which may extend to three years, or with fine which may extend to one lakh rupees, or with both.

Furthermore, the Indian Cyber Appellate Tribunal (ICAT) has established that the right to privacy is a fundamental right under the Indian Constitution, and any violation of this right can lead to legal action.

In a landm