In [None]:
import pandas as pd 
# Read the baseline VLM responses CSV and preview the first rows.
baseline_csv_path = "/kaggle/input/vlmdata/VLM_responses_Baseline.csv"
VLMresponses = pd.read_csv(baseline_csv_path)
VLMresponses.head()

In [None]:
# Read the CSV of training responses and preview the first rows.
training_csv_path = "/kaggle/input/vlmdata/VLM__training_responses.csv"
VLMtrainResponses = pd.read_csv(training_csv_path)
VLMtrainResponses.head()

In [None]:
# Drop every row whose "Accurate" value is FALSE.
# pandas may parse a TRUE/FALSE column as booleans or leave it as strings
# depending on the CSV contents. A boolean False never equals the string
# "FALSE", so compare on a normalised upper-case text form to make the filter
# work for both dtypes. Missing values are kept, as before.
is_marked_false = (
    VLMresponses["Accurate"].astype(str).str.strip().str.upper() == "FALSE"
)
VLMresponses = VLMresponses[~is_marked_false]
VLMresponses.head()

In [None]:
# Remove the failure-reason column and renumber the rows from 0 so positional
# and label indexing agree after the filtering above.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop() would raise KeyError.
VLMresponses = (
    VLMresponses
    .drop(columns="Why it Failed?", errors="ignore")
    .reset_index(drop=True)
)
VLMresponses.head()

In [None]:
# Show one example (row at position 3): the user prompt followed by the
# assistant response on the next line.
example_row = VLMresponses.iloc[3]
text = example_row["UserPrompt"] + "\n" + example_row["AssistantResponse"]
print(text)

In [None]:
from datasets import Dataset

# Wrap both DataFrames as Hugging Face `Dataset` objects:
# `dataset` from the filtered baseline responses, `datasetTraining` from the
# training responses.
dataset, datasetTraining = (
    Dataset.from_pandas(VLMresponses),
    Dataset.from_pandas(VLMtrainResponses),
)


In [None]:
# Sanity check: confirm the object type, then print the column schema of
# both datasets.
print(type(dataset))
for hf_dataset in (dataset, datasetTraining):
    print(hf_dataset.features)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration

# Setup: choose the GPU when one is available, then load the BART-base
# tokenizer and seq2seq model and move the model onto that device.
checkpoint_name = "facebook/bart-base"

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BartTokenizer.from_pretrained(checkpoint_name)
model = BartForConditionalGeneration.from_pretrained(checkpoint_name)
model = model.to(device)


In [None]:
from sklearn.model_selection import train_test_split


# Pandas view of the baseline dataset, kept for ad-hoc inspection.
df = dataset.to_pandas()

# No random split is performed here: the training responses are used as the
# training set and the filtered baseline responses as the test set.
train_data, test_data = datasetTraining, dataset

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset producing tokenized (input, label) pairs for seq2seq training.

    Each item pairs the "AssistantResponse" text (prefixed with
    "Instruction: ") with its "SemanticParser" target.

    Args:
        data: Mapping-like object indexable by column name; must provide the
            "AssistantResponse" and "SemanticParser" columns, each indexable
            by integer position.
        tokenizer: Callable tokenizer returning "input_ids" and
            "attention_mask" tensors of shape (1, max_length).
        max_input_len: Length inputs are padded/truncated to.
        max_target_len: Length targets are padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_input_len=512, max_target_len=512):
        self.inputs = data["AssistantResponse"]
        self.targets = data["SemanticParser"]
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Tokenize example `idx`.

        Returns:
            dict with 1-D tensors "input_ids", "attention_mask" and "labels".
            Padded label positions are set to -100 so the loss skips them.
        """
        input_text = "Instruction: " + self.inputs[idx]
        target_text = self.targets[idx]

        source = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target = self.tokenizer(
            target_text,
            max_length=self.max_target_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Targets are padded to a fixed length. Without masking, the
        # cross-entropy loss is averaged over pad positions too, so the model
        # is mostly rewarded for predicting padding. -100 is the ignore index
        # used by the Hugging Face seq2seq loss.
        labels = target["input_ids"].squeeze(0).clone()
        labels[target["attention_mask"].squeeze(0) == 0] = -100

        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse a
        # length-1 sequence dimension.
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels
        }

# Build the training dataset and its shuffled DataLoader.
# A dedicated name is used instead of rebinding `dataset`: that name already
# holds the Hugging Face evaluation dataset, and overwriting it would make a
# re-run of the earlier `test_data = dataset` cell silently evaluate on the
# training data.
train_dataset = MyDataset(train_data, tokenizer)
loader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [None]:
# from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW

# --- Fine-tuning configuration -------------------------------------------
optimizer = AdamW(model.parameters(), lr=2e-5)  # small LR keeps fine-tuning stable

epochs = 10   # upper bound on passes over the training data
patience = 3  # stop after this many consecutive epochs without improvement

# Gradients from this many mini-batches are accumulated before each optimizer
# step, emulating a batch grad_accum_steps times larger than the loader's.
grad_accum_steps = 2

best_loss = float("inf")
epochs_no_improve = 0

num_batches = len(loader)
if num_batches == 0:
    raise ValueError("Training DataLoader is empty; nothing to fine-tune on.")

# --- Training loop ---------------------------------------------------------
model.train()
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}")
    epoch_loss = 0.0
    optimizer.zero_grad()  # never carry stale gradients into a new epoch

    for i, batch in enumerate(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # Log the unscaled loss so the reported average is the true per-batch
        # loss; only the value used for backprop is divided for accumulation.
        loss = outputs.loss
        epoch_loss += loss.item()
        (loss / grad_accum_steps).backward()

        # Step on every full accumulation group and on the final batch, so a
        # trailing partial group is applied instead of leaking into the next
        # epoch.
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % grad_accum_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_loss = epoch_loss / num_batches
    print(f"Avg Loss: {avg_loss:.4f}")

    # Early stopping.
    # NOTE(review): this monitors the training loss, since no validation set
    # is used here; consider holding out data for a more reliable signal.
    if avg_loss < best_loss:
        best_loss = avg_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model.pt")  # Save best
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping!")
            break

# Reload the best checkpoint so later cells use the best weights rather than
# whatever the final epoch left behind. Skipped if no checkpoint was written
# (e.g. the loss was NaN from the first epoch).
if best_loss < float("inf"):
    model.load_state_dict(torch.load("best_model.pt", map_location=device))

print("Training done!")



In [None]:
import ast  # fallback for Python-literal style output (single quotes, True/None)
import json


def _parse_prediction(text):
    """Turn a decoded model output into a Python object when possible.

    Parsing starts at the first "{" in `text`. Strict JSON is tried first, so
    `true`/`false`/`null` and trailing text after the object are handled, and
    `ast.literal_eval` is the fallback for Python-literal output.

    Args:
        text: Decoded model output.

    Returns:
        The parsed object, or `text` unchanged when it contains no "{" or
        cannot be parsed.
    """
    json_start = text.find("{")
    if json_start == -1:
        return text

    try:
        # raw_decode parses one JSON value and ignores whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, json_start)
        return parsed
    except (ValueError, RecursionError):
        pass  # not strict JSON; try the Python-literal form below

    try:
        return ast.literal_eval(text[json_start:])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing output: {e}")
        return text


# One entry per row of test_data, in order, so the list can be attached to
# VLMresponses as a new column.
parser_response = []

model.eval()

for input_text in test_data["AssistantResponse"]:
    if not input_text:
        parser_response.append(None)  # keep the length aligned
        continue

    # Use the same prefix the model saw during fine-tuning ("Instruction: ");
    # a different prompt at inference time puts the input off-distribution.
    prompt = "Instruction: " + str(input_text)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        # NOTE(review): training targets allow up to 512 tokens, so
        # max_length=256 may cut long outputs short -- confirm.
        output_ids = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Prompt: {input_text}")
    print(f"Prediction: {output}\n")

    parser_response.append(_parse_prediction(output))

# Add to DataFrame
VLMresponses["Seq2SeqParser"] = parser_response


In [None]:
# Preview the first five rows, now including the "Seq2SeqParser" column.
VLMresponses.head(n=5)

In [None]:
# Write the results to the Kaggle working directory, without the index column.
output_csv_path = "/kaggle/working/Seq2SeqParsingOutput.csv"
VLMresponses.to_csv(output_csv_path, index=False)