In [25]:
#!pip install google-cloud-aiplatform google-cloud-storage google-auth google-auth-oauthlib jsonlines bert_score sacrebleu

!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.10.1 sacrebleu-2.4.3


In [38]:
import os
import json
import jsonlines
import time
import requests
import vertexai
from vertexai.preview.tuning import sft
from google.colab import drive
from google.cloud import storage
from google.cloud import aiplatform
from google.oauth2 import service_account
from vertexai.generative_models import GenerativeModel
from bert_score import score
import random
from sacrebleu.metrics import CHRF
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import re
from bs4 import BeautifulSoup, Tag
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize
from nltk import download as nltk_download
import tinycss2

In [3]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Defining Service Account, Input Data, Output/Training Data Paths

In [None]:
# Define paths
service_account_key_path = '' # Path to service account key
input_json_path = '' # Path to input JSON file
training_jsonl_output_path = '' # Path to training JSONL file
test_jsonl_output_path = '' # Path to test JSONL file

In [5]:
# Set the environment variable for service account credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_key_path

#### Define Project ID, Region, and GCS Bucket URI

In [None]:
# Define Project ID, Region, and GCS Bucket URI
PROJECT_ID = ""  # Replace with your Google Cloud Project ID
REGION = "us-central1"  # Desired region for Vertex AI
BUCKET_NAME = f"{PROJECT_ID}-sft-gemini-demo"
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [7]:
# Initialize Vertex AI with credentials from environment variable
vertexai.init(project=PROJECT_ID, location=REGION)

####  Converting and preparing Training data

In [8]:
# Define desired prompts for test set
desired_prompts = [
    "Create a header for a portfolio website with a focus on personal branding using a minimalist and modern look. Integrate HTML components like a headline, description, and call-to-action buttons, along with social media links. Use CSS for cohesive typography, vibrant color highlights, and a user-centered layout. Aim for visual hierarchy, intuitive navigation, and a responsive design to enhance user engagement.",
    "Create a header of a Professional Portfolio Website focusing on a creative and vibrant layout. Utilize HTML for structured sections like About Me, Services, and buttons such as Hire Me and Download CV. Style with CSS to emphasize a modern UI, incorporating color gradients, rounded buttons, and clear typography, enhancing readability and user interaction while ensuring visual consistency.",
    "Create a header of a fashion e-commerce website, featuring a clean, modern look. Include an announcement bar, logo, navigation menu for New Arrivals, Swimwear, Collections, About Us, and Demos, a search icon, account link, and cart icon. Use responsive design, bold typography, intuitive layout, and strategically placed call-to-action buttons to enhance user experience and engagement.",
    "Create a header of a Portfolio Website for a Freelance UI/UX Designer with a modern and visually dynamic look. Implement hero imagery, impactful typography, clear call to action buttons, and a well-structured introduction using HTML and CSS. Emphasize visual hierarchy, white space, and responsive design, adhering to UI/UX principles to enhance user engagement and accessibility.",
    "Create a header of a Creative Agency Website, emphasizing a modern and minimalist design. Utilize HTML and CSS to structure a responsive navigation bar with bold branding, clear call-to-actions, and intuitive menu links. Implement visually striking elements, balanced whitespace, and consistent typography to enhance user engagement while adhering to UI/UX best practices for accessibility and seamless interaction.",
    "Create a header of an eCommerce website under the \"Products\" category, featuring a modern and clean look with navigation links (Home, Recent Products, Brands, Contact, About), a search bar, cart icon, and branding. Include all HTML and CSS components to ensure responsiveness and intuitive UI/UX, optimizing for user engagement and seamless shopping experience."
]

In [9]:
# Split dataset into training and test sets
def split_dataset(input_json_path, training_jsonl_output_path, test_jsonl_output_path, desired_prompts):
    with open(input_json_path, 'r') as f:
        original_data = json.load(f)

    training_data = []
    test_data = []

    for example in original_data:
        if example["instruction"] in desired_prompts:
            test_data.append(example)
        else:
            training_data.append(example)

    # Save training data to JSONL
    with jsonlines.open(training_jsonl_output_path, 'w') as writer:
        for example in training_data:
            writer.write({
                "messages": [
                    {"role": "user", "content": example["instruction"]},
                    {"role": "model", "content": example["output"]}
                ]
            })

    # Save test data to JSONL
    with jsonlines.open(test_jsonl_output_path, 'w') as writer:
        for example in test_data:
            writer.write({
                "messages": [
                    {"role": "user", "content": example["instruction"]},
                    {"role": "model", "content": example["output"]}
                ]
            })

    print(f"Training data saved to {training_jsonl_output_path}. Total examples: {len(training_data)}")
    print(f"Test data saved to {test_jsonl_output_path}. Total examples: {len(test_data)}")

# Run the function to split the dataset
split_dataset(input_json_path, training_jsonl_output_path, test_jsonl_output_path, desired_prompts)

Training data saved to /content/drive/MyDrive/Sem 3/DATA 298 B/Data/training_data_gemini.jsonl. Total examples: 256
Test data saved to /content/drive/MyDrive/Sem 3/DATA 298 B/Data/test_data_gemini.jsonl. Total examples: 7


#### Initializing and Creating bucket

In [10]:
# Initialize GCS client and upload the data
client = storage.Client(project=PROJECT_ID)
bucket = client.bucket(BUCKET_NAME)

# Create bucket if it does not exist
if not bucket.exists():
    bucket.create(location=REGION)
    print(f"Bucket created: {BUCKET_URI}")
else:
    print(f"Bucket already exists: {BUCKET_URI}")

Bucket created: gs://sonorous-pact-405102-sft-gemini-demo


In [None]:
# Upload the training JSONL file to GCS
training_gcs_blob_path = '' # Path to training JSONL file in GCS
training_blob = bucket.blob(training_gcs_blob_path)
training_blob.upload_from_filename(training_jsonl_output_path)
print(f"Training data uploaded to gs://{BUCKET_NAME}/{training_gcs_blob_path}")

# Upload the test JSONL file to GCS (optional)
test_gcs_blob_path = 'fine_tune_data_gemini_test.jsonl'
test_blob = bucket.blob(test_gcs_blob_path)
test_blob.upload_from_filename(test_jsonl_output_path)
print(f"Test data uploaded to gs://{BUCKET_NAME}/{test_gcs_blob_path}")

Training data uploaded to gs://sonorous-pact-405102-sft-gemini-demo/fine_tune_data_gemini_training.jsonl
Test data uploaded to gs://sonorous-pact-405102-sft-gemini-demo/fine_tune_data_gemini_test.jsonl


#### Defining model and Fine Tuning Job

In [12]:
# Define model and training dataset information
MODEL_ID = "gemini-1.0-pro-002"
TRAINING_DATA_URI = f"gs://{BUCKET_NAME}/fine_tune_data_gemini_training.jsonl"  # Update to the specific training data path

# Start the fine-tuning job
sft_tuning_job = sft.train(
    source_model=MODEL_ID,
    train_dataset=TRAINING_DATA_URI
)

# Monitor tuning job progress
print("Starting tuning job...")
try:
    while not sft_tuning_job.has_ended:
        time.sleep(60)
        sft_tuning_job.refresh()
        print("Tuning job in progress...")
except Exception as e:
    print(f"Error during tuning: {e}")

# Display tuning job information after completion
if sft_tuning_job.has_ended:
    print("Tuning job completed.")
    print("Tuning Job Info:", sft_tuning_job.to_dict())
    print(f"Tuned Model name: {sft_tuning_job.tuned_model_name}")
    print(f"Endpoint name: {sft_tuning_job.tuned_model_endpoint_name}")
else:
    print("Tuning job did not complete successfully.")


INFO:vertexai.tuning._tuning:Creating SupervisedTuningJob
INFO:vertexai.tuning._tuning:SupervisedTuningJob created. Resource name: projects/311149115011/locations/us-central1/tuningJobs/4075635445180923904
INFO:vertexai.tuning._tuning:To use this SupervisedTuningJob in another session:
INFO:vertexai.tuning._tuning:tuning_job = sft.SupervisedTuningJob('projects/311149115011/locations/us-central1/tuningJobs/4075635445180923904')
INFO:vertexai.tuning._tuning:View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/4075635445180923904?project=311149115011


Starting tuning job...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...


Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job in progress...
Tuning job completed.
Tuning Job Info: {'name': 'projects/311149115011/locations/us-central1/tuningJobs/4075635445180923904', 'tunedModelDisplayName': 'SupervisedTuningJob 2024-11-12 01:46:45.267799', 'baseModel': 'gemini-1.0-pro-002', 'supervise

#### Generating outputs

# Evaluation

In [None]:
# Define the full model path based on the tuning job info
MODEL_NAME = "" # Define model Endpoint name here

# Load the fine-tuned model
model = GenerativeModel(model_name=MODEL_NAME)

# Test with a prompt similar to your training data
try:
    # Sample prompts from your training data
    prompts = [
      "Create a header for a minimalist personal website, focusing on clean typography and a simple navigation bar.",
      "Design a bold and striking header for a creative agency website, incorporating a strong visual element and a prominent call-to-action.",
      "Develop a functional and user-friendly header for an e-commerce website, including a logo, search bar, shopping cart icon, and clear navigation.",
      "Create a visually appealing header for a blog website, featuring a large hero image, a tagline, and a simple navigation bar.",
      ]

    # Send each prompt to the model and print the generated responses
    for prompt in prompts:
        response = model.generate_content(prompt)
        print(f"Prompt: {prompt}\nGenerated Response:\n{response.text}\n")
        print("-" * 50)

except Exception as e:
    print("Error generating response:", e)

Prompt: Create a header for a minimalist personal website, focusing on clean typography and a simple navigation bar.
Generated Response:
<html>
<head>
<style>
.nav-bar {
  	position: relative;
  	height: 72px;
  	background-color: #fff;
  	display: flex;
  	flex-direction: row;
  	align-items: center;
  	justify-content: center;
  	padding: 0px 5%;
}
.nav {
  	display: flex;
  	flex-direction: row;
  	align-items: center;
  	justify-content: space-between;
  	gap: 40px;
}
.nav-item {
  	font-family: Inter;
  	font-weight: 500;
  	font-size: 16px;
  	display: flex;
  	flex-direction: row;
  	align-items: center;
  	justify-content: flex-start;
  	gap: 12px;
}
.home {
  	position: relative;
  	border-bottom: 1px solid #40b25d;
  	box-sizing: border-box;
  	padding: 14px 0;
}
.logo {
  	position: relative;
  	width: 66px;
  	height: 66px;
  	object-fit: cover;
}


body {
  	margin: 0;
  	line-height: normal;
}
</style>
</head>
<body>
<!DOCTYPE html>
<html>
<head>
  	<meta charset="utf-8">

# Evaluation

In [None]:
# Define the path to your test data JSONL file
test_jsonl_output_path = '' # Path to test JSONL file

# Read and print test data
test_data = []
with jsonlines.open(test_jsonl_output_path, 'r') as reader:
    for obj in reader:
        test_data.append(obj)

# Display the total number of examples in the test data
print(f"Total examples in test data: {len(test_data)}")

# # Print the first few examples in test data to verify content
# for i, example in enumerate(test_data[:5]):  # Adjust range as needed
#     print(f"\nExample {i + 1}:")
#     print("User Prompt:", example['messages'][0]['content'])
#     print("Expected Output:", example['messages'][1]['content'])


Total examples in test data: 7


### Calculating BERTScore

In [None]:
# Define endpoint of the fine-tuned Gemini model
gemini_model_endpoint = ""  # Replace with actual endpoint

# Initialize the Generative Model
model = GenerativeModel(model_name=gemini_model_endpoint)

# Extract reference responses (ground truth) from the test data
reference_responses = [item['messages'][1]['content'] for item in test_data]  # Adjusted index to correctly access 'content'

# Generate model predictions for each item in the test data
model_predictions = []
for item in test_data:
    prompt = item['messages'][0]['content']  # Adjusted to reference the user's prompt

    try:
        # Generate content using the Gemini model
        response = model.generate_content(prompt)
        model_predictions.append(response.text.strip())  # Append generated text
    except Exception as e:
        print("Error generating response:", e)
        model_predictions.append("")  # Append empty string if there's an error

# Calculate BERTScore
P, R, F1 = score(model_predictions, reference_responses, lang="en", rescale_with_baseline=True)

# Display the average F1 score as the similarity metric
average_f1_score = F1.mean().item()
print(f"Average BERTScore F1 for fine-tuned Gemini model: {average_f1_score:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1 for fine-tuned Gemini model: 0.6421


### Evaluating CHRF (CHaRacter-level F-score)

In [None]:
# Initialize the Generative Model with the Gemini endpoint
gemini_model_endpoint = ""  # Replace with actual endpoint
model = GenerativeModel(model_name=gemini_model_endpoint)

# Placeholder lists for model predictions and reference responses
reference_responses = [item['messages'][1]['content'] for item in test_data]

# Set a sample size to limit the number of calls
sample_size = min(6, len(reference_responses))  # Adjust sample size as needed
test_data_sample = random.sample(test_data, sample_size)

# Placeholder for model predictions
model_predictions = []

# Generate predictions for each sampled item
for item in test_data_sample:
    prompt = item['messages'][0]['content']  # Use the user's input

    try:
        # Generate content using the Gemini model
        response = model.generate_content(prompt)
        completion = response.text.strip()
        if completion:
            model_predictions.append(completion)  # Append non-empty predictions
    except Exception as e:
        print("Error generating response:", e)
        model_predictions.append("")  # Append empty string if there's an error

# Filter out empty predictions and align references accordingly
model_predictions = [pred for pred in model_predictions if pred]
reference_responses = reference_responses[:len(model_predictions)]

# Verify lengths for CHRF calculation
assert len(model_predictions) == len(reference_responses), "The lengths of predictions and references do not match."

# Calculate CHRF score
chrf = CHRF()
score = chrf.corpus_score(model_predictions, [reference_responses])
print(f"CHRF Score: {score.score:.4f}")

CHRF Score: 44.1819


### Evaluating BLEU

In [None]:
# Define the endpoint of your fine-tuned Gemini model
gemini_model_endpoint = ""  # Replace with actual endpoint
model = GenerativeModel(model_name=gemini_model_endpoint)

# Placeholder lists for model predictions and reference responses
reference_responses = [item['messages'][1]['content'] for item in test_data]

# Set sample size to limit the number of calls for testing
sample_size = min(6, len(reference_responses))  # Adjust sample size as needed
test_data_sample = random.sample(test_data, sample_size)

# Generate predictions for each sampled item
model_predictions = []
for item in test_data_sample:
    prompt = item['messages'][0]['content']  # Use the user's input

    try:
        # Generate content using the Gemini model
        response = model.generate_content(prompt)
        completion = response.text.strip()
        if completion:
            model_predictions.append(completion)  # Append non-empty predictions
    except Exception as e:
        print("Error generating response:", e)
        model_predictions.append("")  # Append empty string if there's an error

# Filter out empty predictions and align references accordingly
model_predictions = [pred for pred in model_predictions if pred]
reference_responses = reference_responses[:len(model_predictions)]

# Verify lengths for BLEU calculation
assert len(model_predictions) == len(reference_responses), "The lengths of predictions and references do not match."

# Define smoothing function to avoid BLEU score of 0 for short texts
smoothie = SmoothingFunction().method4

# Calculate BLEU scores for each prediction-reference pair
bleu_scores = []
for pred, ref in zip(model_predictions, reference_responses):
    # Tokenize both prediction and reference as lists of words
    pred_tokens = pred.split()
    ref_tokens = ref.split()

    # Calculate BLEU score for this pair with smoothing
    score = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)
    bleu_scores.append(score)

# Calculate the average BLEU score across all samples
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu_score:.4f}")


Average BLEU Score: 0.1531


### Evaluating Ruby

In [None]:
# Download necessary NLTK data
nltk_download('punkt')

# Define the model endpoint for the Gemini model
gemini_model_endpoint = ""  # Replace with actual endpoint

# Initialize the Generative Model
model = GenerativeModel(model_name=gemini_model_endpoint)

# Function to extract relevant criteria from prompt
def extract_relevant_criteria(prompt):
    relevant_criteria = {
        "has_header_container": "header" in prompt.lower(),
        "has_basic_html_structure": "html structure" in prompt.lower(),
        "has_navigation_menu": "navigation" in prompt.lower() or "menu" in prompt.lower(),
        "has_responsive_design": "responsive" in prompt.lower(),
        "includes_css_styling": "css" in prompt.lower() or "style" in prompt.lower(),
        "has_call_to_action": "call-to-action" in prompt.lower() or "button" in prompt.lower(),
        "uses_semantic_tags": "semantic" in prompt.lower() or "structure" in prompt.lower(),
        "no_syntax_errors": True  # Assume BeautifulSoup can parse without errors
    }
    return {k: v for k, v in relevant_criteria.items() if v}

# Success criteria definition for Pass@1 evaluation
def evaluate_pass_at_1(generated_code, relevant_criteria):
    soup = BeautifulSoup(generated_code, 'html.parser')
    results = {}
    if relevant_criteria.get("has_header_container"):
        results["has_header_container"] = bool(soup.find('header') or soup.find('div', {'class': re.compile('header', re.IGNORECASE)}))
    if relevant_criteria.get("has_basic_html_structure"):
        results["has_basic_html_structure"] = all([
            bool(soup.find('html')),
            bool(soup.find('head')),
            bool(soup.find('body'))
        ])
    if relevant_criteria.get("has_navigation_menu"):
        results["has_navigation_menu"] = bool(soup.find('nav') or soup.find('ul') or soup.find_all('a', href=True))
    if relevant_criteria.get("has_responsive_design"):
        results["has_responsive_design"] = '@media' in generated_code or bool(soup.find('meta', {'name': 'viewport'}))
    if relevant_criteria.get("includes_css_styling"):
        results["includes_css_styling"] = '<style>' in generated_code or 'stylesheet' in generated_code or 'style=' in generated_code
    if relevant_criteria.get("has_call_to_action"):
        results["has_call_to_action"] = bool(soup.find('a', {'class': re.compile('button|cta', re.IGNORECASE)}) or soup.find('button'))
    if relevant_criteria.get("uses_semantic_tags"):
        results["uses_semantic_tags"] = any(tag.name in ['header', 'nav', 'section'] for tag in soup.find_all())
    if relevant_criteria.get("no_syntax_errors"):
        results["no_syntax_errors"] = True  # BeautifulSoup parsing indicates no syntax errors

    pass_at_1 = all(results.values())
    return pass_at_1, results

# Additional metrics calculations for RUBY
def calculate_sts_similarity(generated_code, reference_code):
    generated_tokens = word_tokenize(generated_code)
    reference_tokens = word_tokenize(reference_code)
    matcher = SequenceMatcher(None, generated_tokens, reference_tokens)
    return matcher.ratio()

def ruby_metric(generated_code, reference_code):
    sts_score = calculate_sts_similarity(generated_code, reference_code)
    trs_score = sts_score  # Placeholder for TRS in case AST metrics are not used
    combined_score = (sts_score + trs_score) / 2
    return sts_score, trs_score, combined_score

# Function to evaluate Pass@3
def evaluate_pass_at_3(generated_code_variants, relevant_criteria):
    pass_found = False
    results_list = []
    for generated_code in generated_code_variants:
        pass_at_1, results = evaluate_pass_at_1(generated_code, relevant_criteria)
        results_list.append(results)
        if pass_at_1:
            pass_found = True
            break
    return pass_found, results_list

# Main evaluation function with reference and generated code
def evaluate_model(test_data, sample_size=5):
    # Sample a subset of test data for evaluation
    sample_data = random.sample(test_data, sample_size)
    pass_3_count = 0
    sts_scores, trs_scores, combined_scores = [], [], []

    for item in sample_data:
        # Extract prompt, generated_code, and reference_code
        prompt = item['messages'][0]['content']  # The user's request prompt
        generated_code = item['messages'][1]['content']  # Model's generated response
        reference_code = item['messages'][-1]['content']  # Assuming last message has reference code

        relevant_criteria = extract_relevant_criteria(prompt)

        # Generate up to 3 outputs and evaluate each one
        generated_variants = [generated_code]  # Start with the provided generated output
        for _ in range(2):  # Generate additional variations if needed
            try:
                response = model.generate_content(prompt)
                variant = response.text.strip()
                if variant:
                    generated_variants.append(variant)
            except Exception as e:
                print("Error generating response:", e)

        # Pass@3 evaluation
        pass_at_3, criteria_results_list = evaluate_pass_at_3(generated_variants, relevant_criteria)
        if pass_at_3:
            pass_3_count += 1

        # RUBY metric calculation (using the last variant in this case)
        sts_score, trs_score, combined_score = ruby_metric(generated_variants[-1], reference_code)
        sts_scores.append(sts_score)
        trs_scores.append(trs_score)
        combined_scores.append(combined_score)

        # Log each sample's results
        print("-" * 50)
        print(f"Prompt: {prompt}")
        print(f"Pass@3:", "Passed" if pass_at_3 else "Failed")
        print("RUBY Scores - STS:", sts_score, "TRS:", trs_score, "Combined:", combined_score)
        print("Detailed Criteria Results for Variants:")
        for i, results in enumerate(criteria_results_list, 1):
            print(f"  Variant {i}:")
            for criterion, passed in results.items():
                status = "Pass" if passed else "Fail"
                print(f"    {criterion}: {status}")
        print("-" * 50)

    # Calculate averages and Pass@3 rate
    avg_sts_score = sum(sts_scores) / len(sts_scores) if sts_scores else 0
    avg_trs_score = sum(trs_scores) / len(trs_scores) if trs_scores else 0
    avg_combined_score = sum(combined_scores) / len(combined_scores) if combined_scores else 0
    pass_at_3_rate = (pass_3_count / len(sample_data)) * 100 if sample_data else 0

    print(f"Pass@3 Rate for {sample_size} samples: {pass_at_3_rate:.2f}%")
    print(f"Average RUBY STS Score: {avg_sts_score}")
    print(f"Average RUBY TRS Score: {avg_trs_score}")
    print(f"Average RUBY Combined Score: {avg_combined_score}")

# Run evaluation on test_data with a sample size of 6
evaluate_model(test_data, sample_size=6)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


--------------------------------------------------
Prompt: Create a header of a Portfolio Website for a Freelance UI/UX Designer with a modern and visually dynamic look. Implement hero imagery, impactful typography, clear call to action buttons, and a well-structured introduction using HTML and CSS. Emphasize visual hierarchy, white space, and responsive design, adhering to UI/UX principles to enhance user engagement and accessibility.
Pass@3: Failed
RUBY Scores - STS: 0.16374822527212493 TRS: 0.16374822527212493 Combined: 0.16374822527212493
Detailed Criteria Results for Variants:
  Variant 1:
    has_header_container: Fail
    has_responsive_design: Pass
    includes_css_styling: Pass
    has_call_to_action: Fail
    uses_semantic_tags: Fail
    no_syntax_errors: Pass
  Variant 2:
    has_header_container: Fail
    has_responsive_design: Fail
    includes_css_styling: Pass
    has_call_to_action: Fail
    uses_semantic_tags: Fail
    no_syntax_errors: Pass
  Variant 3:
    has_header

In [None]:
# Download necessary NLTK data
nltk_download('punkt')

# Define the model endpoint for the Gemini model
gemini_model_endpoint = ""  # Replace with actual endpoint

# Initialize the Generative Model
model = GenerativeModel(model_name=gemini_model_endpoint)

# Function to extract relevant criteria from prompt
def extract_relevant_criteria(prompt):
    relevant_criteria = {
        "has_header_container": "header" in prompt.lower(),
        "has_basic_html_structure": "html structure" in prompt.lower(),
        "has_navigation_menu": "navigation" in prompt.lower() or "menu" in prompt.lower(),
        "has_responsive_design": "responsive" in prompt.lower(),
        "includes_css_styling": "css" in prompt.lower() or "style" in prompt.lower(),
        "has_call_to_action": "call-to-action" in prompt.lower() or "button" in prompt.lower(),
        "uses_semantic_tags": "semantic" in prompt.lower() or "structure" in prompt.lower(),
        "no_syntax_errors": True  # Assume BeautifulSoup can parse without errors
    }
    return {k: v for k, v in relevant_criteria.items() if v}

# Success criteria definition for Pass@1 evaluation
def evaluate_pass_at_1(generated_code, relevant_criteria):
    soup = BeautifulSoup(generated_code, 'html.parser')
    results = {}
    if relevant_criteria.get("has_header_container"):
        results["has_header_container"] = bool(soup.find('header') or soup.find('div', {'class': re.compile('header', re.IGNORECASE)}))
    if relevant_criteria.get("has_basic_html_structure"):
        results["has_basic_html_structure"] = all([
            bool(soup.find('html')),
            bool(soup.find('head')),
            bool(soup.find('body'))
        ])
    if relevant_criteria.get("has_navigation_menu"):
        results["has_navigation_menu"] = bool(soup.find('nav') or soup.find('ul') or soup.find_all('a', href=True))
    if relevant_criteria.get("has_responsive_design"):
        results["has_responsive_design"] = '@media' in generated_code or bool(soup.find('meta', {'name': 'viewport'}))
    if relevant_criteria.get("includes_css_styling"):
        results["includes_css_styling"] = '<style>' in generated_code or 'stylesheet' in generated_code or 'style=' in generated_code
    if relevant_criteria.get("has_call_to_action"):
        results["has_call_to_action"] = bool(soup.find('a', {'class': re.compile('button|cta', re.IGNORECASE)}) or soup.find('button'))
    if relevant_criteria.get("uses_semantic_tags"):
        results["uses_semantic_tags"] = any(tag.name in ['header', 'nav', 'section'] for tag in soup.find_all())
    if relevant_criteria.get("no_syntax_errors"):
        results["no_syntax_errors"] = True  # BeautifulSoup parsing indicates no syntax errors

    pass_at_1 = all(results.values())
    return pass_at_1, results

# Additional metrics calculations for RUBY
def calculate_sts_similarity(generated_code, reference_code):
    generated_tokens = word_tokenize(generated_code)
    reference_tokens = word_tokenize(reference_code)
    matcher = SequenceMatcher(None, generated_tokens, reference_tokens)
    return matcher.ratio()

def ruby_metric(generated_code, reference_code):
    sts_score = calculate_sts_similarity(generated_code, reference_code)
    trs_score = sts_score  # Placeholder for TRS in case AST metrics are not used
    combined_score = (sts_score + trs_score) / 2
    return sts_score, trs_score, combined_score

# Function to evaluate Pass@3
def evaluate_pass_at_3(generated_code_variants, relevant_criteria):
    pass_found = False
    results_list = []
    for generated_code in generated_code_variants:
        pass_at_1, results = evaluate_pass_at_1(generated_code, relevant_criteria)
        results_list.append(results)
        if pass_at_1:
            pass_found = True
            break
    return pass_found, results_list

# Main evaluation function with reference and generated code
def evaluate_model(test_data, sample_size=5):
    # Sample a subset of test data for evaluation
    sample_data = random.sample(test_data, sample_size)
    pass_1_count = 0
    pass_3_count = 0
    sts_scores, trs_scores, combined_scores = [], [], []

    for item in sample_data:
        # Extract prompt, generated_code, and reference_code
        prompt = item['messages'][0]['content']  # The user's request prompt
        generated_code = item['messages'][1]['content']  # Model's generated response
        reference_code = item['messages'][-1]['content']  # Assuming last message has reference code

        relevant_criteria = extract_relevant_criteria(prompt)

        # Pass@1 evaluation
        pass_at_1, criteria_results = evaluate_pass_at_1(generated_code, relevant_criteria)
        if pass_at_1:
            pass_1_count += 1

        # Log Pass@1 results
        print("-" * 50)
        print(f"Prompt: {prompt}")
        print("Pass@1:", "Passed" if pass_at_1 else "Failed")
        print("Pass@1 Criteria Results:", criteria_results)

        # Generate up to 3 outputs and evaluate each one for Pass@3
        generated_variants = [generated_code]  # Start with the provided generated output
        for _ in range(2):  # Generate additional variations if needed
            try:
                response = model.generate_content(prompt)
                variant = response.text.strip()
                if variant:
                    generated_variants.append(variant)
            except Exception as e:
                print("Error generating response:", e)

        # Pass@3 evaluation
        pass_at_3, criteria_results_list = evaluate_pass_at_3(generated_variants, relevant_criteria)
        if pass_at_3:
            pass_3_count += 1

        # RUBY metric calculation (using the last variant in this case)
        sts_score, trs_score, combined_score = ruby_metric(generated_variants[-1], reference_code)
        sts_scores.append(sts_score)
        trs_scores.append(trs_score)
        combined_scores.append(combined_score)

        # Log each sample's results for Pass@3
        print(f"Pass@3:", "Passed" if pass_at_3 else "Failed")
        print("RUBY Scores - STS:", sts_score, "TRS:", trs_score, "Combined:", combined_score)
        print("Detailed Criteria Results for Variants:")
        for i, results in enumerate(criteria_results_list, 1):
            print(f"  Variant {i}:")
            for criterion, passed in results.items():
                status = "Pass" if passed else "Fail"
                print(f"    {criterion}: {status}")
        print("-" * 50)

    # Calculate averages and Pass@3 rate
    avg_sts_score = sum(sts_scores) / len(sts_scores) if sts_scores else 0
    avg_trs_score = sum(trs_scores) / len(trs_scores) if trs_scores else 0
    avg_combined_score = sum(combined_scores) / len(combined_scores) if combined_scores else 0
    pass_at_1_rate = (pass_1_count / len(sample_data)) * 100 if sample_data else 0
    pass_at_3_rate = (pass_3_count / len(sample_data)) * 100 if sample_data else 0

    # Summary of results
    print(f"Pass@1 Rate for {sample_size} samples: {pass_at_1_rate:.2f}%")
    print(f"Pass@3 Rate for {sample_size} samples: {pass_at_3_rate:.2f}%")
    print(f"Average RUBY STS Score: {avg_sts_score}")
    print(f"Average RUBY TRS Score: {avg_trs_score}")
    print(f"Average RUBY Combined Score: {avg_combined_score}")

# Run evaluation on test_data with a sample size of 6
evaluate_model(test_data, sample_size=6)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


--------------------------------------------------
Prompt: Create a header of a Portfolio Website for a Freelance UI/UX Designer with a modern and visually dynamic look. Implement hero imagery, impactful typography, clear call to action buttons, and a well-structured introduction using HTML and CSS. Emphasize visual hierarchy, white space, and responsive design, adhering to UI/UX principles to enhance user engagement and accessibility.
Pass@1: Failed
Pass@1 Criteria Results: {'has_header_container': False, 'has_responsive_design': True, 'includes_css_styling': True, 'has_call_to_action': False, 'uses_semantic_tags': False, 'no_syntax_errors': True}
Pass@3: Failed
RUBY Scores - STS: 0.22718052738336714 TRS: 0.22718052738336714 Combined: 0.22718052738336714
Detailed Criteria Results for Variants:
  Variant 1:
    has_header_container: Fail
    has_responsive_design: Pass
    includes_css_styling: Pass
    has_call_to_action: Fail
    uses_semantic_tags: Fail
    no_syntax_errors: Pass
  