In [None]:
from io import StringIO
from google import genai
from google.genai.types import CreateBatchJobConfig, GenerateContentConfig, Part
from google.cloud import storage
from tqdm import tqdm
from dotenv import load_dotenv
import boto3
import csv
import fsspec
import json
import jsonlines
import os
import pandas as pd
import time
import random

In [None]:
# Load variables from a local .env file so the os.getenv(...) calls further down
# (PROJECT_ID, LOCATION, BUCKET_NAME) can pick them up.
load_dotenv()

# Sampling data

In [None]:
# S3 client shared by every S3 listing and read in this notebook.
s3 = boto3.client("s3")
# Bucket listed for the "raw" prefixes of raw_clean_map.
raw_bucket_name = "israllm-datasets"
# Bucket listed for the "clean" prefixes; samples are read from here.
clean_bucket_name = "gepeta-datasets"
# Key prefix shared by the "clean" locations in raw_clean_map.
prefix = "processed_and_cleaned"
# Number of rows requested per dataset by the sampling loop.
n_samples = 1_000

In [None]:
# Per-dataset lookup table used by sample_dataset():
#   "raw"         - key prefix listed in raw_bucket_name (only when both=True)
#   "clean"       - key prefix listed in clean_bucket_name
#   "source_name" - substring an object key must contain to be kept; when it is a
#                   list, every element must appear in the key
# NOTE(review): "YifatDataBatch3-Round5" points at ".../YifatDataBatch2-Round5/" -
# confirm this matches the actual bucket layout.
raw_clean_map = {"AllHebNLI": {"raw": "raw-datasets/nli/csv_output/", "clean": f"{prefix}/AllHebNLI/", 'source_name': 'AllHebNLI'},
                 "AllOfHEOscarData-Combined-Deduped-DC4.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/AllOfHEOscarData", 'source_name': 'AllOfHEOscarData-Combined-Deduped-DC4.forgpt'},
                 "AllTzenzuraData-Combined-Deduped-DC4.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/AllTzenzuraData", 'source_name': 'AllTzenzuraData-Combined-Deduped-DC4.forgpt'},
                 "BooksNLI2-Combined-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/BooksNLI2", 'source_name': 'BooksNLI2-Combined-Deduped.forgpt'},
                 "GeektimeCorpus-Combined-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/GeektimeCorpus", 'source_name': 'GeektimeCorpus-Combined-Deduped.forgpt'},
                 "hebrew_tweets_text_clean_full-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/hebrew_tweets", 'source_name': 'hebrew_tweets_text_clean_full-Deduped.forgpt'},
                 "HeC4DictaCombined-Clean-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/HeC4DictaCombined", 'source_name': 'HeC4DictaCombined-Clean-Deduped.forgpt'},
                 "YifatDataBatch2-Round3-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/YifatDataBatch2/", 'source_name': 'YifatDataBatch2-Round3-Deduped.forgpt'},
                 "YifatDataRound2-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/YifatDataRound2", 'source_name': 'YifatDataRound2-Deduped.forgpt'},
                 "YifatToCombine-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/YifatToCombine", 'source_name': 'YifatToCombine-Deduped.forgpt'},
                 "YisraelHayomData-Combined-Deduped.forgpt": {"raw": "raw-datasets/rar/csv_output/", "clean": f"{prefix}/YisraelHayomData", 'source_name': 'YisraelHayomData-Combined-Deduped.forgpt'},
                 "FineWeb2": {"raw": "raw-datasets/fineweb2", "clean": f"{prefix}/FineWeb2/", 'source_name': 'batch'},
                 "HeC4": {"raw": "raw-datasets/HeC4", "clean": f"{prefix}/HeC4-HF", 'source_name': 'part'},
                 "SupremeCourtOfIsrael": {"raw": "raw-datasets/SupremeCourtOfIsrael/text_extraction/", "clean": f"{prefix}/SupremeCourtOfIsrael", 'source_name': 'batch'},
                 "YifatDataBatch2-Round4": {"raw": "raw-datasets/Yifat4+5/csv_output", "clean": f"{prefix}/YifatDataBatch2-Round4", 'source_name': 'YifatDataBatch2-Round4'},
                 "YifatDataBatch3-Round5": {"raw": "raw-datasets/Yifat4+5/csv_output", "clean": f"{prefix}/YifatDataBatch2-Round5/", 'source_name': 'YifatDataBatch3-Round5'},
                 "OcrTau": {"raw": "tau_clean/", "clean": f"{prefix}/TauOCR", 'source_name': ["HQ", 'Markdown_Table']},
                 "TauDigital": {"raw": "tau_clean/", "clean": f"{prefix}/TauDigital", 'source_name': 'DigitalMarkdown_Tables'},
                 "BIU": {"raw": "raw-datasets/biu-drive/", "clean": f"{prefix}/BIU", 'source_name': 'AllBIUDriveDocs-MD-Deduped.forgpt'},
                 "sefaria": {"raw": "raw-datasets/sefaria/", "clean": f"{prefix}/sefaria", 'source_name': 'AllOfSefariaTexts'}}

In [None]:
# Number of datasets configured for sampling.
len(raw_clean_map)

In [None]:
def read_data_from_s3(bucket_name, object_key):
    """
    Download one S3 object and parse it into a pandas DataFrame.

    The object is decoded as UTF-8. Keys ending in ``.csv`` are parsed with
    ``pd.read_csv``; keys ending in ``.jsonl`` are parsed line by line with
    ``json.loads`` (blank lines are ignored). The extension check is
    case-insensitive.

    Args:
        bucket_name (str): Bucket that holds the object
        object_key (str): S3 object key

    Returns:
        pd.DataFrame: The parsed data, or an empty DataFrame when the file type
        is unsupported or any error occurs (the error is printed, not raised)
    """
    try:
        s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
        raw_text = s3_object['Body'].read().decode('utf-8')

        key_lower = object_key.lower()
        is_csv = key_lower.endswith('.csv')
        is_jsonl = key_lower.endswith('.jsonl')

        if not (is_csv or is_jsonl):
            print(f"Unsupported file type for {object_key}")
            return pd.DataFrame()

        if is_csv:
            frame = pd.read_csv(StringIO(raw_text))
        else:
            # One JSON document per non-blank line
            records = []
            for raw_line in raw_text.strip().split('\n'):
                if raw_line.strip():
                    records.append(json.loads(raw_line))
            frame = pd.DataFrame(records)

        print(f"Successfully read {object_key} with {len(frame)} rows")
        return frame

    except Exception as e:
        print(f"Error reading {object_key}: {e}")
        return pd.DataFrame()
    

def get_random_samples(df, n_samples=1):
    """
    Draw up to ``n_samples`` random rows from a DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame
        n_samples (int): Number of rows requested

    Returns:
        pd.DataFrame: An empty DataFrame when ``df`` is empty or ``n_samples``
        is not positive; a copy of ``df`` when at least as many rows are
        requested as exist; otherwise ``n_samples`` rows sampled without
        replacement
    """
    nothing_to_sample = df.empty or n_samples <= 0
    if nothing_to_sample:
        return pd.DataFrame()

    available_rows = len(df)
    if n_samples < available_rows:
        # Sample without replacement, seeded from the global `random` state
        seed = random.randint(0, 10000)
        return df.sample(n=n_samples, random_state=seed)

    # Asked for everything (or more): hand back all rows
    print(f"Requested {n_samples} samples but only {available_rows} rows available")
    return df.copy()
def distribute_samples_across_files(data_objects, n_samples, distribution_method='uniform'):
    """
    Decide how many samples each file should contribute.

    Args:
        data_objects (list): List of data file keys
        n_samples (int): Total number of samples to collect
        distribution_method (str): Allocation strategy
            - 'uniform': as even as possible; the first files absorb the remainder
            - 'random': every sample is assigned to a randomly chosen file
            - 'weighted_random': files get shares proportional to random weights,
              the last file receives whatever is left

    Returns:
        dict: Mapping of file_key -> number_of_samples ({} when there are no files)

    Raises:
        ValueError: If ``distribution_method`` is not one of the names above
    """
    if not data_objects:
        return {}

    file_count = len(data_objects)

    if distribution_method == 'uniform':
        base_share, leftover = divmod(n_samples, file_count)
        # The first `leftover` files each take one extra sample
        sample_distribution = {
            file_key: base_share + (1 if position < leftover else 0)
            for position, file_key in enumerate(data_objects)
        }

    elif distribution_method == 'random':
        sample_distribution = dict.fromkeys(data_objects, 0)
        for _ in range(n_samples):
            sample_distribution[random.choice(data_objects)] += 1

    elif distribution_method == 'weighted_random':
        weights = [random.random() for _ in data_objects]
        weight_sum = sum(weights)

        sample_distribution = {}
        handed_out = 0
        # Every file except the last gets its rounded-down proportional share
        for file_key, weight in zip(data_objects[:-1], weights):
            share = int((weight / weight_sum) * n_samples)
            sample_distribution[file_key] = share
            handed_out += share

        # The last file takes the rest so the total is exactly n_samples
        sample_distribution[data_objects[-1]] = n_samples - handed_out

    else:
        raise ValueError(f"Unknown distribution method: {distribution_method}")

    # Report only the files that actually received samples
    print(f"Sample distribution ({distribution_method}):")
    for file_key in sample_distribution:
        count = sample_distribution[file_key]
        if count > 0:
            print(f"  {file_key}: {count} samples")

    return sample_distribution
def _list_s3_keys(bucket_name, key_prefix):
    """Return every object key under ``key_prefix``, following S3 pagination."""
    # A single list_objects_v2 call returns at most 1000 keys, so page through.
    paginator = s3.get_paginator("list_objects_v2")
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key_prefix):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    return keys


def _key_matches_source(object_key, source_name):
    """True when the key contains ``source_name`` (or every element of it, if a list)."""
    if isinstance(source_name, (list, tuple)):
        return all(part in object_key for part in source_name)
    return source_name in object_key


def sample_dataset(dataset_name, n_samples, both=False, max_retries=3):
    """
    Sample up to n_samples rows from one randomly chosen cleaned file of a dataset.

    Args:
        dataset_name (str): Key of raw_clean_map
        n_samples (int): Number of rows to request
        both (bool): When True, the raw location is listed as well and the
            raw/clean file counts are compared (log output only; raw rows are
            not returned)
        max_retries (int): Maximum number of sampling attempts per file

    Returns:
        pd.DataFrame: Sampled rows plus '_source_file' and '_sample_id' columns.
        An empty DataFrame is returned when no matching file exists or no row
        could be sampled.
    """
    dataset_cfg = raw_clean_map[dataset_name]
    source_name = dataset_cfg["source_name"]

    raw_keys = []
    if both:
        raw_keys = _list_s3_keys(raw_bucket_name, dataset_cfg["raw"])
        if not raw_keys:
            print(f"Didn't find raw for {dataset_name}")
            return pd.DataFrame()
        # source_name may be a list, so use the same matcher as for clean keys
        raw_keys = [key for key in raw_keys if _key_matches_source(key, source_name)]

    clean_keys = _list_s3_keys(clean_bucket_name, dataset_cfg["clean"])
    if not clean_keys:
        print(f"Didn't find clean for {dataset_name}")
        return pd.DataFrame()

    clean_keys = [key for key in clean_keys if _key_matches_source(key, source_name)]
    if not clean_keys:
        # random.choice would raise IndexError on an empty list
        print(f"No clean files matching {source_name!r} for {dataset_name}")
        return pd.DataFrame()

    if both:
        if len(raw_keys) == len(clean_keys):
            print(f"Same number of files for raw and clean for {dataset_name}")
        else:
            print(f"The number of files for raw and clean in {dataset_name} differs")

    # All samples are taken from a single randomly chosen clean file.
    sample_distribution = {random.choice(clean_keys): n_samples}
    sampled_data = []
    sample_id = 1

    print(f"Starting to collect {n_samples} samples...")

    for file_key, target_samples in tqdm(sample_distribution.items()):
        if target_samples == 0:
            continue

        print(f"Collecting {target_samples} samples from: {file_key}")

        retries = 0
        collected_from_file = 0

        while collected_from_file < target_samples and retries < max_retries:
            try:
                # read_data_from_s3 reports its own errors and returns an empty frame
                df = read_data_from_s3(clean_bucket_name, file_key)

                if df.empty:
                    print(f"File {file_key} is empty, skipping...")
                    break

                # How many samples are still needed from this file
                remaining_samples = target_samples - collected_from_file
                sampled_rows = get_random_samples(df, remaining_samples)

                if not sampled_rows.empty:
                    # Tag every sampled row with its origin and a running id
                    for row_dict in sampled_rows.to_dict("records"):
                        row_dict['_source_file'] = file_key
                        row_dict['_sample_id'] = sample_id

                        sampled_data.append(row_dict)
                        sample_id += 1
                        collected_from_file += 1

                    print(f"Successfully collected {len(sampled_rows)} samples from {file_key}")
                    break  # Successfully got samples, move to next file

                print(f"No samples collected from {file_key} on attempt {retries + 1}")

            except Exception as e:
                print(f"Error sampling from {file_key}: {e}")

            retries += 1

        if collected_from_file < target_samples:
            print(f"Only collected {collected_from_file}/{target_samples} samples from {file_key}")

    if len(sampled_data) == 0:
        print("No valid samples collected")
        return pd.DataFrame()

    result_df = pd.DataFrame(sampled_data)
    print(f"Successfully collected {len(result_df)}/{n_samples} total samples")

    return result_df

In [None]:
# Sample every configured dataset and store each sample as its own CSV file.
output_dir = "./outputs/super_benchmark"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

for key in raw_clean_map.keys():
    print(f"Dataset: {key}")
    sample_df = sample_dataset(key, n_samples=n_samples, both=False)
    # sample_dataset may hand back None or an empty frame when nothing was sampled
    if sample_df is None or sample_df.empty:
        print(f"No samples for {key}, skipping CSV export")
        continue
    sample_df.to_csv(f"{output_dir}/{key}_random_{n_samples}.csv")

# Creating batch

In [None]:
# Google Cloud project and region, read from the environment.
PROJECT_ID = os.getenv("PROJECT_ID")
LOCATION = os.getenv("LOCATION")

# Printed as a quick check; os.getenv returns None for unset variables.
print(PROJECT_ID, LOCATION)

# google-genai client in Vertex AI mode; used below to create and poll the batch job.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

In [None]:
# Cloud Storage client and bucket handle; the request JSONL is uploaded to this bucket.
storage_client = storage.Client(project=PROJECT_ID)
BUCKET_NAME = os.getenv("BUCKET_NAME")
bucket = storage_client.bucket(BUCKET_NAME)

In [None]:
def count_jsonl_lines(file_path):
    """
    Count the lines of a JSONL file (one request per line).

    Args:
        file_path (str): Path to the file

    Returns:
        int: Number of lines in the file
    """
    # Explicit UTF-8: the requests contain Hebrew text, and the platform default
    # encoding (e.g. cp1252 on Windows) can fail to decode it.
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

## Regex

In [None]:
# System instruction sent with every request.
# Raw string on purpose: the rule descriptions mention \r, \n and \u00A0 literally.
# In a normal string Python would turn them into real control characters and the
# model would receive a garbled description of the cleaning rules.
system_prompt = r"""
You are an AI assistant tasked with analyzing the results of a text cleaning process for Hebrew language datasets.
We are building an LLM for Hebrew, and our current phase involves cleaning training data using a specific set of rules implemented in our code. 
Your role is to evaluate the outcome of this cleaning process by evaluating a sample of a cleaned text.

### Your Evaluation Task:
For the provided clean_text (text after cleaning), you must perform a thorough analysis. Please assess the following:

1. Overall Quality Score: Provide a general quality score for the clean_text after all the applied cleanings. [Score]
2. Comments: Offer specific observations, insights, and notes regarding the cleaning outcome.
3. Best Practice Recommendations: Suggest what appears to be most appropriate or effective, considering the cleaning rules.

### Data Cleaning Rules to Verify (as implemented by our code):
- HTML Escape Codes Replacement:
-- Replaces HTML escape codes like &quot;, &#34; with ".
-- Replaces &#39; with '.
-- Note: These replacements are explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- HTML Tags and CSS Removal:
-- Removes <style>...</style> blocks (content and tags).
-- Removes common HTML tags (e.g., <html>, <body>, <div>, <p>, <a>, <table>, <img>, headings, list items, etc.).
-- Note: This removal is explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- Newline and Whitespace Handling:
-- Removes empty lines (lines containing only whitespace) at the start/end of the line.
-- Replaces carriage returns (\r or \r\n) with a single newline (\n).
-- Replaces more than 3 consecutive newlines (\n{4,}) with exactly 3 newlines (\n\n\n).
-- Note: These operations are explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- Multiple Space Normalization:
-- Adjusts sequences of multiple spaces. While the code is complex, the info suggests it replaces multiple spaces to their "lower 4 multiplier" (e.g., 10 spaces to 8, 7 spaces to 4) up to a maximum of 16 spaces.
-- It also strips leading/trailing spaces from lines and removes extra spaces between words (reducing multiple spaces to single spaces).
-- Note: This normalization is explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- PII Deletion (IP and Email Addresses):
-- IP Addresses: Replaces IP addresses (e.g., 192.168.1.1) with <IP>, except for 127.0.0.1 and 0.0.0.0.
-- Email Addresses: Replaces email addresses (e.g., user@example.com) with <EMAIL>.
-- Note: These operations are explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- Long Separator Line Removal:
-- Removes entire lines that consist only of hyphens (-), equals signs (=), asterisks (*), underscores (_), tildes (~), or common bullet/block symbols (like •, ●, ■, ▪, ◆, ◦), followed by whitespace.
-- Note: This removal is explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- Hebrew Diacritics (Nikud) and Formatting Character Removal:
-- Removes Hebrew diacritics (Nikud) from U+0591 to U+05C7, specifically excluding U+05BE (Makaf, the Hebrew hyphen).
-- Replaces non-breaking spaces (\u00A0) with regular spaces ().
-- Removes unicode control characters for text directionality (Left-to-Right Mark, Right-to-Left Mark, Pop Directional Formatting).
-- Note: These operations are explicitly excluded from occurring within <MARKDOWN_TABLE> tags.

- Internal Markdown Table Tag Removal:
-- Removes the internal <MARKDOWN_TABLE> and </MARKDOWN_TABLE> tags that are used to protect table content from other cleaning rules.
-- Note: This is the final step for tables, implying that tables themselves are converted to markdown before these tags are removed, and protected during the process.

For each Score use the following rubric:
5 - Perfect cleaning. The text is cleaned fully according to the rules, is coherent and fully ready for LLM training.
4 - Very good cleaning. The text is cleaned mostly according to the rules, is coherent with minor disruprencies, but is ready for LLM training.
3 - Good cleaning. The text is cleaned according to the rules, but some rules have been ommited. The text is mostly coherent with some issues. The text is permissible for LLM training overall.
2 - OK cleaning. The text is cleaned according to some rules. It's somewhat coherent but should be used with caution.
1 - Unacceptable cleaning. The text is not cleaned according to the rules. The text is not coherent and is not acceptable for LLM training as is.

You will be provided with a clean_text. Analyze it against these rules and provide your feedback in the requested format.
"""

In [None]:
# User-turn preamble; each request places the sample text right after it.
prompt = (
    "\n"
    "Provide the answer in a JSON format.\n"
    "Note: some samples are too large and were truncated at the end.\n"
    "\n"
    "Here is a sample to evaluate:\n"
)

In [None]:
# Structured-output schema placed in the generation config of every request:
# an object with a 1-5 integer score and two free-text fields, all required.
response_schema = dict(
    type="OBJECT",
    properties=dict(
        score=dict(
            type="INTEGER",
            minimum=1,
            maximum=5,
            description="Rating score from 1 (unacceptable) to 5 (perfect)",
        ),
        comments=dict(
            type="STRING",
            description="Detailed feedback and observations",
        ),
        recommendations=dict(
            type="STRING",
            description="Actionable suggestions for improvement",
        ),
    ),
    required=["score", "comments", "recommendations"],
)

In [None]:
# Benchmark inputs: keep only CSV files (os.listdir also returns folders such as
# .ipynb_checkpoints and stray files that pd.read_csv cannot parse) and sort them
# so the request order is reproducible.
# NOTE(review): the sampling loop writes to ./outputs/super_benchmark - confirm that
# ./outputs/benchmark is the intended input directory.
keys = sorted(
    name for name in os.listdir("./outputs/benchmark")
    if name.lower().endswith(".csv")
)
print(len(keys))

In [None]:
# Name of the batch request file; also used as the blob name when uploading.
JSON_FILENAME = "overall_benchmark_evaluation.jsonl"
# Local path the requests are written to.
jsonl_fn = "./outputs/jsonls/" + JSON_FILENAME
print(jsonl_fn)

In [None]:
# Model the batch job is created for.
MODEL_ID = "gemini-2.5-flash"
# Maximum number of characters of a sample sent to the model; longer texts are truncated.
thresh = 10_000

In [None]:
# Generation settings attached to every request in the batch file.
generationConfig = {"temperature": 0, "maxOutputTokens": 8192,
                    "response_schema": response_schema, "response_mime_type":"application/json",
                    "thinkingConfig": {"thinkingBudget": 1_000}}

# jsonlines.open does not create missing folders.
os.makedirs(os.path.dirname(jsonl_fn) or ".", exist_ok=True)

# Open the file once in write mode: with append mode every re-run of this cell
# added the same requests again, producing duplicate custom_ids in the batch.
with jsonlines.open(jsonl_fn, mode="w") as writer:

    for key in tqdm(keys):
        df = pd.read_csv(f"./outputs/benchmark/{key}")

        for row_index, row in df.iterrows():
            # Extract data from CSV row
            sample_id = row.get('_sample_id', f"{row_index}")
            text = row.get('text', '')  # Adjust column name as needed

            # Empty CSV cells load as NaN (a float), which would crash len();
            # a request without any sample text is useless, so skip it.
            if pd.isna(text) or not str(text).strip():
                print(f"{row_index} of {key} has no text. Skipping")
                continue
            text = str(text)

            if len(text) > thresh:
                print(f"{row_index} of {key} is too large ({len(text)}). Truncating to {thresh}")
                text = text[:thresh]

            gem_request = {
                "custom_id": f"{key}_{sample_id}",
                "request": {
                    "contents": [
                        {
                            "role": "user",
                            "parts": [
                                {"text": prompt},
                                {"text": text}
                            ]
                        }
                    ],
                    "generationConfig": generationConfig,
                    "system_instruction": {"parts": [{"text": system_prompt}]}
                }
            }
            writer.write(gem_request)

In [None]:
# Sanity check: number of requests written to the batch file.
count_jsonl_lines(jsonl_fn)

In [None]:
# Upload the local request file to the bucket under the same file name.
blob = bucket.blob(JSON_FILENAME)
blob.upload_from_filename(jsonl_fn)

In [None]:
# gs:// URI of the request file uploaded in the previous step.
INPUT_DATA = f"gs://{BUCKET_NAME}/{JSON_FILENAME}"
# Destination passed to the batch job for its output.
# NOTE(review): "BUKCET" is a typo for "BUCKET"; a rename must also update the use below.
DEST_BUKCET_URI = "gs://tau_ocr_results"

# Create the batch job for MODEL_ID over the uploaded requests.
gcs_batch_job = client.batches.create(
    model=MODEL_ID,
    src=INPUT_DATA,
    config=CreateBatchJobConfig(dest=DEST_BUKCET_URI),
)
# Job name; needed to fetch the job again when polling.
gcs_batch_job.name

In [None]:
# States after which the job will not progress on its own. The original list
# lacked the cancelled/expired/paused/partially-succeeded states, so the loop
# below never ended when the job finished in one of them.
terminal_job_states = (
    "JOB_STATE_SUCCEEDED",
    "JOB_STATE_PARTIALLY_SUCCEEDED",
    "JOB_STATE_FAILED",
    "JOB_STATE_CANCELLED",
    "JOB_STATE_EXPIRED",
    "JOB_STATE_PAUSED",
    "JOB_STATE_UNEXECUTED",
)

gcs_batch_job = client.batches.get(name=gcs_batch_job.name)
# Refresh the job until it reaches a terminal state
while gcs_batch_job.state not in terminal_job_states:
    time.sleep(5)
    gcs_batch_job = client.batches.get(name=gcs_batch_job.name)

# Check if the job succeeded
if gcs_batch_job.state == "JOB_STATE_SUCCEEDED":
    print("Job succeeded!")
else:
    # Several non-success end states exist, so report which one was reached
    print(f"Job did not succeed (state: {gcs_batch_job.state}): {gcs_batch_job.error}")

# Collecting batch results

In [None]:
def ensure_candidates(response):
    """
    Guarantee that a response carries a 'candidates' entry.

    A dict is updated in place (an empty list is added when the key is missing)
    and returned; anything else is replaced by ``{'candidates': []}``.
    """
    if isinstance(response, dict):
        response.setdefault('candidates', [])
        return response
    return {'candidates': []}
    
def extract_response_text(response):
    """
    Return the 'text' of the first part of a candidate's parts list.

    Args:
        response: The parts list of a candidate; may be missing (None/NaN)

    Returns:
        The text of the first part, or None when there is no usable first part
    """
    try:
        return response[0]['text']
    except (AttributeError, TypeError, IndexError, KeyError):
        # TypeError: parts missing (None/NaN) or not subscriptable
        # IndexError: empty parts list; KeyError: first part carries no 'text'
        return None

In [None]:
def parse_json_response(json_string):
    """Parse JSON response string and extract key fields"""
    try:
        data = json.loads(json_string)
        return pd.Series({
            'score': data.get('score'),
            'comments': data.get('comments'),
            'recommendations': data.get('recommendations'),
        })
    except (json.JSONDecodeError, TypeError):
        return pd.Series({
            'score': None,
            'comments': None,
            'recommendations': None,
        })

In [None]:
def extract_text_from_request(row):
    """
    Pull the sample text back out of a batch request.

    Reads the text of the second part of the first content entry under
    ``row["request"]``; returns None when any level of that path is missing.
    """
    try:
        first_content = row["request"]["contents"][0]
        second_part = first_content["parts"][1]
        return second_part["text"]
    except (KeyError, IndexError, TypeError):
        return None  # or return "" for empty string

In [None]:
# Filesystem handle used to glob the job's output location.
fs = fsspec.filesystem("gcs")

# Every predictions.jsonl one folder level below the job's destination URI.
# NOTE(review): the destination is a whole bucket, so this presumably also matches
# output folders of earlier jobs - confirm the last path is the current job's.
file_paths = fs.glob(f"{gcs_batch_job.dest.gcs_uri}/*/predictions.jsonl")
print(file_paths)

In [None]:
if gcs_batch_job.state == "JOB_STATE_SUCCEEDED":
    if not file_paths:
        raise FileNotFoundError(
            f"No predictions.jsonl found under {gcs_batch_job.dest.gcs_uri}"
        )

    # Load the JSONL file into a DataFrame
    df = pd.read_json(f"gs://{file_paths[-1]}", lines=True)
    df['response'] = df['response'].apply(ensure_candidates)

    # Flatten the first candidate of every response, one row per request.
    # pd.json_normalize(df["response"], "candidates") emits no row for a response
    # without candidates, so the index-based join attached every later candidate
    # to the wrong request. Building exactly one record per row keeps them aligned.
    first_candidates = [
        response['candidates'][0] if response['candidates'] else {}
        for response in df['response']
    ]
    candidates_df = pd.json_normalize(first_candidates)
    candidates_df.index = df.index
    df = df.join(candidates_df)

    # No request produced a candidate: the column is absent, create it empty
    if "content.parts" not in df.columns:
        df["content.parts"] = None

    df["response_text"] = df["content.parts"].apply(extract_response_text)

    # Apply the parsing function to create new columns
    df[['score', 'comments', 'recommendations']] = df['response_text'].apply(parse_json_response)

    # custom_id is "<csv file name>_<sample id>"
    df["source"] = df["custom_id"].apply(lambda x: x.split("_random_")[0])
    df["sample_id"] = df["custom_id"].apply(lambda x: x.split("_")[-1])

    df["sample_text"] = df.apply(extract_text_from_request, axis=1)

    display(df)

In [None]:
# Average score over all parsed responses (pandas skips missing scores by default).
df["score"].mean()

In [None]:
# Columns written to every sheet by save_results_to_excel.
columns_to_save = ["sample_id", "sample_text", "score", "comments", "recommendations"]

In [None]:
def save_results_to_excel(df, output_excel_path):
    """
    Save a DataFrame to Excel with a separate sheet for each source.

    Args:
        df (pd.DataFrame): Results; needs a 'source' column and every column
            listed in columns_to_save
        output_excel_path (str): Path of the workbook to write

    Sheet names are derived from the source: characters Excel forbids are
    replaced by '_', the name is cut to 31 characters, and a numeric suffix is
    added when two sources would otherwise share a name.
    """
    # Excel rejects these characters in worksheet names
    forbidden_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
    max_len = 31
    used_names = set()

    # ExcelWriter does not create missing folders
    os.makedirs(os.path.dirname(output_excel_path) or ".", exist_ok=True)

    with pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:

        # Group by source and save each group to a separate sheet
        for source, group_df in df.groupby('source'):

            # Clean sheet name (Excel has restrictions)
            base_name = str(source).translate(forbidden_chars)[:max_len] or "sheet"

            # Sheet names must be unique, compared case-insensitively
            clean_sheet_name = base_name
            counter = 1
            while clean_sheet_name.lower() in used_names:
                suffix = f"_{counter}"
                clean_sheet_name = base_name[:max_len - len(suffix)] + suffix
                counter += 1
            used_names.add(clean_sheet_name.lower())
            print(clean_sheet_name)

            # Save to sheet
            selected_df = group_df[columns_to_save].reset_index(drop=True)
            selected_df.to_excel(writer, sheet_name=clean_sheet_name, index=False)

            print(f"Saved {len(group_df)} records to sheet '{clean_sheet_name}'")

# Write the parsed results to one workbook, one sheet per source.
save_results_to_excel(df, './outputs/results/overall_benchmark_results.xlsx')