In [4]:

from datasets import load_dataset
import pandas as pd
import csv
import re
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"



In [5]:
########################
# author: Rohit Mishra #
########################



def parse_sample(sample):
    text = sample['text']
    input_match = re.search(r'INPUT:\s*(.*?)\s*Question:', text, re.DOTALL)
    input_text = input_match.group(1).strip() if input_match else ""
    return {
        'primary_key': sample['id'],
        'text': input_text,
    }

def load_dataset_():
    dataset = load_dataset("YBXL/PubmedQA_train", split="train")
    samples = dataset.select(range(1000))
    parsed_data = []
    for sample in samples:
        parsed = parse_sample(sample)
        parsed_data.append(parsed)
    df = pd.DataFrame(parsed_data)

    output_file = '/content/pubmedqa_1000_samples.csv'
    print(f"\nSaving to {output_file}...")
    df.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

load_dataset_()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/623 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/179M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/866k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/866k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]


Saving to /content/pubmedqa_1000_samples.csv...


In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login
df = pd.read_csv('/content/pubmedqa_1000_samples.csv')

from huggingface_hub import login


HF_TOKEN = ""
login(token=HF_TOKEN)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

results = []

for idx, row in df.iterrows():
    full_text = row['text']

    # Tokenize
    tokens = tokenizer.encode(full_text, add_special_tokens=False)

    # Split: first 50 tokens = prefix, next 500 = suffix
    prefix_tokens = tokens[:500]
    suffix_tokens = tokens[500:]  # tokens 50 to 550
    if len(suffix_tokens)>=50:
        # Convert back to text
        prefix_text = tokenizer.decode(prefix_tokens)
        suffix_text = tokenizer.decode(suffix_tokens)

        results.append({
            'sample_id': row['primary_key'],
            'prefix': prefix_text,  # First 50 tokens
            'suffix': suffix_text  # Next 500 tokens
        })

# Save
df_new = pd.DataFrame(results)
df_new.to_csv('/content/pubmedqa_split.csv', index=False)
print(f"Done! Created {len(df_new)} samples")

'''

'''




Done! Created 18 samples


'\n\n'

In [8]:
pip install -U vllm

Collecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer==0.11.3 (from vllm)
  Downloading lm_format_enforcer-0.11.3-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.11 (from vllm)
  Downloading llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting outlines_core==0.2.11 (from vllm)
  Downloading outlines_core-0.2.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting diskcache==5.6.3 (from vllm)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting lark==1.2.2 (from vllm)
  Downloading lark-1.2.2-py3-none-any.whl.metada

In [1]:
import pandas as pd
from tqdm import tqdm
from vllm import LLM, SamplingParams
from huggingface_hub import login
import os

# Login to Hugging Face
hf_token = "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"
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Set HUGGINGFACE_HUB_TOKEN environment variable")

# Load model
print("Loading LLaMA 3...")
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B",
    dtype="bfloat16",
    gpu_memory_utilization=0.8,
    max_model_len=2048,
)

sampling_params = SamplingParams(temperature=0.0, top_k=1, max_tokens=512)

print("Loading data...")
df = pd.read_csv('/content/pubmedqa_split.csv')
print(f"Loaded {len(df)} samples")

results = []
batch_size = 16
batch_inputs = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    prompt = f"Continue this medical text:\n\n{row['prefix']}"
    batch_inputs.append((idx, prompt, row))

    if len(batch_inputs) == batch_size or idx == len(df) - 1:
        prompts = [item[1] for item in batch_inputs]

        outputs = llm.generate(prompts, sampling_params)

        for i, output in enumerate(outputs):
            original_idx, _, original_row = batch_inputs[i]
            results.append({
                'sample_id': original_row['sample_id'],
                'prefix': original_row['prefix'],
                'suffix': original_row['suffix'],
                'generated': output.outputs[0].text.strip()
            })

        batch_inputs = []

df_results = pd.DataFrame(results)
df_results.to_csv('/content/results.csv', index=False)


INFO 11-03 16:53:25 [__init__.py:216] Automatically detected platform cuda.
Loading LLaMA 3...
INFO 11-03 16:53:34 [utils.py:233] non-default args: {'dtype': 'bfloat16', 'max_model_len': 2048, 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'model': 'meta-llama/Meta-Llama-3-8B'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 11-03 16:53:36 [model.py:547] Resolved architecture: LlamaForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-03 16:53:36 [model.py:1510] Using max model len 2048
INFO 11-03 16:53:39 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-03 16:54:25 [llm.py:306] Supported_tasks: ['generate']
Loading data...
Loaded 18 samples


  0%|          | 0/18 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

 89%|████████▉ | 16/18 [00:06<00:00,  2.31it/s]

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100%|██████████| 18/18 [00:13<00:00,  1.38it/s]

✅ Done! Saved to results.csv





In [4]:
import pandas as pd
from tqdm import tqdm
import os
from transformers import AutoTokenizer



TOKENIZER = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")


def tokenize_text(text: str) -> list[str]:
    if pd.isna(text):
        return []
    return TOKENIZER.tokenize(str(text))

def check_similarity(groundtruth: str, response: str, n: int = 50) -> tuple[bool, str | None]:

    gt_tokens = tokenize_text(groundtruth)
    res_tokens = tokenize_text(response)

    gt_len = len(gt_tokens)
    res_len = len(res_tokens)

    if gt_len < n or res_len < n:
        return False, None

    for i in range(gt_len - n + 1):
        gt_ngram = gt_tokens[i:i+n]

        for j in range(res_len - n + 1):
            res_ngram = res_tokens[j:j+n]

            if gt_ngram == res_ngram:
                matched_string = ' '.join(gt_ngram)
                return True, matched_string

    return False, None

def process_similarity(input_file: str, output_file: str, n: int = 50):


    try:
        df = pd.read_csv(input_file, low_memory=False)
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_file}'.")
        return

    match_count = 0

    GROUNDTRUTH_KEY = "suffix"
    RESPONSE_KEY = "generated"
    SIMILARITY_COLUMN = "similarity"
    MATCHED_STRING_COLUMN = f"matched_string_{n}"


    tqdm.pandas(desc=f"Checking for {n}-token matches")

    def apply_similarity_check(row):
        try:
            groundtruth = row[GROUNDTRUTH_KEY]
            response = row[RESPONSE_KEY]
        except KeyError as e:
            print(f"Skipping row due to missing key: {e}. Ensure keys are '{GROUNDTRUTH_KEY}' and '{RESPONSE_KEY}'.")
            return (False, None)

        is_similar, matched_string = check_similarity(groundtruth, response, n)

        if is_similar:
            nonlocal match_count
            match_count += 1

        return is_similar, matched_string

    df[[SIMILARITY_COLUMN, MATCHED_STRING_COLUMN]] = df.progress_apply(
        apply_similarity_check,
        axis=1,
        result_type='expand'
    )


    print(f"Total entries processed: {len(df)}")
    print(f"Total entries with at least {n} consecutive matching tokens: {match_count}")
    print(f"Percentage of matched entries: {(match_count / len(df) * 100):.2f}%")

    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    df.to_csv(output_file, index=False)

    print(f"Results saved to: {output_file}")

if __name__ == "__main__":
    input_file = '/content/results.csv'

    output_file = 'results/llama3_similarity_check_50.csv'

    N_TOKENS = 10

    process_similarity(input_file, output_file, N_TOKENS)


Checking for 10-token matches: 100%|██████████| 18/18 [00:00<00:00, 134.66it/s]


--- Summary ---
Total entries processed: 18
Total entries with at least 10 consecutive matching tokens: 4
Percentage of matched entries: 22.22%
Results saved to: results/llama3_similarity_check_50.csv



