In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV input and the result files used
# later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.set_option("mode.chained_assignment", None)

# Name-pair dataset stored on the mounted Drive; fields are separated by ';'.
file_path = '/content/drive/MyDrive/Colab Notebooks/df_pairs_final.csv'
df = pd.read_csv(file_path, delimiter=';')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "NECOUDBFM/Jellyfish-13B"

# Record whether CUDA is available. Weight placement itself is delegated to
# device_map="auto" below; `device` is not referenced by the cells shown here.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loader options, collected in one place so they are easy to tune.
load_options = dict(
    device_map="auto",                     # automatic placement (e.g. GPU + RAM)
    torch_dtype=torch.float16,             # load the weights in half precision
    low_cpu_mem_usage=True,                # use less RAM while loading
    offload_folder="./jellyfish_offload",  # temp folder for offloaded model components
)

model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.4k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle
import torch
import os

# Variants to compare. Each entry is used further down as a DataFrame column
# name whose values are matched against the "name_final" column (presumably one
# column per transliteration scheme; confirm against the dataset).
models = [
    "bgn", "dstu_a", "dstu_b",
    "eki", "gost_b", "icao",
    "iso9", "OS_ascii", "rt_translit",
]

# Use the end-of-sequence token as the padding token so batched prompts can be padded.
# Note: the assignment is unconditional, so any pad token already configured on
# the tokenizer is replaced.
tokenizer.pad_token = tokenizer.eos_token

# === BATCH-INFERENCE FUNKTSIOON KOOS PROGRESSI JÄLGIMISEGA === #
def _parse_yes_no(answer_text):
    """Turn the model's generated continuation into a boolean match decision.

    The first word decides when it is "yes" or "no" (case-insensitive,
    surrounding punctuation ignored). Otherwise the last line of the
    continuation is checked for a bare "yes". Anything else, including an
    empty continuation, counts as "no match".
    """
    cleaned = answer_text.strip().lower()
    if not cleaned:
        return False

    strip_chars = " \t.,;:!?\"'[]()*"
    first_word = cleaned.split()[0].strip(strip_chars)
    if first_word in ("yes", "no"):
        return first_word == "yes"

    # Fallback: the answer may come after some preamble, on its own final line.
    last_line = cleaned.splitlines()[-1].strip(strip_chars)
    return last_line == "yes"


# === BATCH INFERENCE WITH PROGRESS TRACKING === #
def compare_names_batch(name1_list, name2_list, batch_size=10):
    """Ask the LLM, pair by pair, whether two names refer to the same person.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        name1_list: Names used as "Record A"; must support ``len()``.
        name2_list: Names used as "Record B"; same length as ``name1_list``.
        batch_size: Number of prompts sent to ``model.generate`` at once.

    Returns:
        A ``(results, raw_outputs)`` tuple of two lists with one entry per pair:
        ``results`` holds booleans (True when the model answered "Yes") and
        ``raw_outputs`` holds the full decoded sequence (prompt followed by
        the generated continuation).

    Raises:
        ValueError: If ``batch_size`` is not positive or the lists differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(name1_list) != len(name2_list):
        raise ValueError(
            f"name lists differ in length: {len(name1_list)} vs {len(name2_list)}"
        )

    results = []
    raw_outputs = []
    total = len(name1_list)

    prompts = [
        "You are tasked with determining whether two records listed below refer to the same person based on the information provided.\n"
        "Carefully compare the name fields for each record before making your decision.\n"
        "Note: Structural inconsistencies in names, such as variations in name order, the presence or absence of middle names, or the inclusion of honorific titles and prefixes, should not automatically imply a mismatch.\n"
        # Disabled prompt line, kept for reference:
        #"Missing or abbreviated parts (e.g., initials, missing first names) should be treated cautiously.\n\n"
        f"Record A: [name: {a}]\n"
        f"Record B: [name: {b}]\n\n"
        "Are record A and record B the same entity? Choose your answer from: [No, Yes]."
        for a, b in zip(name1_list, name2_list)
    ]

    # Decoder-only models continue from the last position of each row, so batched
    # prompts must be padded on the left. Force that here instead of relying on
    # whatever the tokenizer config ships with, and restore the setting afterwards.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in tqdm(range(0, total, batch_size), desc="LLM batch inference"):
            print(f"Processing name pairs {i + 1}–{min(i + batch_size, total)} of {total}")
            batch_prompts = prompts[i:i + batch_size]
            inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
            # With left padding every row's prompt occupies exactly this many positions.
            prompt_len = inputs["input_ids"].shape[1]

            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=10,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Full text is kept for inspection; the decision is parsed only from the
            # newly generated tokens so the prompt's own wording cannot leak into it.
            full_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            answers = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

            for full_text, answer in zip(full_texts, answers):
                raw_outputs.append(full_text)
                results.append(_parse_yes_no(answer))
    finally:
        tokenizer.padding_side = original_padding_side

    return results, raw_outputs

# === Output location === #
save_dir = '/content/drive/MyDrive/Colab Notebooks/chunk_basic_basic/'
os.makedirs(save_dir, exist_ok=True)

# Evaluation-set sizing: rows drawn per label value, and rows per saved chunk.
SAMPLES_PER_CLASS = 5000
CHUNK_SIZE = 1000

# Balanced sample: the same number of positive and negative pairs, then shuffle
# all rows together and renumber the index.
df_sample = pd.concat([
    df[df["label"] == True].sample(SAMPLES_PER_CLASS, random_state=42),
    df[df["label"] == False].sample(SAMPLES_PER_CLASS, random_state=42)
]).sample(frac=1, random_state=123).reset_index(drop=True)

# Each chunk is an independent copy, not a slice of df_sample: new columns are
# written into the chunks later, and writing into a slice is the pattern pandas'
# chained-assignment warning exists to flag.
chunks = [
    df_sample.iloc[i:i + CHUNK_SIZE].copy()
    for i in range(0, len(df_sample), CHUNK_SIZE)
]

# Accumulators filled while the chunks are processed.
all_true_labels = []
all_predictions = {model_name: [] for model_name in models}

# === CHUNK PROCESSING === #
for idx, chunk in enumerate(chunks):
    print(f"\n🔹 Processing chunk {idx + 1}/{len(chunks)}")

    # The reference names are the same for every column compared in this chunk.
    reference_names = chunk["name_final"].tolist()

    for model_idx, model_name in enumerate(models):
        candidate_names = chunk[model_name].tolist()
        match_results, raw_outputs = compare_names_batch(reference_names, candidate_names)

        # Keep both the parsed decision and the raw model text next to the inputs.
        chunk[model_name + "_match"] = match_results
        chunk[model_name + "_raw_response"] = raw_outputs

        # Ground-truth labels are collected once per chunk, on the first column.
        if model_idx == 0:
            all_true_labels.extend(chunk["label"].tolist())

        model_predictions = all_predictions[model_name]
        model_predictions.extend(match_results)

        assert len(all_true_labels) == len(model_predictions), \
            f"Length mismatch: labels={len(all_true_labels)}, predictions={len(all_predictions[model_name])}"

    # Persist the finished chunk; a failed write is reported but does not stop the run.
    csv_path = os.path.join(save_dir, f'processed_{idx + 1}.csv')
    try:
        chunk.to_csv(csv_path, index=False)
    except Exception as e:
        print(f"❌ Salvestamine ebaõnnestus chunk {idx + 1} puhul: {e}")
    else:
        print(f"✅ Chunk {idx + 1} salvestatud: {csv_path}")

# === FINAL SAVE (.PKL) === #
# (file name, object to pickle, message printed after a successful write)
pickle_jobs = (
    ('all_true_labels.pkl', all_true_labels, "✅ all_true_labels.pkl salvestatud"),
    ('all_predictions.pkl', all_predictions, "✅ all_predictions.pkl salvestatud"),
)

try:
    for pkl_name, payload, done_message in pickle_jobs:
        with open(os.path.join(save_dir, pkl_name), 'wb') as f:
            pickle.dump(payload, f)
        print(done_message)
except Exception as e:
    # The first failure is reported and any remaining files are skipped.
    print(f"❌ .pkl failide salvestus ebaõnnestus: {e}")

# === FINAL RESULTS === #
# One row per compared column: precision, recall and F1 against the true labels.
final_results = []
for model_name in models:
    predicted = all_predictions[model_name]
    final_results.append({
        'Model': model_name,
        'Precision': precision_score(all_true_labels, predicted, zero_division=0),
        'Recall': recall_score(all_true_labels, predicted, zero_division=0),
        'F1-score': f1_score(all_true_labels, predicted, zero_division=0),
    })

results_df = pd.DataFrame(final_results)
print(results_df)

Output hidden; open in https://colab.research.google.com to view.