In [None]:
# Colab environment setup: install the Hugging Face stack used by the cells below.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases.
# NOTE(review): `optimum` and `gptqmodel` are not referenced by any visible cell — confirm they are still needed.
!pip install transformers accelerate torch --quiet
!pip install -q huggingface_hub
!pip install optimum gptqmodel

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch



In [None]:
# Upgrade bitsandbytes (backend for the 4-bit model loading used below) together with accelerate.
# NOTE(review): a runtime restart may be needed after upgrading already-imported packages — confirm.
!pip install -U bitsandbytes accelerate



In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

from transformers import BitsAndBytesConfig

# Hugging Face Hub id of the checkpoint loaded in this cell.
model_name = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a 2-class sequence-classification head on top of the 4-bit quantized base.
# Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated (the
# notebook output shows the warning); the same setting now goes through
# `quantization_config`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    num_labels=2,
)


# Read the labelled record pairs and carve out a reproducible 70/30 split.
df = pd.read_parquet("/content/samples_3k_project_c_updated.parquet")
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Round-trip each split through JSON Lines so `datasets` can load it.
for _frame, _jsonl_path in ((train_df, "train.jsonl"), (test_df, "test.jsonl")):
    _frame.to_json(_jsonl_path, orient="records", lines=True)

# Each file holds a single split, which `load_dataset` exposes under the name "train".
train_data, test_data = (
    load_dataset("json", data_files=jsonl_path, split="train")
    for jsonl_path in ("train.jsonl", "test.jsonl")
)

def tokenize_data(data):
    """Render one record pair as a text prompt and tokenize it.

    Args:
        data: a single dataset example (mapping). The nine attribute columns of
            record A, their ``base_``-prefixed counterparts for record B, plus
            ``label`` and ``id`` are read from it.

    Returns:
        The tokenizer output for the prompt (at most 256 tokens, truncated),
        extended with ``labels`` (``label`` cast to int) and ``id``.
    """
    # (caption shown in the prompt, column name in the example)
    attributes = (
        ("Source", "sources"),
        ("Name", "names"),
        ("Category", "categories"),
        ("Website", "websites"),
        ("Socials", "socials"),
        ("Emails", "emails"),
        ("Phones", "phones"),
        ("Brand", "brand"),
        ("Addresses", "addresses"),
    )

    sections = []
    for heading, column_prefix in (("--- Record A ---", ""), ("--- Record B (Base) ---", "base_")):
        lines = [heading]
        lines.extend(f"{caption}: {data[column_prefix + column]}" for caption, column in attributes)
        # Every section ends with a blank line.
        sections.append("\n".join(lines) + "\n\n")

    prompt = (
        "".join(sections)
        + "Question: Are Record A and Record B referring to the same entity?\n"
        + "Answer:"
    )

    # NOTE(review): prompts longer than 256 tokens are truncated; presumably the
    # tail (record B / the question) is what gets dropped — confirm prompt lengths.
    encoded = tokenizer(prompt, max_length=256, truncation=True)
    encoded["labels"] = int(data["label"])
    encoded["id"] = data["id"]
    return encoded

# Build the prompt and token ids for every example, train split first, then test.
train_data, test_data = (split.map(tokenize_data) for split in (train_data, test_data))

def fix_data(data):
    """Cast the example's ``labels`` entry to a built-in int and return the same example."""
    label_as_int = int(data["labels"])
    data["labels"] = label_as_int
    return data

# Normalise the label column on both splits (train first, then test).
train_data, test_data = (split.map(fix_data) for split in (train_data, test_data))

# Expose only the model inputs, as torch tensors, when the splits are indexed.
for _split in (train_data, test_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

from peft import prepare_model_for_kbit_training

# The base model is loaded in 4-bit, so prepare it for training before the
# adapters are attached. This freezes the base weights, turns on gradient
# checkpointing and makes the embedding outputs require grad. Enabling
# checkpointing only after wrapping (as before) leaves the checkpointed blocks
# without a grad-requiring input, so the LoRA weights inside them would not
# receive gradients.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Rank-8 LoRA on the attention projections (`qkv_proj`, `o_proj`); SEQ_CLS also
# keeps the classification head trainable.
peft_config = LoraConfig(r=8, target_modules=["qkv_proj", "o_proj"], task_type="SEQ_CLS")
model = get_peft_model(model, peft_config)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Reclaim memory between experiments: collect unreachable Python objects first,
# then release PyTorch's cached CUDA blocks.
# NOTE(review): objects still referenced by notebook globals (e.g. `model`) are
# not freed by this — `del` them first if the goal is to reclaim GPU memory.
import gc

import torch

gc.collect()
torch.cuda.empty_cache()

Meta's Llama

In [None]:
from huggingface_hub import login
# Interactive Hugging Face Hub authentication (renders a token-entry widget).
# NOTE(review): presumably required because the meta-llama checkpoint loaded in the next cell is gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hugging Face Hub id of the checkpoint whose tokenizer and weights this cell loads.
model_name = "meta-llama/Llama-3.2-1B"

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batches are padded at training time, which fails without a padding token.
# Fall back to EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a 2-class sequence-classification head on top of the 4-bit quantized base.
# Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated; the
# same setting now goes through `quantization_config`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map={"": device},
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    num_labels=2,
)

# The classification head uses the pad id to find the last real token of each
# sequence, so the model config must agree with the tokenizer.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Read the labelled record pairs and carve out a reproducible 70/30 split.
df = pd.read_parquet("/content/samples_3k_project_c_updated.parquet")
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Round-trip each split through JSON Lines so `datasets` can load it.
for _frame, _jsonl_path in ((train_df, "train.jsonl"), (test_df, "test.jsonl")):
    _frame.to_json(_jsonl_path, orient="records", lines=True)

# Each file holds a single split, which `load_dataset` exposes under the name "train".
train_data, test_data = (
    load_dataset("json", data_files=jsonl_path, split="train")
    for jsonl_path in ("train.jsonl", "test.jsonl")
)

def tokenize_data(data):
    """Render one record pair as a text prompt and tokenize it.

    Args:
        data: a single dataset example (mapping). The nine attribute columns of
            record A, their ``base_``-prefixed counterparts for record B, plus
            ``label`` and ``id`` are read from it.

    Returns:
        The tokenizer output for the prompt (at most 256 tokens, truncated),
        extended with ``labels`` (``label`` cast to int) and ``id``.
    """
    # (caption shown in the prompt, column name in the example)
    attributes = (
        ("Source", "sources"),
        ("Name", "names"),
        ("Category", "categories"),
        ("Website", "websites"),
        ("Socials", "socials"),
        ("Emails", "emails"),
        ("Phones", "phones"),
        ("Brand", "brand"),
        ("Addresses", "addresses"),
    )

    sections = []
    for heading, column_prefix in (("--- Record A ---", ""), ("--- Record B (Base) ---", "base_")):
        lines = [heading]
        lines.extend(f"{caption}: {data[column_prefix + column]}" for caption, column in attributes)
        # Every section ends with a blank line.
        sections.append("\n".join(lines) + "\n\n")

    prompt = (
        "".join(sections)
        + "Question: Are Record A and Record B referring to the same entity?\n"
        + "Answer:"
    )

    # NOTE(review): prompts longer than 256 tokens are truncated; presumably the
    # tail (record B / the question) is what gets dropped — confirm prompt lengths.
    encoded = tokenizer(prompt, max_length=256, truncation=True)
    encoded["labels"] = int(data["label"])
    encoded["id"] = data["id"]
    return encoded

# Build the prompt and token ids for every example, train split first, then test.
train_data, test_data = (split.map(tokenize_data) for split in (train_data, test_data))

def fix_data(data):
    """Cast the example's ``labels`` entry to a built-in int and return the same example."""
    label_as_int = int(data["labels"])
    data["labels"] = label_as_int
    return data

# Normalise the label column on both splits (train first, then test).
train_data, test_data = (split.map(fix_data) for split in (train_data, test_data))

# Expose only the model inputs, as torch tensors, when the splits are indexed.
for _split in (train_data, test_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

from peft import prepare_model_for_kbit_training

# No `model.to(device)` here: `device_map` already placed the weights at load
# time, and moving a bitsandbytes-quantized model with `.to()` is rejected by
# some transformers/bitsandbytes versions.

# The base model is loaded in 4-bit, so prepare it for training before the
# adapters are attached. This freezes the base weights, turns on gradient
# checkpointing and makes the embedding outputs require grad, so the LoRA
# weights inside the checkpointed blocks still receive gradients.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Llama names its attention projections `q_proj` / `k_proj` / `v_proj`; the
# previous `query` / `key` / `value` targets match no module in this
# architecture, so PEFT could not attach any adapter.
peft_config = LoraConfig(r=8, target_modules=["q_proj", "k_proj", "v_proj"], task_type="SEQ_CLS")
model = get_peft_model(model, peft_config)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Google's Electra

In [None]:
# Hugging Face Hub id of the checkpoint whose tokenizer and weights this cell loads.
model_name = "google/electra-small-discriminator"

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Prefer the GPU when one is visible; everything is placed on that single device.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Full-precision (float32) load with a 2-class head; 4-bit quantization is not
# used for this checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    torch_dtype=torch.float32,
    device_map={"": device},
)

# Read the labelled record pairs and carve out a reproducible 70/30 split.
df = pd.read_parquet("/content/samples_3k_project_c_updated.parquet")
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Round-trip each split through JSON Lines so `datasets` can load it.
for _frame, _jsonl_path in ((train_df, "train.jsonl"), (test_df, "test.jsonl")):
    _frame.to_json(_jsonl_path, orient="records", lines=True)

# Each file holds a single split, which `load_dataset` exposes under the name "train".
train_data, test_data = (
    load_dataset("json", data_files=jsonl_path, split="train")
    for jsonl_path in ("train.jsonl", "test.jsonl")
)

def tokenize_data(data):
    """Render one record pair as a text prompt and tokenize it.

    Args:
        data: a single dataset example (mapping). The nine attribute columns of
            record A, their ``base_``-prefixed counterparts for record B, plus
            ``label`` and ``id`` are read from it.

    Returns:
        The tokenizer output for the prompt (at most 256 tokens, truncated),
        extended with ``labels`` (``label`` cast to int) and ``id``.
    """
    # (caption shown in the prompt, column name in the example)
    attributes = (
        ("Source", "sources"),
        ("Name", "names"),
        ("Category", "categories"),
        ("Website", "websites"),
        ("Socials", "socials"),
        ("Emails", "emails"),
        ("Phones", "phones"),
        ("Brand", "brand"),
        ("Addresses", "addresses"),
    )

    sections = []
    for heading, column_prefix in (("--- Record A ---", ""), ("--- Record B (Base) ---", "base_")):
        lines = [heading]
        lines.extend(f"{caption}: {data[column_prefix + column]}" for caption, column in attributes)
        # Every section ends with a blank line.
        sections.append("\n".join(lines) + "\n\n")

    prompt = (
        "".join(sections)
        + "Question: Are Record A and Record B referring to the same entity?\n"
        + "Answer:"
    )

    # NOTE(review): prompts longer than 256 tokens are truncated; presumably the
    # tail (record B / the question) is what gets dropped — confirm prompt lengths.
    encoded = tokenizer(prompt, max_length=256, truncation=True)
    encoded["labels"] = int(data["label"])
    encoded["id"] = data["id"]
    return encoded

# Build the prompt and token ids for every example, train split first, then test.
train_data, test_data = (split.map(tokenize_data) for split in (train_data, test_data))

def fix_data(data):
    """Cast the example's ``labels`` entry to a built-in int and return the same example."""
    label_as_int = int(data["labels"])
    data["labels"] = label_as_int
    return data

# Normalise the label column on both splits (train first, then test).
train_data, test_data = (split.map(fix_data) for split in (train_data, test_data))

# Expose only the model inputs, as torch tensors, when the splits are indexed.
for _split in (train_data, test_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

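# LoRA is switched off for this checkpoint: with the lines below commented out,
# the model is fine-tuned directly, without adapters.
# # model.to(device)
# peft_config = LoraConfig(r=8, target_modules=["query", "key", "value"], task_type="SEQ_CLS")
# model = get_peft_model(model, peft_config)
# model.gradient_checkpointing_enable()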

In [None]:
from transformers import TrainingArguments, Trainer

# Batches are padded by the Trainer's collator, and the classification head needs
# a pad id to locate the last real token. Checkpoints that define no padding
# token fall back to EOS; tokenizers that already have one are left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Checkpoints go to ./test-trainer; external experiment loggers are disabled.
# NOTE(review): bf16=True requires hardware with bfloat16 support — confirm for the target runtime.
training_args = TrainingArguments(
    "test-trainer",
    report_to="none",
    bf16=True,
)

# `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement
# and still gives the Trainer a padding collator built from the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    processing_class=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
500,0.6353


TrainOutput(global_step=789, training_loss=0.6070514093938134, metrics={'train_runtime': 69.4125, 'train_samples_per_second': 90.762, 'train_steps_per_second': 11.367, 'total_flos': 92671868620800.0, 'train_loss': 0.6070514093938134, 'epoch': 3.0})

In [None]:
from google.colab import drive

# Mount Google Drive so the fine-tuned artefacts outlive the Colab runtime.
drive.mount("/content/drive")

# Weights and tokenizer files are written side by side in one folder.
export_dir = "/content/drive/MyDrive/electra"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

Mounted at /content/drive


('/content/drive/MyDrive/electra/tokenizer_config.json',
 '/content/drive/MyDrive/electra/special_tokens_map.json',
 '/content/drive/MyDrive/electra/vocab.txt',
 '/content/drive/MyDrive/electra/added_tokens.json',
 '/content/drive/MyDrive/electra/tokenizer.json')

In [None]:
import numpy as np

# Score the held-out split. `test_data` was already tokenized and formatted in
# the model cell, so it is passed to the trainer as is; mapping `tokenize_data`
# over it a second time only repeated that work.
results = trainer.predict(test_data)

# The predicted class is the index of the largest logit. NumPy handles this
# directly, without a round trip through a torch tensor.
argmax_preds = np.argmax(results.predictions, axis=-1)
print(argmax_preds[:10])
print(f"Accuracy: {np.mean(results.label_ids == argmax_preds)}")

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

[0 1 0 0 1 1 1 0 0 1]
Accuracy: 0.7188888888888889
