Task 3 - Named entity recognition

In [2]:
!pip install transformers peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26

In [3]:
# Tag set for the NER task. A tag's position in this list is its class id
# (the id2label / label2id maps are built by enumerating this list).
# NOTE(review): the order presumably has to match the one used when the
# adapter was trained — confirm before reordering.
label_list = [
    'O',      # 0
    'B-PER',  # 1
    'I-PER',  # 2
    'B-ORG',  # 3
    'I-ORG',  # 4
    'B-LOC',  # 5
    'I-LOC',  # 6
]

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from peft import PeftModel, PeftConfig

# Load the base model and tokenizer. The classification head is sized from
# label_list (one output per tag).
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-multilingual-cased",
    num_labels=len(label_list),
)
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")

# Location of the saved LoRA adapter for this task.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
adapter_path = "/content/drive/MyDrive/fine-tuning-project/task3"  # Double-check this path
peft_config = PeftConfig.from_pretrained(adapter_path)

# Attach the adapter to the already-instantiated base model.
# `load_in_8bit` and `device_map` were previously passed here, but they are
# base-model loading options: the base model above is loaded without them, and
# the recorded run had no CUDA runtime, so no 8-bit quantization took place.
# They are dropped to avoid suggesting the model is quantized or auto-placed.
model = PeftModel.from_pretrained(model, adapter_path)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [5]:
# Build both directions of the class-id <-> tag mapping from label_list.
id2label = dict(enumerate(label_list))
label2id = {tag: class_id for class_id, tag in id2label.items()}

# Store the mappings on the model config so predicted ids can be decoded to tags.
model.config.id2label = id2label
model.config.label2id = label2id

In [6]:
def align_predictions(predictions, inputs, id2label=None):
    """Collapse per-token class ids into one tag per word.

    Args:
        predictions: Sequence of predicted class ids, one per token of the
            first sequence in ``inputs`` (special tokens included).
        inputs: Tokenizer output exposing ``word_ids(batch_index=0)``, which
            yields a word index per token and ``None`` for special tokens.
        id2label: Optional mapping from class id to tag string. When omitted,
            ``model.config.id2label`` of the notebook-level ``model`` is used.

    Returns:
        A list of tag strings with exactly one entry per word, in word order.
        A word takes the prediction of its first token unless a later token of
        the same word predicts a tag other than ``'O'``, in which case the
        last such non-``'O'`` prediction wins.

    Raises:
        KeyError: If a prediction is not a key of ``id2label``.
    """
    if id2label is None:
        # Backward-compatible default: read the mapping from the global model.
        id2label = model.config.id2label

    word_label_ids = []
    previous_word_id = None
    for word_id, prediction in zip(inputs.word_ids(batch_index=0), predictions):
        if word_id is None:
            # Special tokens belong to no word; skipping them keeps their
            # predictions out of the result (the trailing one used to be
            # appended as an extra, spurious label).
            previous_word_id = None
            continue
        if word_id != previous_word_id:
            # First token of a new word provides the initial label.
            word_label_ids.append(prediction)
        elif id2label[prediction] != 'O':
            # A later sub-token with an entity tag overrides the current label.
            word_label_ids[-1] = prediction
        previous_word_id = word_id

    return [id2label[label_id] for label_id in word_label_ids]

def visualize_predictions(text, aligned_labels):
    """Print every word of ``text`` next to its label as ``word -> label``.

    The text is tokenized with the notebook-level ``tokenizer``; the pieces of
    each word are concatenated (with any ``##`` removed) and the label is
    looked up in ``aligned_labels`` by the word's index.

    Args:
        text: The raw input string.
        aligned_labels: Sequence of labels indexable by word index.
    """
    encoding = tokenizer(text, return_tensors="pt")
    pieces = tokenizer.convert_ids_to_tokens(encoding.input_ids[0])

    word_text = ""
    word_index = None
    for piece, piece_word_index in zip(pieces, encoding.word_ids(batch_index=0)):
        # Tokens without a word index (special tokens) are not printed.
        if piece_word_index is None:
            continue
        # A change of word index means the previous word is complete.
        if word_text and piece_word_index != word_index:
            print(f"{word_text} -> {aligned_labels[word_index]}")
            word_text = ""
        word_text += piece.replace("##", "")
        word_index = piece_word_index

    # Flush the last word, if any.
    if word_text:
        print(f"{word_text} -> {aligned_labels[word_index]}")


In [7]:
import torch

# Text to tag (kept exactly as in the recorded run, leading space included)
sample_input = " شهرستان بوکان"

# Encode the text as PyTorch tensors
inputs = tokenizer(sample_input, return_tensors="pt")

# Forward pass in evaluation mode, without gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Highest-scoring class id per token; squeeze() removes the size-1 dimensions
# (here the batch axis, since a single string was encoded)
predictions = logits.argmax(dim=2).squeeze().tolist()

# Reduce token-level predictions to one label per word
aligned_labels = align_predictions(predictions, inputs)

# Print each word next to its label
visualize_predictions(sample_input, aligned_labels)


شهرستان -> I-LOC
بوکان -> I-LOC


شهرستان بوکان
استان آذربایجان غربی
محمد بن ثانی آل ثانی
سعدی
باغ وحش و آکواریوم سن آنتونیو
سازمان میراث فرهنگی، صنایع دستی و گردشگری


Task 4 - Fill mask

In [8]:
from transformers import AutoConfig, AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig

# Base checkpoint, tokenizer and (head-less) encoder for the fill-mask task.
# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` that the
# task3 helpers (align_predictions / visualize_predictions) read as globals, so
# those helpers must not be re-run after this cell without reloading task3.
config = AutoConfig.from_pretrained("HooshvareLab/bert-fa-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")

# Location of the saved LoRA adapter for this task.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
adapter_path = "/content/drive/MyDrive/fine-tuning-project/task4"  # Double-check this path
peft_config = PeftConfig.from_pretrained(adapter_path)

# Attach the adapter to the already-instantiated base model.
# `load_in_8bit` and `device_map` were previously passed here, but they are
# base-model loading options and the base model above is loaded without them;
# they are dropped to avoid suggesting the model is quantized or auto-placed.
model = PeftModel.from_pretrained(model, adapter_path)



config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

In [21]:
from transformers import AutoModelForMaskedLM

# Load the same checkpoint with its masked-language-modeling head. Only the
# head (`base_model.cls`) is used below; the encoder pass is done by `model`.
base_model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/bert-fa-base-uncased")

# Prepare a sample input sentence with a mask token
sample_input = "سلام، [MASK] می‌توانم از مدل BERT استفاده کنم؟"

# Tokenize the input
inputs = tokenizer(sample_input, return_tensors='pt')

# Move tensors to the appropriate device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = {key: value.to(device) for key, value in inputs.items()}

# Move BOTH models to the device. The MLM head consumes the encoder's hidden
# states, so leaving base_model on the CPU breaks the head call on a GPU machine.
model.to(device)
base_model.to(device)

# Set both models to evaluation mode
model.eval()
base_model.eval()

# Positions (along the sequence axis) of the mask token in the input
mask_token_indices = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Encoder pass and MLM head, both without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

    # Extract last_hidden_state and pass it through the language modeling head
    last_hidden_state = outputs.last_hidden_state

    # Apply the language modeling head manually (note: this is model-specific)
    logits = base_model.cls(last_hidden_state)  # base_model.cls is typically the MLM head for BERT

# Vocabulary scores at the masked position(s) of the first (only) sequence
mask_token_logits = logits[0, mask_token_indices, :]

# Get the top 5 tokens predicted for the masked position
top_k = 5
top_k_scores, top_k_tokens = torch.topk(mask_token_logits, top_k, dim=-1)

# Convert the tokens to words and pair them with their scores.
# Only the first masked position (index 0) is reported.
predicted_tokens_and_scores = [
    (tokenizer.decode([token]), score.item())
    for token, score in zip(top_k_tokens[0], top_k_scores[0])
]

for token, score in predicted_tokens_and_scores:
    print(f"Predicted token: {token}, Score: {score}")

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted token: ایا, Score: 15.147441864013672
Predicted token: چطور, Score: 14.248228073120117
Predicted token: چگونه, Score: 14.200218200683594
Predicted token: من, Score: 11.0946683883667
Predicted token: چطوری, Score: 10.851140975952148


[sample inputs](https://huggingface.co/datasets/Msobhi/virgool_62k/viewer/default/train?f[tags][value]=%27%D8%AE%D9%88%D8%AF%D8%B4%D9%86%D8%A7%D8%B3%DB%8C%27)
