In [1]:
!pip install datasets transformers peft

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
import pandas as pd
from transformers import BertTokenizer
from datasets import Dataset

# Fetch the Persian sentiment corpus from the Hugging Face Hub.
dataset = load_dataset('SeyedAli/Persian-Text-Sentiment')

# Work on a small sample: the first 600 rows of the train split, as a DataFrame.
# NOTE(review): rows are taken from the head of the split without shuffling --
# confirm the split is not ordered by label.
df = pd.DataFrame(dataset['train'][:600])

# Rows 0-499 are used for training; everything after that is held out for evaluation.
train_df, eval_df = df.iloc[:500], df.iloc[500:]

# Wrap both frames back into Hugging Face Dataset objects.
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

# Tokenizer matching the ParsBERT checkpoint that is fine-tuned later.
tokenizer = BertTokenizer.from_pretrained(
    'HooshvareLab/bert-base-parsbert-uncased'
)

# Batch tokenization helper used with `Dataset.map(..., batched=True)`.
def preprocess_function(examples):
    """Tokenize the ``text`` entries of a batch.

    Every sequence is padded to, and truncated at, 128 tokens so all rows end
    up with the same length.

    Args:
        examples: Mapping with a ``'text'`` key holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples['text'], **encode_options)

# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs and the label, returned as PyTorch tensors.
torch_columns = ('input_ids', 'attention_mask', 'label')
for tokenized_split in (tokenized_train_dataset, tokenized_eval_dataset):
    tokenized_split.set_format(type='torch', columns=list(torch_columns))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/524 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/55852 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/13964 [00:00<?, ? examples/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings: rank-8 updates (alpha 32, dropout 0.1) injected into the
# modules named "query" and "value"; bias terms are left untouched and the task
# type marks this as sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Pretrained ParsBERT with a two-label classification head, then wrapped with
# the LoRA adapter defined above.
model = BertForSequenceClassification.from_pretrained(
    'HooshvareLab/bert-base-parsbert-uncased',
    num_labels=2,
)
model = get_peft_model(model, lora_config)

# Hyperparameters and bookkeeping for the Trainer run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir='./logs',
    report_to="none",
    # Optimisation settings.
    # NOTE(review): the recorded run's loss stays near 0.69 with this learning
    # rate; adapter training is often run with a larger value -- TODO confirm.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log every 10 steps; evaluate and checkpoint once per epoch. The save and
    # evaluation strategies are kept identical, presumably because
    # load_best_model_at_end needs them to line up.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` -- confirm against the installed version.
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Pair the tokenized splits with their roles, then hand everything to the Trainer.
datasets_by_role = dict(
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)
trainer = Trainer(model=model, args=training_args, **datasets_by_role)

# Run fine-tuning; left as the cell's last expression so the TrainOutput is displayed.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7008,0.692058
2,0.6855,0.683455
3,0.6521,0.676337
4,0.6646,0.673279
5,0.6829,0.672478




TrainOutput(global_step=315, training_loss=0.6819728805905296, metrics={'train_runtime': 1826.8006, 'train_samples_per_second': 1.369, 'train_steps_per_second': 0.172, 'total_flos': 165013593600000.0, 'train_loss': 0.6819728805905296, 'epoch': 5.0})

In [7]:
# Persist the trained model and its tokenizer side by side in one directory.
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably writes the adapter files rather than a full checkpoint -- confirm
# by listing the directory.
save_dir = "/content/drive/MyDrive/fine-tuning-project/task2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



('/content/drive/MyDrive/fine-tuning-project/task2/tokenizer_config.json',
 '/content/drive/MyDrive/fine-tuning-project/task2/special_tokens_map.json',
 '/content/drive/MyDrive/fine-tuning-project/task2/vocab.txt',
 '/content/drive/MyDrive/fine-tuning-project/task2/added_tokens.json')

In [11]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Reload the fine-tuned model for inference.
#
# The directory was written from a PEFT-wrapped model, so it is restored through
# the PEFT API: rebuild the base classifier named in the adapter config, then
# attach the saved adapter on top of it. Loading the directory directly with
# BertForSequenceClassification.from_pretrained relied on implicit adapter
# detection, and the recorded run reported the classifier head as newly
# initialized.
from peft import PeftConfig, PeftModel

MODEL_DIR = "/content/drive/MyDrive/fine-tuning-project/task2"

peft_config = PeftConfig.from_pretrained(MODEL_DIR)

# The base checkpoint has no classification head, so transformers will still warn
# that `classifier.*` is newly initialized here; attaching the adapter below is
# what is expected to restore the trained weights.
base_model = BertForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=2,  # must match the head size used during training
)
model = PeftModel.from_pretrained(base_model, MODEL_DIR)
model.eval()  # make sure dropout (including LoRA dropout) is disabled

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

# Sample input text
sample_text = "جوجه کباب خیلی پخته و خشک شده بود"

# Tokenize with the same truncation length that was used for training.
inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Map the class index to a sentiment name.
# NOTE(review): assumes the dataset encodes 0 = negative and 1 = positive -- confirm
# against the dataset card.
label_map = {0: "negative", 1: "positive"}
predicted_label = label_map[predictions.item()]

print(f"Input Text: {sample_text}")
print(f"Predicted Sentiment: {predicted_label}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input Text: جوجه کباب خیلی پخته و خشک شده بود
Predicted Sentiment: negative
