In [1]:
!pip install transformers datasets evaluate accelerate
!pip install parsivar



In [2]:
# import torch
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

In [3]:
!pip install parsivar



In [4]:
import pandas as pd
from parsivar import Normalizer
import re

# Read the TSV file into a DataFrame
df = pd.read_csv('train.tsv', sep='\t', header=None, names=['Sentence', 'Label'], encoding='utf-8')
df_test = pd.read_csv('test.tsv', sep='\t', header=None, names=['Sentence', 'Label'], encoding='utf-8')

# Initialize Parsivar normalizer
normalizer = Normalizer()

# Function to perform additional pre-processing steps
def additional_preprocessing(text):
    # Remove English characters
    text = re.sub(r'[a-zA-Z]', '', text)

    # Remove repeated letters more than twice in non-standard Persian words
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Remove Arabic diacritics
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    # Remove remaining non-Persian characters
    text = re.sub(r'[^آ-ی۰-۹\s]', '', text)

    # Remove hashtag sign while preserving hashtag information
    text = re.sub(r'#(\w+)', r'\1', text)

    # Remove Persian numeric characters
    text = re.sub(r'[۰-۹]', '', text)

    return text

# Apply pre-processing to the 'Sentence' column
df['Sentence'] = df['Sentence'].apply(normalizer.normalize)
df['Sentence'] = df['Sentence'].apply(additional_preprocessing)
df = df.rename(columns={'Sentence': 'text', 'Label': 'label'})

df_test['Sentence'] = df_test['Sentence'].apply(normalizer.normalize)
df_test['Sentence'] = df_test['Sentence'].apply(additional_preprocessing)
df_test = df_test.rename(columns={'Sentence': 'text', 'Label': 'label'})

# Display the DataFrame
print(df)


                                                   text     label
0     خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...       SAD
1        از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم      HATE
2      کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ...       SAD
3     چون همش با دوربین ثبتشده ایا میشه اعتراض زد  و...     OTHER
4                     این وضع ب طرز خندهداری گریه داره        SAD
...                                                 ...       ...
6120  مرحوم پیشبینی آبکی زیاد میکرد مرحوم عجب آینده ...  SURPRISE
6121  کلا عین اعتقادات و توئیت زدناتون   در قبال ران...     ANGRY
6122  خب وقتی میگی کسی بیاد مارو بگیره یارو ترس میکن...      FEAR
6123  همون هارو مگه آهنگ جدیدای خوانندههای دهه پنجاه...  SURPRISE
6124                             نیم دگیرش چطور حل نیشد     OTHER

[6125 rows x 2 columns]


In [5]:
df

Unnamed: 0,text,label
0,خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...,SAD
1,از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم,HATE
2,کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ...,SAD
3,چون همش با دوربین ثبتشده ایا میشه اعتراض زد و...,OTHER
4,این وضع ب طرز خندهداری گریه داره,SAD
...,...,...
6120,مرحوم پیشبینی آبکی زیاد میکرد مرحوم عجب آینده ...,SURPRISE
6121,کلا عین اعتقادات و توئیت زدناتون در قبال ران...,ANGRY
6122,خب وقتی میگی کسی بیاد مارو بگیره یارو ترس میکن...,FEAR
6123,همون هارو مگه آهنگ جدیدای خوانندههای دهه پنجاه...,SURPRISE


In [6]:
df_test

Unnamed: 0,text,label
0,این شاید اولین عزای عمومی واقعی است که یاد دار...,SAD
1,دیشب بعد از ارسال تویت مربوط به آثار باستانی ت...,HAPPY
2,کدوم شعبه پول نداده بگو الان برات آمار دقیق ب...,OTHER
3,امروز وسط یه بحث با بابا مامانم گفتم آدم باید ...,HAPPY
4,امشب گفت نامزدی دوستش که ادم روشنفکری است بهم ...,SAD
...,...,...
1146,یعنی این آهنگ مرغ سحر جوری ساختهشدهو روی این ش...,HAPPY
1147,درود بر همه ایرانیان شریف که در این سرما در تظ...,HAPPY
1148,امروز تولدم است عید است ولی شاد نیستم عید و ...,SAD
1149,لعنت به اونی که دلتنگ نگهت میداره,SAD


In [7]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
df_test['label'] = label_encoder.transform(df_test['label'])

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
# model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [9]:
dataset = Dataset.from_pandas(df)
dataset_test = Dataset.from_pandas(df_test)

In [10]:
dataset[0]

{'text': 'خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام پس بدم', 'label': 5}

In [11]:
dataset[0]

{'text': 'خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام پس بدم', 'label': 5}

In [12]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 6125
})

In [13]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True , padding=True)

In [14]:
dataset_yok = dataset.map(preprocess_function, batched=True)
dataset_test_yok = dataset_test.map(preprocess_function, batched=True)


Map:   0%|          | 0/6125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1151 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:
print("Label encoder classes:", label_encoder.classes_)

Label encoder classes: ['ANGRY' 'FEAR' 'HAPPY' 'HATE' 'OTHER' 'SAD' 'SURPRISE']


In [19]:
id2label = {
    0: "ANGRY",
    1: "FEAR",
    2: "HAPPY",
    3: "HATE",
    4: "OTHER",
    5: "SAD",
    6: "SURPRISE"
}

label2id = {label: id for id, label in id2label.items()}

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=7, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.05,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_yok,
    eval_dataset=dataset_test_yok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.601365,0.371851
2,1.691700,1.186528,0.595135
3,1.104600,1.063597,0.647263
4,0.902900,1.056085,0.660295
5,0.902900,1.012143,0.679409


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.601365,0.371851
2,1.691700,1.186528,0.595135
3,1.104600,1.063597,0.647263
4,0.902900,1.056085,0.660295
5,0.902900,1.012143,0.679409
6,0.765000,0.967625,0.707211
7,0.669600,0.989024,0.698523
8,0.575600,1.021573,0.710686
9,0.575600,1.075202,0.688097
10,0.530200,1.050573,0.695917


TrainOutput(global_step=3830, training_loss=0.8575656582107744, metrics={'train_runtime': 5997.6383, 'train_samples_per_second': 10.212, 'train_steps_per_second': 0.639, 'total_flos': 1.579142731040835e+16, 'train_loss': 0.8575656582107744, 'epoch': 10.0})

In [22]:
!zip -r saved_model.zip /content/my_awesome_model/checkpoint-3830

  adding: content/my_awesome_model/checkpoint-3830/ (stored 0%)
  adding: content/my_awesome_model/checkpoint-3830/config.json (deflated 52%)
  adding: content/my_awesome_model/checkpoint-3830/rng_state.pth (deflated 25%)
  adding: content/my_awesome_model/checkpoint-3830/trainer_state.json (deflated 75%)
  adding: content/my_awesome_model/checkpoint-3830/tokenizer.json (deflated 76%)
  adding: content/my_awesome_model/checkpoint-3830/tokenizer_config.json (deflated 77%)
  adding: content/my_awesome_model/checkpoint-3830/sentencepiece.bpe.model (deflated 49%)
  adding: content/my_awesome_model/checkpoint-3830/special_tokens_map.json (deflated 52%)
  adding: content/my_awesome_model/checkpoint-3830/optimizer.pt (deflated 70%)
  adding: content/my_awesome_model/checkpoint-3830/scheduler.pt (deflated 56%)
  adding: content/my_awesome_model/checkpoint-3830/training_args.bin (deflated 51%)
  adding: content/my_awesome_model/checkpoint-3830/model.safetensors (deflated 31%)


In [25]:
from google.colab import files

files.download('saved_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Define the paths
source_file = 'saved_model.zip'  # The file you want to download
destination_folder = '/content/drive/My Drive/'  # Destination folder in your Google Drive

# Copy the file to Google Drive
shutil.copy(source_file, destination_folder)


Mounted at /content/drive


'/content/drive/My Drive/saved_model.zip'

In [27]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

events.out.tfevents.1706859239.424a2b673929.1003.0:   0%|          | 0.00/9.21k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/soltaniali/my_awesome_model/commit/3f77cc535dac8b335f7f6630eaf62d1b0cb68164', commit_message='End of training', commit_description='', oid='3f77cc535dac8b335f7f6630eaf62d1b0cb68164', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="soltaniali/my_awesome_model")

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [32]:
classifier("چقدر خوبه بیرون رفتن و بازی کردن خیلی قشنگه")

[{'label': 'HAPPY', 'score': 0.9471352100372314}]

In [38]:
dataset_test_yok

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1151
})

In [None]:
somewhere = classifier(dataset_test_yok['text'])

In [None]:
from sklearn.metrics import precision_recall_fscore_support

preds = somewhere['label']  # Assuming predictions are stored under the key 'predictions'
labels = dataset_test_yok['label']

# Extracting predictions and labels
preds = eval_results.predictions.argmax(-1)
labels = eval_results.label_ids

# Computing precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


In [36]:
eval_results

{'eval_loss': 0.9676249623298645,
 'eval_accuracy': 0.7072111207645526,
 'eval_runtime': 6.0758,
 'eval_samples_per_second': 189.442,
 'eval_steps_per_second': 11.85,
 'epoch': 10.0}