<a href="https://colab.research.google.com/github/shaghimmtz/Persian-Twitter-Sentiment-Analysis/blob/main/code1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [None]:
!pip install transformers datasets torch scikit-learn tqdm




In [None]:
# Input files: the hand-labelled tweets (Excel) and the raw unlabeled tweets (CSV).
LABELED_PATH = "labeled_tweets.xlsx"
UNLABELED_PATH = "unlabeled_tweets.csv"

labeled = pd.read_excel(LABELED_PATH)
unlabeled = pd.read_csv(UNLABELED_PATH)

In [None]:
# Bring both frames to the column names used by the rest of the notebook
# ("text" for the tweet body, "label" for the sentiment class).
# rename() returns a new frame, so re-running this cell is harmless.
labeled = labeled.rename(columns={"cleaned_text_des": "text", "sentiment": "label"})

# In the unlabeled CSV the tweet body lives in "description"; the "title" column
# only carries a short prefix (every previewed row is just "RT"). Renaming
# "title" to "text" would make the model classify the string "RT" for every row,
# so "description" is the column that must become "text".
# NOTE(review): confirm against the full CSV that "title" never holds tweet text.
unlabeled = unlabeled.rename(columns={"description": "text"})

In [None]:
# Show the column names of `unlabeled` to confirm that a "text" column is present.
unlabeled.columns

Index(['text', 'description'], dtype='object')

In [None]:
# Patterns are compiled once because normalize_fa is applied to every row.
_URL_RE = re.compile(r"http\S+")
_MENTION_RE = re.compile(r"@\w+")
_WHITESPACE_RE = re.compile(r"\s+")
# Arabic yeh / kaf -> their Persian counterparts.
_ARABIC_TO_PERSIAN = str.maketrans({"ي": "ی", "ك": "ک"})


def normalize_fa(text):
    """Clean a single tweet for the Persian sentiment model.

    Steps, in order: drop URLs (``http...``), drop ``@mentions``, strip the
    ``#`` sign while keeping the hashtag word, collapse whitespace runs into a
    single space, trim the ends, and replace Arabic yeh/kaf with the Persian
    letters.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``.
            ``None`` and float NaN (pandas' missing-value marker) are treated
            as missing.

    Returns:
        The cleaned string; ``""`` for missing input, so that the literal
        words "None" / "nan" never end up in the data as tweet text.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned = str(text)
    cleaned = _URL_RE.sub("", cleaned)
    cleaned = _MENTION_RE.sub("", cleaned)
    cleaned = cleaned.replace("#", "")
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()
    return cleaned.translate(_ARABIC_TO_PERSIAN)

# Run the same text normalisation over both frames, then preview the first rows.
for frame in (labeled, unlabeled):
    frame["text"] = frame["text"].apply(normalize_fa)

print(labeled.head())
print(unlabeled.head())


                                                text  label
0  در این چهل سال فتح کردیم +سوپرکماندوهای آمریکا...    1.0
1  مردم سلحشور دزفول درد دفاع مقدس با وجود اصابت ...   -1.0
2  ۱۲ خرداد ، روزِ اتحاد و همبستگی یک کشوره! ۱۲ خ...    0.0
3  یعنی شهید"سعید علی نژاد" که جان داد ولی یک وجب...    0.0
4  بهاران در بهاران گشت ایران زخون سربداران آن شه...   -1.0
  text                                        description
0   RT   صداهامان را برای  محمد قبادلو بلند کنیم.نگذار...
1   RT   فتح خرمشهر، آن چنان سترگ بود که دستگاه های تب...
2   RT   نام #ادیا_عارفانی در سایت https://t.co/1HXHPJ...
3   RT   آی آدمها، این مدال‌ها  را که روزی بر ستبر سین...
4   RT   آزادسازی خرمشهر، کاری غیرممکن به نظر می‌رسید ...


In [None]:
# Rows without a label cannot be used for supervised training. The explicit
# copy() makes the result an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
labeled = labeled.dropna(subset=['label']).copy()

# The classifier is created with num_labels=3, so class ids must be 0, 1, 2.
# The raw data uses -1 for one of the classes; it is remapped to 2.
labeled['label'] = labeled['label'].replace(-1, 2)

# Fail early on any value outside the expected set: an out-of-range class id
# would otherwise only surface as an opaque error inside the training loss.
unexpected_labels = set(labeled['label'].unique()) - {0, 1, 2}
if unexpected_labels:
    raise ValueError(f"Unexpected label values: {sorted(unexpected_labels)}")

# Labels are read from Excel as floats (1.0, 0.0, ...); store them as integers.
labeled['label'] = labeled['label'].astype(int)

print(labeled['label'].unique())
print(labeled['label'].value_counts())

[1. 2. 0.]
label
0.0    1420
1.0     357
2.0     220
Name: count, dtype: int64


In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"

# The sequence-classification model is created with three output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import Dataset
import torch

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of texts (list, tuple, pandas Series, ...). It is
            copied into a list so items are always addressed by position,
            even when a Series with a non-contiguous index is passed.
        labels: Optional sequence of class ids, parallel to ``texts``. When
            omitted, items carry no ``'labels'`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If no tokenizer is given, or if ``labels`` and ``texts``
            have different lengths.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        if tokenizer is None:
            # Without this check the failure would only appear on first item access.
            raise ValueError("TweetDataset requires a tokenizer")

        # Materialise as lists: indexing a pandas Series with [] is label-based
        # and breaks once rows have been dropped from the frame.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and,
            when labels were provided, a scalar ``torch.long`` ``'labels'`` tensor.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer output is flattened to 1-D so that batching by a
        # DataLoader stacks examples along a new leading dimension.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        if self.labels is not None:
            # int() accepts float-typed labels such as 1.0.
            item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item


In [None]:
# Build the training set from the labelled frame; plain lists are handed over
# so the dataset indexes by position.
train_texts = labeled['text'].tolist()
train_labels = labeled['label'].tolist()
train_dataset = TweetDataset(train_texts, train_labels, tokenizer)


In [None]:
from transformers import Trainer, TrainingArguments
import os


# Disable Weights & Biases via the environment (report_to="none" is also set below).
os.environ["WANDB_DISABLED"] = "true"

# Hyperparameters and output locations for fine-tuning, collected in one place.
training_config = dict(
    output_dir="./output",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# No evaluation dataset is passed: the Trainer only trains on train_dataset.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)


In [None]:
# Start fine-tuning with the Trainer built from `model`, `training_args`
# and `train_dataset`.
trainer.train()




Step,Training Loss


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm
import numpy as np

# Number of tweets scored per forward pass.
INFERENCE_BATCH_SIZE = 32

model.eval()
# After training, the Trainer may have moved the model to an accelerator.
# Inputs must live on the same device as the weights, otherwise the forward
# pass raises a device-mismatch error.
device = next(model.parameters()).device

texts_to_score = unlabeled['text'].tolist()
pred_labels = []
confidences = []

for start in tqdm(range(0, len(texts_to_score), INFERENCE_BATCH_SIZE)):
    batch_texts = texts_to_score[start:start + INFERENCE_BATCH_SIZE]
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # .cpu() is required before .numpy() when the model runs on an accelerator.
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
    # Predicted class = index of the highest probability; confidence = that probability.
    pred_labels.extend(probs.argmax(axis=1).tolist())
    confidences.extend(probs.max(axis=1).tolist())

# Predicted ids are in the model's label space (0, 1, 2), i.e. after the
# -1 -> 2 remapping applied to the training labels.
unlabeled['pred_label'] = pred_labels
unlabeled['confidence'] = confidences


In [None]:
# Keep only rows whose top-class probability is above 0.85, report how many
# survived, and write them to disk without the index column.
is_confident = unlabeled["confidence"] > 0.85
confident_data = unlabeled[is_confident]
print(f"نمونه‌های مطمئن مدل: {len(confident_data)} از {len(unlabeled)}")

confident_data.to_csv("auto_labeled_tweets.csv", index=False)
