In [None]:
# Install the notebook's dependencies quietly (-q); no versions are pinned in this cell.
!pip install transformers torch pandas numpy tqdm hazm emoji -q
# -U upgrades accelerate to the newest release pip can find.
!pip install accelerate -U -q

import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import logging
import numpy as np

# Reached only when every import above succeeded.
print("all the libraries are installed")

In [None]:
# Uninstall numpy, transformers and torch without a confirmation prompt (-y).
# NOTE(review): presumably this clears the unpinned builds before a pinned reinstall; modules
# already imported in this kernel stay loaded until the runtime is restarted.
!pip uninstall -y numpy
!pip uninstall -y transformers torch


In [None]:
# Reinstall the core stack at exact versions; pandas, tqdm, hazm and emoji stay unpinned.
!pip install numpy==2.1.1
!pip install torch==2.4.1
!pip install transformers==4.45.2
!pip install accelerate==1.1.1
!pip install pandas tqdm hazm emoji


In [None]:
# Remove torch together with torchvision and torchaudio without a confirmation prompt (-y).
!pip uninstall -y torch torchvision torchaudio


In [None]:
# Install pinned torch/torchvision/torchaudio from the PyTorch wheel index at the cu121 URL.
# NOTE(review): presumably these are the CUDA 12.1 builds - confirm they match the runtime's driver.
!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121


In [None]:
# Install pinned transformers/accelerate/numpy plus the unpinned helper packages in one pip call.
!pip install transformers==4.45.2 accelerate==1.1.1 numpy==2.1.1 pandas tqdm hazm emoji


In [None]:

# Quiet (-q), unpinned install of the packages this cell imports.
!pip install transformers torch pandas tqdm -q

import pandas as pd
import torch
from transformers import pipeline
from tqdm import tqdm
import logging


# Only let transformers log records of level ERROR or higher through.
logging.getLogger("transformers").setLevel(logging.ERROR)
# Register tqdm's pandas integration.
# NOTE(review): nothing in the visible cells uses the progress_* methods this enables - confirm it is needed.
tqdm.pandas()


class PersianTweetLabeler:
    """Sentiment labeler for Persian tweets backed by a Hugging Face text-classification pipeline.

    Attributes:
        model_name: Hub identifier of the checkpoint that was loaded.
        device: ``0`` (first CUDA device) when CUDA is available, otherwise ``-1`` (CPU).
        classifier: The ``text-classification`` pipeline used for prediction.
    """

    # Column order of the frame returned by ``label_tweets_batch`` (also used for empty input).
    RESULT_COLUMNS = ['text', 'label', 'confidence', 'batch_id']

    def __init__(self, model_name="dadashzadeh/roberta-sentiment-persian"):
        """Load the tokenizer and model for ``model_name`` into a classification pipeline.

        Args:
            model_name: Hub identifier (or local path) of a sequence-classification model.
        """
        self.model_name = model_name
        self.device = 0 if torch.cuda.is_available() else -1
        print(f"🔄 در حال بارگذاری مدل: {model_name}")

        # truncation/max_length are handed to the pipeline so long inputs are cut at 256 tokens.
        self.classifier = pipeline(
            "text-classification",
            model=model_name,
            tokenizer=model_name,
            device=self.device,
            truncation=True,
            max_length=256
        )
        print("✅ مدل با موفقیت بارگذاری شد!\n")

    @staticmethod
    def _make_record(tweet, label, score, batch_id):
        """Build one result row; the confidence is rounded to four decimals."""
        return {
            'text': tweet,
            'label': label,
            'confidence': round(score, 4),
            'batch_id': batch_id
        }

    def _label_single(self, tweet, batch_id):
        """Classify one tweet on its own, returning an ``ERROR`` row if prediction fails."""
        try:
            pred = self.classifier(tweet)[0]
            return self._make_record(tweet, pred['label'], pred['score'], batch_id)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop a long run.
            return self._make_record(tweet, 'ERROR', 0.0, batch_id)

    def label_tweets_batch(self, tweets, batch_size=32):
        """Label tweets batch by batch, falling back to per-tweet prediction on errors.

        Args:
            tweets: Iterable of tweet strings (list, tuple, pandas Series, generator, ...).
            batch_size: Number of tweets sent to the classifier per call; must be >= 1.

        Returns:
            pandas.DataFrame with the columns ``text``, ``label``, ``confidence`` and
            ``batch_id`` - one row per input tweet, in input order. Tweets that cannot be
            classified even individually get label ``'ERROR'`` and confidence ``0.0``.
            Empty input yields an empty frame that still carries these columns.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Materialise once so any iterable can be sliced positionally.
        tweets = list(tweets)
        results = []

        for start in tqdm(range(0, len(tweets), batch_size), desc="📊 لیبل‌زنی"):
            batch = tweets[start:start + batch_size]
            batch_id = start // batch_size

            try:
                predictions = self.classifier(batch)
                # Build the whole batch before committing it: if a prediction is malformed,
                # nothing from this batch has been stored yet, so the fallback below cannot
                # produce duplicate rows.
                batch_records = [
                    self._make_record(tweet, pred['label'], pred['score'], batch_id)
                    for tweet, pred in zip(batch, predictions)
                ]
            except Exception as e:
                # Report the same batch number that is stored in the 'batch_id' column.
                print(f"⚠️ خطا در دسته {batch_id}: {e}")
                batch_records = [self._label_single(tweet, batch_id) for tweet in batch]

            results.extend(batch_records)

        return pd.DataFrame(results, columns=self.RESULT_COLUMNS)


# Smoke test: load the labeler with its default checkpoint and classify three sample tweets.
labeler = PersianTweetLabeler()

sample_texts = [
    "امروز خیلی خوشحالم 😊",
    "از این وضعیت خسته شدم 😞",
    "این محصول معمولی بود، نه خوب نه بد.",
]

# batch_size=2 splits the three samples into two batches.
df = labeler.label_tweets_batch(tweets=sample_texts, batch_size=2)
print(df)


In [None]:


import pandas as pd
import numpy as np
import re
from google.colab import files
import io

# Progress banner printed before the preprocessor class is defined.
print("Setting up Persian dataset preprocessor...")

class PersianDatasetPreprocessor:
    """Upload, inspect and clean a Persian text dataset inside Google Colab.

    Attributes:
        processed_data: DataFrame produced by the most recent ``preprocess`` call,
            or ``None`` while nothing has been processed yet.
    """

    def __init__(self):
        """Start with no processed data."""
        self.processed_data = None
        print("Preprocessor ready!")

    def upload_dataset(self):
        """Ask for a file through the Colab upload widget and parse it into a DataFrame.

        The extension is matched case-insensitively: ``.xlsx``/``.xls`` are read as Excel,
        ``.json`` as JSON, and everything else (including ``.csv``) as CSV. Only the first
        uploaded file is used.

        Returns:
            The loaded DataFrame, or ``None`` when nothing was uploaded or parsing failed.
        """
        print("Please upload your dataset file (CSV, Excel, JSON)...")
        uploaded = files.upload()

        if not uploaded:
            print("No file uploaded!")
            return None

        file_name = next(iter(uploaded))
        print(f"File '{file_name}' uploaded successfully")

        # Lower-case once so names such as DATA.XLSX take the right reader.
        normalized_name = file_name.lower()
        payload = io.BytesIO(uploaded[file_name])

        try:
            if normalized_name.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(payload)
            elif normalized_name.endswith('.json'):
                df = pd.read_json(payload)
            else:
                # .csv and any unrecognised extension are parsed as CSV.
                df = pd.read_csv(payload)

            print(f"Dataset loaded: {len(df)} rows, {len(df.columns)} columns")
            return df

        except Exception as e:
            print(f"Error reading file: {e}")
            return None

    def clean_persian_text(self, text):
        """Normalise one raw text value.

        Removes ``@mentions`` and ``http...`` links, keeps hashtag words but drops the ``#``,
        replaces zero-width non-joiners (U+200C) with a space, collapses whitespace runs and
        strips the ends.

        Args:
            text: Any scalar value; missing values (``None``/NaN) become ``""``.

        Returns:
            The cleaned string.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        text = re.sub(r'@\w+', '', text)      # @mentions
        text = re.sub(r'http\S+', '', text)   # links
        text = re.sub(r'#(\w+)', r'\1', text)  # "#tag" -> "tag"

        text = re.sub(r'[‌\u200c]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def analyze_dataset(self, df):
        """Print the frame's dimensions, its column names and per-column null count / first value."""
        print("\nDataset Analysis:")
        print(f"   Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
        print(f"   Columns: {list(df.columns)}")

        print("\nColumn Information:")
        for col in df.columns:
            null_count = df[col].isnull().sum()
            sample_value = df[col].iloc[0] if len(df) > 0 else "N/A"
            print(f"   ├─ {col}: {null_count} null values | Sample: {str(sample_value)[:50]}...")

    def find_text_column(self, df):
        """Guess which column holds the text.

        Search order: the first column whose name contains a known keyword (English or
        Persian), then the first object/string-typed column, then simply the first column.

        Args:
            df: DataFrame to inspect; column labels may be non-strings (e.g. integers).

        Returns:
            The label of the chosen column.
        """
        text_keywords = ['text', 'tweet', 'comment', 'review', 'نظر', 'متن', 'توئیت', 'کامنت']

        for col in df.columns:
            # str() so integer labels from header-less files do not raise AttributeError.
            col_lower = str(col).lower()
            if any(keyword in col_lower for keyword in text_keywords):
                return col

        for col in df.columns:
            dtype = df[col].dtype
            # Accept the classic object dtype as well as pandas' dedicated string dtypes.
            if dtype == 'object' or pd.api.types.is_string_dtype(dtype):
                return col

        return df.columns[0]

    def preprocess(self, df, text_column=None, min_length=10):
        """Clean ``df`` and store the result in ``self.processed_data``.

        Steps: drop rows that are entirely empty, add a ``cleaned_text`` column, drop rows
        whose cleaned text is not longer than ``min_length`` characters, then drop duplicate
        cleaned texts. The input frame is not modified.

        Args:
            df: Raw DataFrame.
            text_column: Column holding the text; auto-detected when ``None``.
            min_length: Rows are kept only when ``len(cleaned_text) > min_length``.

        Returns:
            The processed DataFrame (original columns plus ``cleaned_text``).

        Raises:
            KeyError: If ``text_column`` is given but is not a column of ``df``.
        """
        print("Starting preprocessing...")

        if text_column is None:
            text_column = self.find_text_column(df)
            print(f"   Text column identified: '{text_column}'")
        elif text_column not in df.columns:
            raise KeyError(
                f"Column '{text_column}' not found; available columns: {list(df.columns)}"
            )

        processed_df = df.copy()

        initial_count = len(processed_df)
        processed_df = processed_df.dropna(how='all')
        print(f"   Removed completely empty rows: {initial_count - len(processed_df)} rows")

        print(f"   Cleaning column '{text_column}'...")
        processed_df['cleaned_text'] = processed_df[text_column].apply(self.clean_persian_text)

        initial_text_count = len(processed_df)
        processed_df = processed_df[processed_df['cleaned_text'].str.len() > min_length]
        short_text_removed = initial_text_count - len(processed_df)
        print(f"   Removed short texts: {short_text_removed} rows")

        initial_unique = len(processed_df)
        processed_df = processed_df.drop_duplicates(subset=['cleaned_text'])
        duplicates_removed = initial_unique - len(processed_df)
        print(f"   Removed duplicates: {duplicates_removed} rows")

        print("\nPreprocessing completed!")
        print(f"   Before: {len(df)} rows")
        print(f"   After: {len(processed_df)} rows")
        print(f"   Removed: {len(df) - len(processed_df)} rows")

        self.processed_data = processed_df
        return processed_df

    def show_samples(self, n=5):
        """Print the first ``n`` cleaned texts, each cut to 80 characters."""
        if self.processed_data is None:
            print("No data to display!")
            return

        print(f"\n{n} samples of processed data:")
        for position, text in enumerate(self.processed_data['cleaned_text'].head(n), start=1):
            print(f"   {position}. {text[:80]}...")

# Driver: upload a dataset, report on it, clean it and preview the result.
preprocessor = PersianDatasetPreprocessor()
df = preprocessor.upload_dataset()

if df is None:
    # upload_dataset() returns None when no file was uploaded or it could not be parsed.
    print("Operation stopped!")
else:
    preprocessor.analyze_dataset(df)
    processed_df = preprocessor.preprocess(df)
    preprocessor.show_samples(5)
    print("\nDataset is ready! You can use processed_df.")

In [None]:

# Export the cleaned texts: keep them in memory as a list, write them to CSV and download the file.
from google.colab import files

if preprocessor.processed_data is None:
    # processed_data stays None until preprocess() has run (e.g. the upload was cancelled or
    # failed); report that instead of failing with a TypeError on the subscript below.
    print("No processed data available - upload and preprocess a dataset first!")
else:
    cleaned_tweets = preprocessor.processed_data['cleaned_text'].tolist()

    print(f"Number of cleaned tweets: {len(cleaned_tweets)}")

    # utf-8-sig prefixes the file with a byte-order mark.
    # NOTE(review): presumably chosen so spreadsheet tools detect the encoding of the Persian text.
    preprocessor.processed_data.to_csv('cleaned_dataset.csv', index=False, encoding='utf-8-sig')

    files.download('cleaned_dataset.csv')

In [None]:
from transformers import pipeline
import re

# Few-shot labeling experiment: prompt a text-generation pipeline for a class id (0/1/2) per tweet.
# NOTE(review): the checkpoint name suggests a BERT encoder, which is normally not trained for
# left-to-right generation - confirm it is a sensible choice for the "text-generation" task.
model_name = "HooshvareLab/bert-fa-base-uncased"
classifier = pipeline("text-generation", model=model_name, tokenizer=model_name)

# The prompt itself contains the digits 0, 1 and 2 (class legend and examples).
prompt_template = '''وظیفه: توییت‌های فارسی را به ۳ کلاس برچسب بزن: 0=منفی،1=خنثی،2=مثبت.
مثال‌ها:
"این محصول افتضاحه، پولمو دور ریختم" => 0
"امروز هم مثل دیروز بود، چیزی خاص نبود" => 1
"آفرین! چه سرویس سریعی داشتند 😊" => 2

متن: "{}"
پاسخ فقط عدد کلاس.'''


tweets = [
    "خیلی از این محصول راضی بودم 😍",
    "واقعا خسته شدم از این وضعیت",
    "امروز معمولی بود، نه بد بود نه خوب"
]

pred_labels = []

for tweet in tweets:
    prompt = prompt_template.format(tweet)
    # return_full_text=False returns only the newly generated continuation. Without it the
    # output starts with the prompt, whose "0=" legend made the regex below match "0" for
    # every tweet regardless of what the model produced.
    output = classifier(prompt, max_new_tokens=5, return_full_text=False)[0]['generated_text']

    # First stand-alone class digit in the continuation; None when the model produced none.
    match = re.search(r"\b([012])\b", output)
    label = int(match.group(1)) if match else None
    pred_labels.append(label)

for t, l in zip(tweets, pred_labels):
    print(f"{t} => {l}")
