In [1]:
import ollama
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

import os
import warnings
from utils import cv_trainer
from base import persian_text_preprocessing
warnings.filterwarnings('ignore')

In [2]:
# Load the sentiment dataset and run every comment through the project's
# Persian preprocessing helper before any features are derived from it.
ds = load_dataset("hezarai/sentiment-dksf")
ds = ds.map(
    lambda example: {**example, 'text': persian_text_preprocessing(example['text'])},
    batched=False,
)
train_df: pd.DataFrame = ds["train"].to_pandas() # type: ignore
test_df: pd.DataFrame = ds["test"].to_pandas() # type: ignore

# Train rows first, then test rows: later cells recover the split by slicing at
# len(train_df). ignore_index=True gives the combined frame one unique 0..n-1
# index instead of repeating each split's own row labels.
df = pd.concat([train_df, test_df], ignore_index=True)

Using the latest cached version of the dataset since hezarai/sentiment-dksf couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/tobysmith/.cache/huggingface/datasets/hezarai___sentiment-dksf/default/0.0.0/b4d5a8dd501db610b5ad89e9aa13f863b842b395 (last modified on Sun Jun 29 12:59:27 2025).


Map:   0%|          | 0/28602 [00:00<?, ? examples/s]

Map:   0%|          | 0/2315 [00:00<?, ? examples/s]

# Dense Embeddings

In [3]:
def create_embeddings_batch(texts, model="nomic-embed-text", batch_size=300):
    """Embed ``texts`` with an Ollama embedding model, one request per batch.

    Args:
        texts: The strings to embed. Any iterable is accepted (list, tuple,
            pandas Series, NumPy array, ...); a bare string is treated as a
            single text.
        model: Model name passed to ``ollama.embed``.
        batch_size: Maximum number of texts sent per ``ollama.embed`` call;
            must be at least 1.

    Returns:
        ``np.ndarray`` built from the concatenated ``response["embeddings"]``
        lists, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if isinstance(texts, str):
        # Iterating a string would embed it character by character.
        texts_list = [texts]
    elif isinstance(texts, list):
        texts_list = texts
    else:
        # list() works for every iterable, so callers are not limited to
        # objects that expose a pandas-style ``to_list`` method.
        texts_list = list(texts)

    all_embeddings = []
    for start in tqdm(range(0, len(texts_list), batch_size), desc="Creating embeddings"):
        batch = texts_list[start:start + batch_size]
        response = ollama.embed(model=model, input=batch)
        all_embeddings.extend(response["embeddings"])

    return np.array(all_embeddings)

embeddings_file = 'comments_embeddings.npy'

# Reuse the cached matrix only when it still has one row per comment in df.
# Later cells pair embeddings with labels purely by position, so a cache left
# over from a different dataset size would be silently misaligned.
comments_dense_embeddings = None
if os.path.exists(embeddings_file):
    print(f"Loading existing embeddings from {embeddings_file}")
    comments_dense_embeddings = np.load(embeddings_file)
    if len(comments_dense_embeddings) != len(df):
        print(
            f"Cached embeddings have {len(comments_dense_embeddings)} rows but the "
            f"dataset has {len(df)}; recomputing."
        )
        comments_dense_embeddings = None

if comments_dense_embeddings is None:
    print("Creating new embeddings...")
    comments_dense_embeddings = create_embeddings_batch(df["text"])

    np.save(embeddings_file, comments_dense_embeddings)
    print(f"Embeddings saved to {embeddings_file}")

print(f"Embeddings shape: {comments_dense_embeddings.shape}")

Loading existing embeddings from comments_embeddings.npy
Embeddings shape: (30917, 768)


In [4]:
# Features are the embedding matrix; labels come from the combined frame.
X_dense = comments_dense_embeddings
y_dense = df["label"].to_numpy()

# df was built as train rows followed by test rows, so a single boundary at
# len(train_df) recovers the original split.
train_size = len(train_df)
X_train_dense, X_test_dense = X_dense[:train_size], X_dense[train_size:]
y_train_dense, y_test_dense = y_dense[:train_size], y_dense[train_size:]

In [5]:
# Search spaces for the dense-embedding classifiers: each entry maps a model
# name to (pipeline, param_grid). Grid keys named after a pipeline step
# ('scaler', 'pca') replace the whole step; None skips that step.

# Settings shared by both logistic-regression sub-grids below.
_logreg_shared_grid = {
    'scaler': [None, StandardScaler()],
    'pca': [None, PCA(50), PCA(100), PCA(200)],
    'classifier__C': [0.1, 1.0, 10.0],
}

models_dense = {
    'logistic_regression': (
        Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classifier', LogisticRegression(random_state=42, max_iter=1000))
        ]),
        # lbfgs does not support the l1 penalty, so one fully crossed grid
        # spends fits on solver/penalty pairs that can only error out. Two
        # sub-grids keep exactly the valid combinations.
        # NOTE(review): assumes cv_trainer forwards this grid to a scikit-learn
        # grid search, which accepts a list of dicts — confirm.
        [
            {
                **_logreg_shared_grid,
                'classifier__solver': ['liblinear'],
                'classifier__penalty': ['l1', 'l2'],
            },
            {
                **_logreg_shared_grid,
                'classifier__solver': ['lbfgs'],
                'classifier__penalty': ['l2'],
            },
        ]
    ),

    'gaussianNB': (
        Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classifier', GaussianNB())
        ]),
        {
            'scaler': [None, StandardScaler(), MinMaxScaler()],
            'pca': [None, PCA(50), PCA(100), PCA(200)],
            'classifier__var_smoothing': [1e-5, 1e-7, 1e-9]
        }
    ),

    'random_forest': (
        Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        {
            'scaler': [None, StandardScaler()],
            'pca': [None, PCA(100), PCA(200)],
            'classifier__n_estimators': [25, 75],
            'classifier__max_depth': [7, 12],
            'classifier__min_samples_split': [2, 5],
        }
    ),

    'mlp': (
        Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classifier', MLPClassifier(random_state=42, max_iter=500))
        ]),
        # No 'scaler' key here: the MLP always keeps the pipeline's StandardScaler.
        {
            'pca': [PCA(100), PCA(200)],
            'classifier__hidden_layer_sizes': [(100,), (100, 50), (200, 100)],
            'classifier__alpha': [0.0001, 0.001, 0.01],
            'classifier__learning_rate': ['constant', 'adaptive']
        }
    )
}

In [7]:
# Run cv_trainer on the dense features once and keep its result on disk;
# when the pickle already exists it is loaded instead of retraining.
dense_results_output_path = 'dense_results.pkl'
if os.path.exists(dense_results_output_path):
    dense_results = joblib.load(dense_results_output_path)
    print("Loaded dense results from file")
else:
    print("Training dense embedding models...")
    dense_results = cv_trainer(
        X_train_dense,
        X_test_dense,
        y_train_dense,
        y_test_dense,
        cv=3,
        models=models_dense,  # type: ignore
    )
    joblib.dump(dense_results, dense_results_output_path)

Loaded dense results from file


In [8]:
# Use the winner recorded in the results summary instead of a hard-coded model
# name, so the printed name, its parameters and the evaluated estimator always
# refer to the same model. dense_results is already set by the previous cell,
# so it is not reloaded from disk here.
best_dense_model_name = dense_results['summary']['best_model_name']
best_dense_model = dense_results[best_dense_model_name]['best_estimator']
print(f"best model: {best_dense_model_name} params: {dense_results[best_dense_model_name]['best_params']}")
y_pred_dense = best_dense_model.predict(X_test_dense)
print(classification_report(y_test_dense, y_pred_dense))
print(confusion_matrix(y_test_dense, y_pred_dense))

best model: random_forest params: {'classifier__max_depth': 12, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 75, 'pca': None, 'scaler': None}
              precision    recall  f1-score   support

           0       0.74      0.82      0.78      1107
           1       0.83      0.64      0.72      1032
           2       0.50      0.86      0.64       176

    accuracy                           0.74      2315
   macro avg       0.69      0.77      0.71      2315
weighted avg       0.76      0.74      0.74      2315

[[903 130  74]
 [300 656  76]
 [ 18   6 152]]


In [10]:
# Score the selected dense model on the labelled scraped Digikala comments.
# NOTE(review): unlike the training text, these comments are not passed through
# persian_text_preprocessing in this cell — confirm the CSV already stores
# cleaned text.
digikala_scraped_comments_df = pd.read_csv(
    'incredible_offers_product_comments_finalized_labels.csv',
    index_col=None,
)
digikala_scraped_comments_embeddings = create_embeddings_batch(
    digikala_scraped_comments_df['text']
)
print("Test model performance on scraped digikala comments: ")
y_pred_digikala_dense = best_dense_model.predict(digikala_scraped_comments_embeddings)

# Ground-truth labels are extracted once and shared by both reports.
digikala_true_labels = digikala_scraped_comments_df['label'].to_list()
print(classification_report(digikala_true_labels, y_pred_digikala_dense))
print(confusion_matrix(digikala_true_labels, y_pred_digikala_dense))

Creating embeddings: 100%|██████████| 9/9 [00:56<00:00,  6.29s/it]

Test model performance on scraped digikala comments: 
              precision    recall  f1-score   support

           0       0.36      0.63      0.45       507
           1       0.89      0.59      0.71      1834
           2       0.08      0.19      0.11       154

    accuracy                           0.57      2495
   macro avg       0.44      0.47      0.43      2495
weighted avg       0.73      0.57      0.62      2495

[[ 320   92   95]
 [ 504 1082  248]
 [  76   48   30]]





# Discrete Embeddings

In [11]:
# Raw text is the feature here; labels and the positional train/test boundary
# are the same ones used for the dense features.
train_size = len(train_df)
X_discrete = df["text"].to_numpy()
y_discrete = df["label"].to_numpy()
X_train_discrete, X_test_discrete = X_discrete[:train_size], X_discrete[train_size:]
y_train_discrete, y_test_discrete = y_discrete[:train_size], y_discrete[train_size:]

In [12]:
# Persian stop words offered to the vectorizers as an optional
# `stop_words` setting. Each word appears exactly once.
persian_stop_words = [
    # Articles and determiners
    'این', 'آن', 'یک', 'هر', 'همه', 'تمام', 'کل', 'چند', 'بعض', 'برخی',
    
    # Pronouns
    'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'خود', 'خودم', 'خودت', 'خودش',
    
    # Prepositions (neutral ones)
    'در', 'به', 'از', 'با', 'تا', 'روی', 'زیر', 'کنار', 'داخل', 'خارج',
    
    # Simple conjunctions (not contrastive)
    'و', 'یا', 'که', 'چون', 'وقتی', 'زمانی', 'هنگامی',
    
    # Neutral verbs
    'می', 'شود', 'کند', 'دهد', 'است', 'باشد', 'بود', 'شد', 'کرد', 'داشت', 'دارد', 'خواهد', 'باید', 'می‌شود',
    
    # Object markers and particles
    'را', 'رو', 'های', 'ها', 'ان', 'ات', 'تان', 'شان',
    
    # Time references (neutral)
    'امروز', 'دیروز', 'فردا', 'حالا', 'الان', 'وقت', 'زمان',
    
    # Place references
    'اینجا', 'آنجا', 'کجا', 'جا', 'محل', 'مکان',
    
    # Common neutral words
    'چیز', 'کار', 'راه', 'نوع', 'قسم', 'طور', 'مثل', 'مانند', 'نام', 'اسم'
]

# Search spaces for the raw-text classifiers: each entry maps a model name to
# (pipeline, param_grid). Every pipeline starts with a vectorizer, so the
# models are fitted directly on the text column.
models_discrete = {
    'logistic_tfidf': (
        Pipeline([
            ('vectorizer', TfidfVectorizer()),
            # The vectorizer emits a sparse matrix, which StandardScaler can
            # only scale without centering; with_mean=False keeps the default
            # pipeline fittable even when the grid does not override this step.
            ('scaler', StandardScaler(with_mean=False)),
            # NOTE(review): PCA accepts sparse input only in recent
            # scikit-learn releases and with certain solvers — confirm against
            # the installed version, or consider TruncatedSVD for this step.
            ('pca', PCA()),
            ('logisticClassifier', LogisticRegression())
        ]),
        {
            'vectorizer__stop_words': [None, persian_stop_words],
            'vectorizer__max_features': [500, 2000, None],
            'scaler': [None, StandardScaler(with_mean=False)],
            'pca': [None, PCA(50), PCA(100)],
            'logisticClassifier__C': [0.01, 0.1, 1, 10],
            'logisticClassifier__penalty': ['l2'],
            'logisticClassifier__max_iter': [1000],
        }
    ),

    'naive_bayes_count': (
        Pipeline([
            ('vectorizer', CountVectorizer()),
            ('classifier', MultinomialNB())
        ]),
        {
            'vectorizer__stop_words': [None, persian_stop_words],
            'vectorizer__max_features': [500, 1000, 2000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__min_df': [1, 2],
            'classifier__alpha': [0.1, 0.5, 1.0, 2.0]
        }
    ),

    'random_forest_tfidf': (
        Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        {
            'vectorizer__stop_words': [None, persian_stop_words],
            'vectorizer__max_features': [1000, 2000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
        }
    ),
}

In [13]:
# Run cv_trainer on the raw-text pipelines once and keep its result on disk;
# when the pickle already exists it is loaded instead of retraining.
discrete_results_output_path = 'discrete_results.pkl'
if os.path.exists(discrete_results_output_path):
    discrete_results = joblib.load(discrete_results_output_path)
    print("Loaded discrete results from file")
else:
    print("Training discrete embedding models...")
    discrete_results = cv_trainer(
        X_train_discrete,
        X_test_discrete,
        y_train_discrete,
        y_test_discrete,
        cv=3,
        models=models_discrete,  # type: ignore
    )
    joblib.dump(discrete_results, discrete_results_output_path)

Loaded discrete results from file


In [14]:
# Look up the winner once and use that same key for both the estimator and the
# printed parameters, so the report stays correct whichever model wins.
best_model_name = discrete_results['summary']['best_model_name']
best_discrete_model = discrete_results[best_model_name]['best_estimator']
print(f"best model: {best_model_name} params: {discrete_results[best_model_name]['best_params']}")
y_pred_discrete = best_discrete_model.predict(X_test_discrete)
print(classification_report(y_test_discrete, y_pred_discrete))
print(confusion_matrix(y_test_discrete, y_pred_discrete))

best model: random_forest_tfidf params: {'classifier__max_depth': None, 'classifier__n_estimators': 200, 'vectorizer__max_features': 2000, 'vectorizer__ngram_range': (1, 2), 'vectorizer__stop_words': None}
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      1107
           1       0.93      0.79      0.86      1032
           2       0.51      0.98      0.68       176

    accuracy                           0.85      2315
   macro avg       0.78      0.88      0.80      2315
weighted avg       0.88      0.85      0.85      2315

[[975  57  75]
 [128 816  88]
 [  2   1 173]]


In [15]:
# Rank vocabulary terms by the fitted estimator's feature_importances_.
# Not every candidate pipeline ends in an estimator that exposes that
# attribute, and the scores only line up with the vectorizer vocabulary when
# no step in between changes the number of features, so both are checked
# instead of assuming the random-forest pipeline won.
vectorizer_step = best_discrete_model.named_steps['vectorizer']
final_estimator = best_discrete_model.steps[-1][1]
feature_names = vectorizer_step.get_feature_names_out()
feature_importances = getattr(final_estimator, 'feature_importances_', None)

if feature_importances is not None and len(feature_importances) == len(feature_names):
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importances
    }).sort_values('importance', ascending=False)

    print(feature_importance_df.head(10))
else:
    print(f"Feature importances are not available for {type(final_estimator).__name__}")

       feature  importance
1151      عالی    0.026685
115       اصلا    0.019080
428        بود    0.018924
1154     عالیه    0.013610
685        خوب    0.010724
724       خیلی    0.009948
1681       ولی    0.009054
721     خوشمزه    0.007562
689       خوبه    0.007491
1152  عالی بود    0.007170


In [16]:
# Score the selected raw-text model on the labelled scraped Digikala comments.
# NOTE(review): the text is not passed through persian_text_preprocessing in
# this cell — confirm the CSV already stores cleaned text.
digikala_scraped_comments_df = pd.read_csv('incredible_offers_product_comments_finalized_labels.csv', index_col=None)
print("Test model performance on scraped digikala comments: ")
# Stored under a *_discrete name so the dense model's predictions
# (y_pred_digikala_dense) are not overwritten by this cell.
y_pred_digikala_discrete = best_discrete_model.predict(digikala_scraped_comments_df['text'])
digikala_true_labels = digikala_scraped_comments_df['label'].to_list()
print(classification_report(digikala_true_labels, y_pred_digikala_discrete))
print(confusion_matrix(digikala_true_labels, y_pred_digikala_discrete))

Test model performance on scraped digikala comments: 
              precision    recall  f1-score   support

           0       0.63      0.60      0.61       507
           1       0.94      0.75      0.83      1834
           2       0.15      0.56      0.24       154

    accuracy                           0.70      2495
   macro avg       0.58      0.63      0.56      2495
weighted avg       0.83      0.70      0.75      2495

[[ 304   57  146]
 [ 137 1367  330]
 [  42   26   86]]
