In [None]:
# نصب کتابخانه‌های لازم
!pip install transformers torch --quiet

# ایمپورت‌ها
from transformers import AutoTokenizer, AutoModel

# نام مدل DistilBERT
model_name = "distilbert-base-uncased"

# دانلود و بارگذاری توکنایزر و مدل
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("✅ DistilBERT model and tokenizer loaded successfully!")


✅ DistilBERT model and tokenizer loaded successfully!


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = f"{path}/IMDB Dataset.csv"

df = pd.read_csv(csv_path)

df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
import torch
from tqdm import tqdm
import numpy as np

def get_features(texts, tokenizer, model, batch_size=16, max_length=256):
    """
    استخراج embedding از یک مدل ترنسفورمر با استفاده از GPU (در صورت وجود).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
            batch_texts = list(texts[i:i + batch_size])

            # توکنایز کردن و ارسال مستقیم به GPU
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            encoded = {k: v.to(device) for k, v in encoded.items()}  # ✅ تمام تنسورها به GPU

            outputs = model(**encoded)

            # گرفتن میانگین embeddingها از آخرین لایه
            last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
            embeddings = last_hidden_state.mean(dim=1).cpu().numpy()  # ✅ برگردوندن به CPU برای ذخیره
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)


برای گرفتن نتیجه بهتر تگ های html رو حذف میکنیم

In [None]:
from bs4 import BeautifulSoup
import pandas as pd

df['review'] = df['review'].apply(lambda x: BeautifulSoup(str(x), 'html.parser').get_text())

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
embedding = get_features(df['review'].values, tokenizer, model)
df['embedding'] = embedding.tolist()
save_path = "/content/drive/MyDrive/imdb_with_embeddings.parquet"
df.to_parquet(save_path, index=False)


Extracting embeddings:   2%|▏         | 70/3125 [00:08<05:55,  8.59it/s]


KeyboardInterrupt: 

In [None]:
df_loaded = pd.read_parquet(save_path)

NameError: name 'save_path' is not defined

In [None]:
df_loaded.head()

Unnamed: 0,review,sentiment,label,embedding
0,One of the other reviewers has mentioned that ...,positive,1,"[0.005336681380867958, -0.02273288369178772, 0..."
1,A wonderful little production. The filming tec...,positive,1,"[-0.09763307124376297, 0.1235554963350296, 0.2..."
2,I thought this was a wonderful way to spend ti...,positive,1,"[-0.1955944150686264, -0.10805610567331314, 0...."
3,Basically there's a family where a little boy ...,negative,0,"[0.1273573935031891, 0.06243567913770676, 0.32..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,"[-0.206196129322052, 0.069491446018219, 0.1950..."


In [None]:
X = np.vstack(df_loaded['embedding'].values)
y = df_loaded['label'].values
indices = np.arange(len(df_loaded))

X_train, X_test, Y_train, Y_test, idx_train, idx_test = train_test_split(
    X, y, indices,
    test_size=0.2,
    random_state=42,
    stratify=y
)

دقت مدل روی دیتای آموزش را هم چاپ میکنیم تا ببینیم overfit رخ داده یا نه.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, Y_train)

y_test_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)
print("Train_Accuracy", accuracy_score(Y_train, y_test_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))


Train_Accuracy 0.874875
Accuracy: 0.8702


چک میکنیم که داده های مثبت و منفی بالانس باشند.

In [None]:
df['label'].value_counts(normalize=True).plot(kind='bar')

KeyError: 'label'

داده ها بالانس هستند. یک سری نمونه از داده های اشتباه برچسب زده شده میگیریم تا ببینیم میتونیم ایده ای راجب اشتباهات مدل بگیریم یا نه.

In [None]:
df_errors = pd.DataFrame({
    'review': df_loaded.loc[idx_test, 'review'].values,
    'y_true': Y_test,
    'y_pred': y_pred
})

df_errors = df_errors[df_errors['y_true'] != df_errors['y_pred']]

df_errors.head()

Unnamed: 0,review,y_true,y_pred
0,"Yes, MTV there really is a way to market Daria...",0,1
10,Although Humphrey Bogart got star billing in K...,0,1
14,Little Quentin seems to have mastered the art ...,0,1
16,Just watched this movie on DVD and thought the...,1,0
24,Not on the same level as Ring (or Ring 2) but ...,1,0


In [None]:
df

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


با حذف نشانه های html یک درصد نتیجه بهتری گرفتیم.

In [None]:
# ==============================
# 📦 Import libraries
# ==============================
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# ==============================
# ⚙️ Feature Extraction Function
# ==============================
def get_features(df, tokenizer, model, save_path, batch_size=16, max_length=256):
    """
    استخراج چند نوع embedding از مدل ترنسفورمر:
    mean, cls, max, mean_last2, concat_last2
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_rows = []

    with torch.no_grad():
        for i in tqdm(range(0, len(df), batch_size), desc="Extracting & saving embeddings"):
            batch_df = df.iloc[i:i+batch_size]
            batch_texts = batch_df['review'].tolist()

            encoded = tokenizer(batch_texts, padding=True, truncation=True,
                                max_length=max_length, return_tensors='pt')
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model(**encoded, output_hidden_states=True)
            hidden_states = outputs.hidden_states
            last_hidden = hidden_states[-1]          # آخرین لایه
            second_last = hidden_states[-2]          # یکی‌مانده‌به‌آخر

            # ===== انواع embedding =====
            mean_emb = last_hidden.mean(dim=1).cpu().numpy()
            cls_emb = last_hidden[:, 0, :].cpu().numpy()
            max_emb = last_hidden.max(dim=1).values.cpu().numpy()
            mean_last2_emb = ((last_hidden.mean(dim=1) + second_last.mean(dim=1)) / 2).cpu().numpy()
            concat_last2_emb = torch.cat(
                (last_hidden.mean(dim=1), second_last.mean(dim=1)), dim=1
            ).cpu().numpy()

            # ساخت دیتافریم از همه
            batch_out = pd.DataFrame({
                'review': batch_texts,
                'label': batch_df['label'].tolist(),
                'embedding_mean': mean_emb.tolist(),
                'embedding_cls': cls_emb.tolist(),
                'embedding_max': max_emb.tolist(),
                'embedding_mean_last2': mean_last2_emb.tolist(),
                'embedding_concat_last2': concat_last2_emb.tolist()
            })

            all_rows.append(batch_out)

            # ذخیره مرحله‌ای برای کاهش حافظه
            if (i // batch_size) % 1000 == 0 and i > 0:
                pd.concat(all_rows).to_parquet(save_path, index=False)
                all_rows = []
                torch.cuda.empty_cache()

    if all_rows:
        pd.concat(all_rows).to_parquet(save_path, index=False)

    print("✅ Embeddings extracted and saved.")
    return pd.read_parquet(save_path)

# ==============================
# 📤 Extract and Save Embeddings
# ==============================
save_path = "/content/drive/MyDrive/imdb_with_embeddings.parquet"
df_with_emb = get_features(df, tokenizer, model, save_path)

print("✅ File saved at:", save_path)
print("📊 Columns:", df_with_emb.columns.tolist())

# ==============================
# 📥 Reload and Prepare Embeddings
# ==============================
df = pd.read_parquet(save_path)

embeddings = {
    name.replace('embedding_', ''): np.array(df[name].tolist())
    for name in df.columns if name.startswith('embedding_')
}

print("✅ Embeddings loaded.")
print("📊 Available:", list(embeddings.keys()))

# ==============================
# 🧠 Train & Evaluate Models
# ==============================
results = {}

for name, X in embeddings.items():
    print(f"\n🔹 Training Logistic Regression for embedding: {name}")

    X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

    clf = LogisticRegression(max_iter=1000, n_jobs=-1)
    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"✅ Train Accuracy ({name}): {train_acc:.4f}")
    print(f"✅ Test Accuracy  ({name}): {test_acc:.4f}")

    results[name] = {'train_acc': train_acc, 'test_acc': test_acc}

# ==============================
# 📊 Summary
# ==============================
print("\n📊 Summary of results:")
for name, res in results.items():
    print(f"{name:<18} → Train: {res['train_acc']:.4f} | Test: {res['test_acc']:.4f}")


Extracting & saving embeddings: 100%|██████████| 124/124 [00:15<00:00,  7.91it/s]


✅ Embeddings extracted and saved.
✅ File saved at: /content/drive/MyDrive/imdb_with_embeddings.parquet
📊 Columns: ['review', 'label', 'embedding_mean', 'embedding_cls', 'embedding_max', 'embedding_mean_last2', 'embedding_concat_last2']
✅ Embeddings loaded.
📊 Available: ['mean', 'cls', 'max', 'mean_last2', 'concat_last2']

🔹 Training Logistic Regression for embedding: mean
✅ Train Accuracy (mean): 0.9049
✅ Test Accuracy  (mean): 0.8287

🔹 Training Logistic Regression for embedding: cls
✅ Train Accuracy (cls): 0.9099
✅ Test Accuracy  (cls): 0.7859

🔹 Training Logistic Regression for embedding: max
✅ Train Accuracy (max): 0.9565
✅ Test Accuracy  (max): 0.7305

🔹 Training Logistic Regression for embedding: mean_last2
✅ Train Accuracy (mean_last2): 0.9175
✅ Test Accuracy  (mean_last2): 0.8287

🔹 Training Logistic Regression for embedding: concat_last2
✅ Train Accuracy (concat_last2): 0.9439
✅ Test Accuracy  (concat_last2): 0.8237

📊 Summary of results:
mean               → Train: 0.9049 | T