In [None]:
!pip install transformers torch --quiet

from transformers import AutoTokenizer, AutoModel

model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("✅ DistilBERT model and tokenizer loaded successfully!")


✅ DistilBERT model and tokenizer loaded successfully!


In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = f"{path}/IMDB Dataset.csv"

df = pd.read_csv(csv_path)

df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [21]:
import torch
from tqdm import tqdm
import numpy as np

def get_features(texts, tokenizer, model, batch_size=16, max_length=256):
    """
    استخراج embedding از یک مدل ترنسفورمر با استفاده از GPU (در صورت وجود).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
            batch_texts = list(texts[i:i + batch_size])

            # توکنایز کردن و ارسال مستقیم به GPU
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            encoded = {k: v.to(device) for k, v in encoded.items()}  # ✅ تمام تنسورها به GPU

            outputs = model(**encoded)

            # گرفتن میانگین embeddingها از آخرین لایه
            last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
            embeddings = last_hidden_state.mean(dim=1).cpu().numpy()  # ✅ برگردوندن به CPU برای ذخیره
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
embedding = get_features(df['review'].values, tokenizer, model)
df['embedding'] = embedding.tolist()
save_path = "/content/drive/MyDrive/imdb_with_embeddings.parquet"
df.to_parquet(save_path, index=False)


Extracting embeddings: 100%|██████████| 3125/3125 [06:00<00:00,  8.67it/s]


In [24]:
df_loaded = pd.read_parquet(save_path)

In [28]:
df_loaded.head()

Unnamed: 0,review,sentiment,label,embedding
0,One of the other reviewers has mentioned that ...,positive,1,"[-0.011808195151388645, 0.0068298932164907455,..."
1,A wonderful little production. <br /><br />The...,positive,1,"[-0.08786962926387787, 0.13217905163764954, 0...."
2,I thought this was a wonderful way to spend ti...,positive,1,"[-0.198312908411026, -0.08881082385778427, 0.1..."
3,Basically there's a family where a little boy ...,negative,0,"[0.0797632560133934, 0.09678228199481964, 0.44..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,"[-0.18335729837417603, 0.11402807384729385, 0...."


In [32]:
X = np.vstack(df_loaded['embedding'].values)
y = df_loaded['label'].values

# تقسیم داده‌ها
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, Y_train)

y_test_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)
print("Train_Accuracy", accuracy_score(Y_train, y_test_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))


Train_Accuracy 0.873175
Accuracy: 0.8675
