In [1]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [2]:
# Fetch the IMDb review corpus and pull out its "train" and "test" splits.
dataset = load_dataset("stanfordnlp/imdb")
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Review texts and their labels, per split; used by the modelling cells below.
X_train, y_train = train_dataset["text"], train_dataset["label"]
X_test, y_test = test_dataset["text"], test_dataset["label"]


def _label_counts(split):
    """Return ``{0: n0, 1: n1}``: how many rows of ``split`` carry each label."""
    return {
        label_id: sum(1 for value in split["label"] if value == label_id)
        for label_id in (0, 1)
    }


# Class-balance check: show the class names declared for the label feature,
# then the number of examples per label in each split.
print("Train label counts:")
print(train_dataset.features["label"].names)
print(_label_counts(train_dataset))

print("\nTest label counts:")
print(_label_counts(test_dataset))

# Split sizes, displayed as the cell's result.
len(X_train), len(X_test)

Train label counts:
['neg', 'pos']
{0: 12500, 1: 12500}

Test label counts:
{0: 12500, 1: 12500}


(25000, 25000)

In [12]:
# Bag-of-words baseline: raw term counts fed to a logistic-regression classifier.
bow_vectorizer = CountVectorizer(
    max_features=20000,      # cap the vocabulary at 20k terms
    ngram_range=(1, 1),      # unigrams only
    stop_words="english",    # remove common stopwords
)

# Fit the vocabulary on the training reviews only, then reuse it unchanged for
# the test reviews so both matrices share the same columns.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# max_iter is set well above the library default (100).
# n_jobs is deliberately not passed: LogisticRegression only uses it to
# parallelise across classes in one-vs-rest fits, and this task has two
# classes, so it had no effect on speed or results here.
log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train_bow, y_train)
y_pred_bow = log_reg.predict(X_test_bow)

# Held-out metrics. F1 / precision / recall use scikit-learn's default binary
# averaging, i.e. they are reported for the positive class (label 1).
acc_bow = accuracy_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)
precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)

print("Bag-of-Words + Logistic Regression")
print(f"Accuracy: {acc_bow:.4f}")
print(f"F1: {f1_bow:.4f}")
print(f"Precision: {precision_bow:.4f}")
print(f"Recall: {recall_bow:.4f}")

Bag-of-Words + Logistic Regression
Accuracy: 0.8551
F1: 0.8537
Precision: 0.8621
Recall: 0.8454


In [13]:
# TF-IDF features (unigrams + bigrams) fed to a linear support-vector classifier.
tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,      # cap the vocabulary at 20k terms
    ngram_range=(1, 2),      # unigrams + bigrams (captures short phrases)
    stop_words="english",    # remove common stopwords
)

# Fit vocabulary and IDF weights on the training reviews only, then apply the
# same fitted transform to the test reviews.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fix the seed: LinearSVC's dual coordinate-descent solver shuffles the data
# with a pseudo-random generator, so without random_state a fresh-kernel re-run
# is not guaranteed to reproduce the same model. (The seed is simply ignored
# when the non-random primal solver is used.)
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train_tfidf, y_train)

y_pred_tfidf = svm_clf.predict(X_test_tfidf)

# Held-out metrics. F1 / precision / recall use scikit-learn's default binary
# averaging, i.e. they are reported for the positive class (label 1).
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)

print("TF–IDF + Linear SVM")
print(f"Accuracy: {acc_tfidf:.4f}")
print(f"F1: {f1_tfidf:.4f}")
print(f"Precision: {precision_tfidf:.4f}")
print(f"Recall: {recall_tfidf:.4f}")


TF–IDF + Linear SVM
Accuracy: 0.8698
F1: 0.8685
Precision: 0.8771
Recall: 0.8601
