In [3]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [None]:
def load_imdb_data(data_dir):
    texts, labels = [], []
    for label_type in ["pos", "neg"]:
        dir_name = os.path.join(data_dir, label_type)
        for fname in os.listdir(dir_name):
            if fname.endswith(".txt"):
                with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
                    texts.append(f.read())
                labels.append(1 if label_type == "pos" else 0)
    return texts, labels

train_texts, train_labels = load_imdb_data("/kaggle/input/imdb-dataset/aclImdb/train")
test_texts, test_labels = load_imdb_data("/kaggle/input/imdb-dataset/aclImdb/test")

print("Train samples:", len(train_texts))
print("Test samples:", len(test_texts))
print("Example review:", train_texts[0][:500])  


Train samples: 25000
Test samples: 25000
Example review: This was one of those wonderful rare moments in T.V. that I wished I'd captured forever on VHS. Won't it ever air again? <br /><br />It was so creative and I remember it was aired once a week and the wait for the next episode was excruciating. I want to see it all again. I want to buy it. I want what I can't have. Not even on EBAY. <br /><br />So, having ranted enough it was, by far, one of the best series the 80's put out. It should be considered a classic but is lost in space. At least this we


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)  
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_texts = [clean_text(t) for t in train_texts]
test_texts = [clean_text(t) for t in test_texts]

In [6]:
tfidf = TfidfVectorizer(max_features=20000)
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, train_labels)
y_pred = log_reg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(test_labels, y_pred))
print("Logistic Regression F1:", f1_score(test_labels, y_pred))


Logistic Regression Accuracy: 0.88492
Logistic Regression F1: 0.8849522133802536


In [8]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam


tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(train_texts)

X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

y_train = np.array(train_labels)
y_test = np.array(test_labels)


lstm_model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),                    
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])


lstm_model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=1e-3),
    metrics=["accuracy"]
)

lstm_model.summary()


history = lstm_model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=10,               
    validation_split=0.2,
    verbose=1
)

lstm_loss, lstm_acc = lstm_model.evaluate(X_test_pad, y_test, verbose=1)
print("LSTM Accuracy:", lstm_acc)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 537ms/step - accuracy: 0.7028 - loss: 0.5732 - val_accuracy: 0.7324 - val_loss: 0.6280
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 533ms/step - accuracy: 0.8729 - loss: 0.3259 - val_accuracy: 0.7940 - val_loss: 0.5014
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 533ms/step - accuracy: 0.9033 - loss: 0.2559 - val_accuracy: 0.7468 - val_loss: 0.5482
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 532ms/step - accuracy: 0.9352 - loss: 0.1850 - val_accuracy: 0.7442 - val_loss: 0.6622
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 534ms/step - accuracy: 0.9493 - loss: 0.1441 - val_accuracy: 0.7020 - val_loss: 0.7644
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 534ms/step - accuracy: 0.9580 - loss: 0.1171 - val_accuracy: 0.8106 - val_loss: 0.7176
Epoc

In [10]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


class IMDbDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)


train_dataset = IMDbDataset(train_texts, train_labels, tokenizer) 
test_dataset = IMDbDataset(test_texts, test_labels, tokenizer)


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,               
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,               
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=2,
    save_total_limit=2,              
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=None,         
    compute_metrics=compute_metrics
)


trainer.train()


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2469,0.22561,0.90952,0.910195
2,0.1794,0.233815,0.91016,0.910788
3,0.1336,0.239132,0.91424,0.914643




TrainOutput(global_step=1173, training_loss=0.2090785084681336, metrics={'train_runtime': 1529.8877, 'train_samples_per_second': 49.023, 'train_steps_per_second': 0.767, 'total_flos': 4967527449600000.0, 'train_loss': 0.2090785084681336, 'epoch': 3.0})