In [10]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [11]:
# Import required libraries
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [12]:


# Step 1: Load the IMDB dataset (balanced dataset for binary classification)
dataset = load_dataset("imdb")

from datasets import concatenate_datasets

# Combine the train and test datasets into a single dataset for manual splitting
full_dataset = dataset["train"].train_test_split(test_size=0.5, seed=42)  # Split the original training dataset equally
combined_dataset = concatenate_datasets([full_dataset["train"], full_dataset["test"]])

# Shuffle and split the dataset into 80-20 train-test split
train_test = combined_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
test_dataset = train_test["test"]

# Show the new train and test dataset sizes
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train Dataset Size: 20000
Test Dataset Size: 5000


In [13]:


# Step 2: Tokenize the dataset using DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Preprocess the IMDB dataset by returning tokenized examples."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:

# Step 3: Load the DistilBERT model with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Binary classification
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# Optional: Freeze the base model parameters
for param in model.base_model.parameters():
    param.requires_grad = False

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:

# Step 4: Define the metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

In [16]:
# Step 5: Set up the Trainer
training_args = TrainingArguments(
    output_dir="./imdb_results",         # Directory to save model checkpoints and results
    learning_rate=2e-5,                  # Experiment with different learning rates
    per_device_train_batch_size=8,       # Experiment with batch size
    per_device_eval_batch_size=8,
    num_train_epochs=1,                  # Experiment with number of epochs
    weight_decay=0.01,                   # Regularization term
    eval_strategy="epoch",               # Evaluate after every epoch
    save_strategy="epoch",               # Save model after every epoch
    load_best_model_at_end=True,         # Load the best model at the end of training
    logging_dir="./logs",                # Directory to save training logs
    logging_steps=10,                    # Log every 10 steps
    report_to="none"                     # Disable WandB logging
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                                    # The DistilBERT model with classification head
    args=training_args,                             # Training arguments defined above
    train_dataset=tokenized_train,                  # Preprocessed training dataset
    eval_dataset=tokenized_test,                    # Preprocessed testing dataset
    tokenizer=tokenizer,                            # Tokenizer for preprocessing
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # Handle dynamic padding
    compute_metrics=compute_metrics,               # Custom function to calculate accuracy
)

  trainer = Trainer(


In [17]:
# Step 6: Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4507,0.459089,0.821


TrainOutput(global_step=2500, training_loss=0.5400327877044677, metrics={'train_runtime': 409.7758, 'train_samples_per_second': 48.807, 'train_steps_per_second': 6.101, 'total_flos': 2649347973120000.0, 'train_loss': 0.5400327877044677, 'epoch': 1.0})

In [18]:


# Step 7: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)


Evaluation Results: {'eval_loss': 0.45908868312835693, 'eval_accuracy': 0.821, 'eval_runtime': 77.057, 'eval_samples_per_second': 64.887, 'eval_steps_per_second': 8.111, 'epoch': 1.0}


In [19]:

# Step 8: Analyze predictions
df = pd.DataFrame(tokenized_test)
df["text"] = test_dataset["text"]
df["true_label"] = df["label"]
predictions = trainer.predict(tokenized_test)
df["predicted_label"] = np.argmax(predictions[0], axis=1)




In [20]:
# Display predictions
df[df["true_label"] == df["predicted_label"]].head()

Unnamed: 0,label,input_ids,attention_mask,text,true_label,predicted_label
0,0,"[101, 1996, 2190, 2518, 2055, 2023, 3185, 2001...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","The best thing about this movie was, uh, well,...",0,0
1,1,"[101, 2293, 8132, 2003, 2025, 1037, 2460, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","Love Trap is not a short, it's quite obviously...",1,1
2,0,"[101, 1000, 2796, 8940, 2598, 1000, 1024, 2065...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","""Indian burial ground"": If those three words a...",0,0
3,1,"[101, 2802, 2023, 2143, 1010, 2017, 2453, 2228...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","Throughout this film, you might think this fil...",1,1
4,1,"[101, 2009, 2003, 1037, 24026, 2518, 2000, 237...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",It is a tricky thing to play a queen. On the o...,1,1
