In [None]:
!pip install transformers



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset

load dataset

In [None]:
# Path of the IMDB review CSV that the notebook reads.
csv_path = "/content/IMDB movie review.csv"

# Decode the file as latin-1 and load it into a DataFrame.
df = pd.read_csv(csv_path, encoding="latin-1")

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Integer class ids for the two sentiment strings found in the CSV.
label_ids = {"positive": 1, "negative": 0}

# NOTE: run this cell once per freshly loaded `df`; re-running it maps the
# already-numeric labels again and shrinks the sample a second time.
df = (
    df
    # Use the column names the rest of the notebook relies on.
    .rename(columns={"review": "text", "sentiment": "label"})
    # Series.map turns every value that is missing from the dict into NaN ...
    .assign(label=lambda frame: frame["label"].map(label_ids))
    # ... so missing rows are dropped only AFTER mapping; this also removes
    # rows whose sentiment is neither "positive" nor "negative".
    .dropna()
    # A column that briefly held NaN is float; labels must be integer class ids.
    .astype({"label": "int64"})
    # Keep a reproducible 30% subsample of the remaining rows.
    .sample(frac=0.3, random_state=42)
)

split dataset

In [None]:
# Pull the two columns out as plain Python lists before splitting.
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

# Hold out 10% of the rows for validation; the seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.1,
    random_state=42,
)

load tokenizer

In [None]:
# Checkpoint whose tokenizer files are loaded.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)

tokenization function

In [None]:
def tokenize_function(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 256 tokens.

    Args:
        texts: The text input handed straight to the tokenizer call.

    Returns:
        The object returned by the tokenizer call.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(texts, **encode_options)

In [None]:
# Tokenize the training split first, then the validation split.
train_encodings, val_encodings = (
    tokenize_function(split_texts) for split_texts in (train_texts, val_texts)
)

convert to hugging face dataset

In [None]:
def _encodings_to_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one Hugging Face dataset."""
    columns = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels,
    }
    return HFDataset.from_dict(columns)


train_dataset = _encodings_to_dataset(train_encodings, train_labels)
val_dataset = _encodings_to_dataset(val_encodings, val_labels)

load pre-trained model

In [None]:
# Load the pretrained checkpoint with a two-class classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freeze all parameters of the DistilBERT base model

In [None]:
# Switch off gradients for every parameter under `model.distilbert`.
# Parameters outside that submodule are not touched by this loop.
for frozen_param in model.distilbert.parameters():
    frozen_param.requires_grad_(False)

unfreeze last two transformer layers

In [None]:
# Re-enable gradients for the parameters of the last two transformer layers.
last_two_layers = model.distilbert.transformer.layer[-2:]
trainable_params = (
    param for layer in last_two_layers for param in layer.parameters()
)
for trainable_param in trainable_params:
    trainable_param.requires_grad_(True)

Define training arguments

In [None]:
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    logging_dir="./logs",
    # NOTE(review): the recorded run ended at global_step=450, below this
    # interval, which is why its "Training Loss" column shows "No log".
    logging_steps=500,
    # Turn off external experiment trackers here, at construction time, rather
    # than through the deprecated WANDB_DISABLED environment variable, which
    # only works if it happens to be set before this cell runs.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Create the Trainer

In [None]:
# Collect the pieces the Trainer needs: model, settings, and both data splits.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
}
trainer = Trainer(**trainer_kwargs)

train model

In [None]:
import os

# Ask the Weights & Biases integration to stay off.
# NOTE(review): `training_args` and `trainer` were already created in earlier
# cells; confirm this variable still takes effect when it is set this late.
os.environ.update(WANDB_DISABLED="true")

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.278578
2,No log,0.218789


TrainOutput(global_step=450, training_loss=0.38097225613064234, metrics={'train_runtime': 1557.2569, 'train_samples_per_second': 1.155, 'train_steps_per_second': 0.289, 'total_flos': 119088191391744.0, 'train_loss': 0.38097225613064234, 'epoch': 2.0})

save model

In [None]:
# Write the model and the tokenizer files into the same directory.
save_dir = "./saved_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

evaluate model

In [None]:
# Evaluate on the validation split and display the returned metrics dict.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.21878905594348907,
 'eval_runtime': 50.8728,
 'eval_samples_per_second': 1.966,
 'eval_steps_per_second': 0.491,
 'epoch': 2.0}

In [40]:
def predict_sentiment(text):
    """Classify one review as ``"positive"`` or ``"negative"``.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: A single review string; it is truncated at 512 tokens.

    Returns:
        ``"positive"`` when the highest-scoring class index is 1, otherwise
        ``"negative"``.
    """
    model.eval()

    # A single text has nothing to be padded against, so only truncation is
    # requested; padding it out to 512 tokens would just slow the forward pass.
    # NOTE(review): the training data was tokenized with max_length=256 --
    # confirm that the longer 512-token limit is intended at inference time.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # The model may no longer be on the CPU after training, so place the input
    # tensors on whatever device its parameters currently live on.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class dimension, not over the flattened tensor.
    prediction = torch.argmax(logits, dim=-1).item()
    return "positive" if prediction == 1 else "negative"

In [41]:
# Seed the draw so the same five reviews are shown on every run, in line with
# the seeded sampling and splitting earlier in the notebook.
# NOTE(review): `df` also contains the rows used for training, so these
# examples are not necessarily held-out data.
sample_reviews = df["text"].sample(5, random_state=42).tolist()
for review in sample_reviews:
    sentiment = predict_sentiment(review)
    print(f"Review: {review}\nSentiment: {sentiment}\n")


Review: I am sorry to say that this film is indeed bad. It reminds me of a c-grade porn movie with one major difference: no porn.<br /><br />The story and dialogue needs a complete overhaul. Maybe then the bad acting would not have been as noticeable. At the very least, the pacing should have been picked up.<br /><br />While I accept that this had a low budget and the director did a good job visually given what little resources he had, he should have spent more time on the story or better yet, get someone else to write it. Many of the action scenes were just pointless.<br /><br />It was a complete waste of my time.
Sentiment: negative

Review: This is a very memorable spaghetti western. It has a great storyline, interesting characters, and some very good acting, especially from Rosalba Neri. Her role as the evil villainess in this film is truly classic. She steals every scene she is in, and expresses so much with her face and eyes, even when she's not speaking. Her performance is very 