In [1]:
from google.colab import drive
drive.mount('/content/drive')
# adding the drive

Mounted at /content/drive


In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import pipeline
from transformers import AutoTokenizer , TFAutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import lxml
import os
import re
import string

In [4]:
print(f"pandas: {pd.__version__}")
print(f"numpy : {np.__version__}")
print(f"pytorch : {torch.__version__}")

pandas: 2.2.2
numpy : 2.0.2
pytorch : 2.8.0+cu126


In [5]:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")
print(df.isnull().sum())
print(f"dups: {df.duplicated().sum()}")


review       0
sentiment    0
dtype: int64
dups: 418


In [6]:
df.drop_duplicates(inplace=True)
print(f"dups: {df.duplicated().sum()}")

dups: 0


In [7]:
encoder = LabelEncoder()
df['sentiment'] = encoder.fit_transform(df['sentiment'])


In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:

from lxml import html


def clean_text (text):
    text.lower()

    patters = re.compile(r'https?://\S+|www.\S+')
    patters.sub(" ",text)

    puns = string.punctuation
    text.translate(str.maketrans('','',puns))

    doc = html.fromstring(text)
    return doc.text_content()


df['reviews'] = df['review'].apply(clean_text)




In [10]:
from sklearn.model_selection import train_test_split

y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(df['reviews'],y,test_size = 0.2,random_state = 42)

In [9]:
trnsformr = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(trnsformr)

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

save_path =r"M:\MACHINE LEARNING\github\sentiment analysis\BERT_tokenizer"

tokenizer.save_pretrained(save_path)

('M:\\MACHINE LEARNING\\github\\sentiment analysis\\BERT_tokenizer/tokenizer_config.json',
 'M:\\MACHINE LEARNING\\github\\sentiment analysis\\BERT_tokenizer/special_tokens_map.json',
 'M:\\MACHINE LEARNING\\github\\sentiment analysis\\BERT_tokenizer/vocab.txt',
 'M:\\MACHINE LEARNING\\github\\sentiment analysis\\BERT_tokenizer/added_tokens.json',
 'M:\\MACHINE LEARNING\\github\\sentiment analysis\\BERT_tokenizer/tokenizer.json')

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(r"M:\MACHINE LEARNING\github\sentiment analysis\BERT_tokenizer")

In [12]:
train_string_data = X_train.astype(str).tolist()
test_string_data = X_test.astype(str).tolist()

# transformer takes the string list not a pandas series .
train_tokens = tokenizer(
    train_string_data,
    truncation = True,
    padding = True
    )


test_tokens = tokenizer(
    test_string_data,
    truncation = True,
    padding = True
    )


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader


# ✅ Create PyTorch Dataset
class TorchDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.tokens["input_ids"][idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.tokens["attention_mask"][idx], dtype=torch.long),
            "labels": torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        }


In [14]:
# ✅ Build Dataset objects
train_dataset = TorchDataset(train_tokens, y_train)
test_dataset  = TorchDataset(test_tokens,  y_test)

# ✅ Build DataLoaders (PyTorch version of batch/shuffle)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# fine tuning transformer for sentiment analysis.
from transformers import AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
output_label = 2 # possitive and negtive

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
).to(device)


training arguments will work if we use transformer

In [16]:
# ✅ TRAINING LOOP — 5 EPOCHS
from tqdm import tqdm
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Optimizer (equivalent to TF Adam lr=2e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Loss function (equivalent to SparseCategoricalCrossentropy from_logits=True)
loss_fn = nn.CrossEntropyLoss()

# Accuracy metric
def accuracy(preds, labels):
    return (preds.argmax(dim=1) == labels).float().mean()

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_acc = 0

    print(f"\n===== Epoch {epoch + 1} / {epochs} =====")

    for batch in tqdm(train_loader):
        # Move batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )
        logits = outputs.logits

        # Compute loss
        loss = loss_fn(logits, batch["labels"])
        total_loss += loss.item()

        # Compute accuracy
        acc = accuracy(logits, batch["labels"])
        total_acc += acc

        # Backprop
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Average results for epoch
    epoch_loss = total_loss / len(train_loader)
    epoch_acc = total_acc / len(train_loader)

    print(f"Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1 / 5 =====


100%|██████████| 2480/2480 [06:14<00:00,  6.63it/s]


Train Loss: 0.2241 | Train Accuracy: 0.9097

===== Epoch 2 / 5 =====


100%|██████████| 2480/2480 [06:18<00:00,  6.55it/s]


Train Loss: 0.1201 | Train Accuracy: 0.9571

===== Epoch 3 / 5 =====


100%|██████████| 2480/2480 [06:14<00:00,  6.63it/s]


Train Loss: 0.0653 | Train Accuracy: 0.9782

===== Epoch 4 / 5 =====


100%|██████████| 2480/2480 [06:14<00:00,  6.63it/s]


Train Loss: 0.0384 | Train Accuracy: 0.9873

===== Epoch 5 / 5 =====


100%|██████████| 2480/2480 [06:14<00:00,  6.63it/s]

Train Loss: 0.0280 | Train Accuracy: 0.9911





In [17]:
torch.save(model.state_dict(), "finetuned.pt")

In [None]:
def predict(text):
    model.eval()
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    
    if "token_type_ids" in tokens:
        tokens.pop("token_type_ids")

    tokens = {k: v.to(device) for k, v in tokens.items()}

    with torch.no_grad():
        outputs = model(**tokens)
        pred = outputs.logits.argmax(dim=1).item()

    return "Positive" if pred == 1 else "Negative"

print(predict("i dont love this movie"))


Negative


In [22]:
# load model back
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

model.load_state_dict(torch.load("/content/sentiment_finetuned.pt"))
model.to(device)
model.eval()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
