<a href="https://colab.research.google.com/github/pxs1990/NLP_LLM/blob/main/sentiment_movie_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nlpaug
!pip install transformers

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m410.5/410.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import TrainingArguments, Trainer, BertForSequenceClassification, BertTokenizer, AdamW

# Load the dataset
df = pd.read_csv('/content/movie_review_data.csv')

# Split FIRST, then augment only the training portion. Augmenting before the
# split lets near-duplicate paraphrases of a validation review end up in the
# training set, which leaks information and makes validation loss optimistic.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],  # keep the class ratio the same in both splits
)

# Initialize the augmenter
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

# Get the minority class samples (training split only)
minority_label = train_df['label'].value_counts().idxmin()
minority_class = train_df[train_df['label'] == minority_label]

# Augment the minority class twice
augmented_texts = []
for _ in range(2):  # Augmenting/runs twice
    for txt in minority_class['text']:
        augmented = aug.augment(txt)
        # Guard against a bare-string return (believed to be the behaviour of
        # older nlpaug releases -- TODO confirm); extending with a str would
        # add single characters instead of one text.
        augmented_texts.extend([augmented] if isinstance(augmented, str) else augmented)

# Create a new DataFrame with the augmented texts and labels
augmented_df = pd.DataFrame({
    'text': augmented_texts,
    'label': [minority_label] * len(augmented_texts)
})

# Append the augmented data to the training split only; the validation split
# stays made of original, untouched reviews.
train_df = pd.concat([train_df, augmented_df], ignore_index=True)

# Keep `df` pointing at the full (original + augmented) data, as before
df = pd.concat([train_df, val_df], ignore_index=True)

# Expose the splits under the names the following cells use
train_texts, train_labels = train_df['text'], train_df['label']
val_texts, val_labels = val_df['text'], val_df['label']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(texts):
    """Encode ``texts`` with the module-level tokenizer.

    Every sequence is truncated or padded to exactly 128 tokens and the
    result is returned as PyTorch tensors.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return tokenizer(texts, **encoding_options)


# Encode both splits; each result is a dict-like object with keys such as
# input_ids and attention_mask.
train_encodings, val_encodings = (
    tokenize_function(split_texts.tolist())
    for split_texts in (train_texts, val_texts)
)

# Convert the labels of both splits to tensors
train_labels, val_labels = (
    torch.tensor(split_labels.tolist())
    for split_labels in (train_labels, val_labels)
)


In [None]:
# Define a custom subclass of the Dataset class
class MovieReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encoded_data: Mapping from field name (e.g. ``input_ids``) to an
            indexable collection holding one entry per sample.
        labeled_data: Indexable collection of labels, one per sample.
    """

    def __init__(self, encoded_data, labeled_data):
        self.encodings = encoded_data
        self.labels = labeled_data

    @staticmethod
    def _copy_as_tensor(value):
        """Return an independent tensor copy of ``value``.

        Tensors are cloned directly. Wrapping an existing tensor in
        ``torch.tensor(...)`` raises a UserWarning on every item and copies
        the data a second time, so that path is only used for non-tensors.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

# Wrap each split's encodings and label tensor in a MovieReviewDataset
train_dataset, val_dataset = (
    MovieReviewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

# Batch the data: training batches are drawn in random order,
# validation batches in their stored order.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    sampler=RandomSampler(train_dataset),
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    sampler=SequentialSampler(val_dataset),
)


step = iteration: one batch passed through the model (forward and backward pass)

batch size = number of samples passed to the model at one time during training

epoch = one complete pass of the whole training data through the model (forward and backward)

sequence = the list of tokens for a particular row

Example: with training samples = 1000 and batch size = 20,
number of iterations/steps = 1000 / 20 = 50 per epoch (one complete pass of the data through the model)

pad = filler tokens; padding = 'max_length' => pad tokens are appended until every sequence has max_length tokens

attention_mask = tells the model to attend to the actual tokens and ignore the padded tokens.

In [None]:
# Load pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set up the optimizer.
# NOTE(review): this optimizer is handed to the Trainer explicitly, so the
# Trainer does not build its own; the `weight_decay` value given to
# TrainingArguments below therefore does not appear to reach this optimizer.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Warm up over the first 10% of the run. The previous fixed value of 500
    # warm-up steps was longer than the whole training run (36 optimizer steps
    # in the recorded output), so the learning rate never left the warm-up ramp
    # and stayed at a small fraction of the configured 2e-5.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when finished training
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None)  # custom optimizer; Trainer creates the LR scheduler
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.6289,0.639633
2,0.6485,0.632606
3,0.5656,0.621336


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=36, training_loss=0.6134590440326266, metrics={'train_runtime': 947.7786, 'train_samples_per_second': 0.56, 'train_steps_per_second': 0.038, 'total_flos': 34927992599040.0, 'train_loss': 0.6134590440326266, 'epoch': 3.0})

In [None]:
# Run evaluation on the eval dataset attached to the trainer
results = trainer.evaluate()

# Show the returned evaluation metrics
print("Evaluation Results:", results)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation Results: {'eval_loss': 0.621336042881012, 'eval_runtime': 25.593, 'eval_samples_per_second': 1.758, 'eval_steps_per_second': 0.117, 'epoch': 3.0}


# **Hyperparameter tuning:**

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m380.1/380.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m233.0/233.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 k

In [None]:
import optuna
from transformers import TrainingArguments

def objective(trial):
    """Optuna objective: fine-tune a fresh BERT classifier and return its validation loss.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale,
            1e-5 to 5e-5) and ``num_train_epochs`` (2 to 5).

    Returns:
        The ``eval_loss`` value reported by ``trainer.evaluate()``.
    """
    # suggest_loguniform is deprecated; suggest_float(..., log=True) is the replacement
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)

    # Start every trial from the same pre-trained checkpoint. Reusing the
    # module-level `model` would keep training weights that earlier runs have
    # already fine-tuned, so trials would not be comparable.
    trial_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # A fixed 500-step warm-up was longer than an entire run on this data
        # (36 steps for 3 epochs in the recorded output); scale it to the run length.
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
    )

    # No `optimizers=` argument: the Trainer builds its own optimizer from
    # `training_args`, so the sampled learning rate is actually used. Passing
    # the module-level optimizer would pin the rate to that optimizer's fixed
    # value and carry its state from one trial to the next.
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_loss']

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All (TPE is also Optuna's default sampler).
study = optuna.create_study(
    direction='minimize',  # the objective returns validation loss
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)
print("Best hyperparameters:", study.best_params)


[I 2024-08-25 05:37:35,959] A new study created in memory with name: no-name-24675310-1bf1-4e61-8f35-ed01ff6f017c
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.6038,0.619135
2,0.6218,0.611543


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-08-25 05:48:12,644] Trial 0 finished with value: 0.6115429401397705 and parameters: {'learning_rate': 1.4545160968017746e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 0.6115429401397705.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
