In [None]:
!pip install transformers



In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, DataLoaderConfiguration
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Dataloader behaviour requested from Accelerate (see the
# DataLoaderConfiguration documentation for the meaning of each flag).
data_loader_config = DataLoaderConfiguration(split_batches=False, even_batches=True, use_seedable_sampler=True)

# Build the Accelerator around that configuration.
# NOTE(review): `accelerator` is re-assigned by later cells in this notebook,
# so this instance is replaced before it is used for anything visible here.
accelerator = Accelerator(dataloader_config=data_loader_config)

In [None]:
# Load the pretrained DistilBERT tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)
# Load the pretrained encoder with a 3-class sequence-classification head
# (three classes: negative / neutral / positive).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Last expression of the cell: moves the model and echoes its architecture.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Read the multilingual sentiment CSV and preview its first ten rows.
# NOTE(review): the path is absolute (presumably a Colab upload location);
# adjust it when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/Multi_Languages.csv')
df.head(n=10)

Unnamed: 0,id,text,label,sentiment,language
0,9536,"Cooking microwave pizzas, yummy",2,positive,English
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,English
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,English
3,14182,naw idk what ur talkin about,1,neutral,English
4,17840,That sucks to hear. I hate days like that,0,negative,English
5,3655,Umm yeah. That`s probably a pretty good note ...,2,positive,English
6,719,whatever do you mean?,1,neutral,English
7,22823,That would panic me a little! Maybe you can ...,0,negative,English
8,4869,Is sad when people`s phones are dead,0,negative,English
9,793,sad face.,0,negative,English


In [None]:
# Map sentiment names onto integer class ids in the `label` column.
# NOTE(review): the rows previewed above already carry integer labels (the
# names live in `sentiment`), so this looks like a no-op for them — confirm
# whether any rows actually hold string labels.
replacement_map = dict(negative=0, neutral=1, positive=2)
df['label'] = df['label'].replace(to_replace=replacement_map)

In [None]:
# Preview the first ten rows again after the label replacement.
df.head(n=10)

Unnamed: 0,id,text,label,sentiment,language
0,9536,"Cooking microwave pizzas, yummy",2,positive,English
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,English
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,English
3,14182,naw idk what ur talkin about,1,neutral,English
4,17840,That sucks to hear. I hate days like that,0,negative,English
5,3655,Umm yeah. That`s probably a pretty good note ...,2,positive,English
6,719,whatever do you mean?,1,neutral,English
7,22823,That would panic me a little! Maybe you can ...,0,negative,English
8,4869,Is sad when people`s phones are dead,0,negative,English
9,793,sad face.,0,negative,English


In [None]:

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
def tokenize_function(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is padded up to, or truncated down to, 64 tokens.

    Args:
        texts: The text input forwarded unchanged to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    fixed_length = 64
    encoded = tokenizer(
        texts,
        max_length=fixed_length,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Encode the training split first, then the held-out split; each Series is
# converted to a plain Python list before tokenization.
train_encodings, test_encodings = (
    tokenize_function(texts.tolist()) for texts in (X_train, X_test)
)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to a sequence holding one entry
            per example (it must provide ``.items()``).
        labels: Sequence of integer class ids, one per example.

    Raises:
        ValueError: If any encoding field does not hold exactly one entry
            per label.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs instead of surfacing an IndexError
        # (or silently ignoring surplus rows) in the middle of training.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the class id under ``'labels'``."""
        # torch.tensor() is called without a device argument here.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels (as plain lists) in a SentimentDataset.
train_dataset = SentimentDataset(encodings=train_encodings, labels=y_train.tolist())
test_dataset = SentimentDataset(encodings=test_encodings, labels=y_test.tolist())

In [None]:
# Batched loaders with pinned memory and four worker processes each.
# NOTE(review): the Trainer cells in this notebook are given the datasets,
# not these loaders, and nothing else visible here references them.
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,  # reshuffle the training examples
    pin_memory=True,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,  # keep evaluation order fixed
    pin_memory=True,
    num_workers=4,
)



In [None]:
# Rebuild the dataloader configuration and the Accelerator with the same
# settings as the earlier setup cell.
data_loader_config = DataLoaderConfiguration(
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)
accelerator = Accelerator(dataloader_config=data_loader_config)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        dict with accuracy plus support-weighted F1, precision and recall,
        stored under keys that already carry the ``eval_`` prefix.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Tuple layout: (precision, recall, f1, support).
    weighted = precision_recall_fscore_support(labels, predicted, average='weighted')
    return {
        # 'eval_accuracy' is the key named by metric_for_best_model.
        'eval_accuracy': accuracy_score(labels, predicted),
        'eval_f1': weighted[2],
        'eval_precision': weighted[0],
        'eval_recall': weighted[1],
    }

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of this option. The older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases, and this notebook installs transformers unpinned.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy="epoch",  # same cadence as evaluation, as load_best_model_at_end requires
    # Bound disk usage: without a limit every one of the 20 per-epoch
    # checkpoints is kept. The best checkpoint is still retained because
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',  # must match a key produced by compute_metrics
    # Mixed precision needs CUDA; follow the same availability check as the
    # device-selection cell so a CPU-only run does not fail here.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=4,  # effective train batch per device: 4 * 4 = 16
    report_to="none"  # disable reporting to external experiment trackers
)



In [None]:
# Start from a default-configured Accelerator (this replaces the instance
# created in the earlier cells).
accelerator = Accelerator()

# Build the Adam optimizer over the model's parameters, then hand the model,
# optimizer and both datasets to `accelerator.prepare`.
# NOTE(review): the Trainer cells in this notebook are not passed this
# optimizer, so it appears to be unused by the training run — confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
model, optimizer, train_dataset, test_dataset = accelerator.prepare(
    model, optimizer, train_dataset, test_dataset
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Wire the model, hyper-parameters, both datasets and the metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model; as the last expression of the cell, the returned
# TrainOutput is displayed.
# NOTE(review): in the recorded run below, validation loss rises from the
# first epoch onward while training loss keeps falling, and accuracy peaks
# at the second logged row — the model overfits well before epoch 20.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.611,0.599862,0.748039,0.749307,0.754778,0.748039
1,0.5034,0.633127,0.752841,0.754864,0.759938,0.752841
2,0.3865,0.712771,0.74916,0.751136,0.75683,0.74916
4,0.1994,1.129738,0.735393,0.734226,0.73464,0.735393
5,0.1497,1.387917,0.739875,0.739444,0.739799,0.739875
6,0.129,1.542459,0.736353,0.737488,0.739327,0.736353
8,0.0861,1.924235,0.737954,0.739488,0.742438,0.737954
9,0.0628,2.179477,0.735873,0.736708,0.73795,0.735873
10,0.0487,2.15681,0.732992,0.73285,0.733906,0.732992
12,0.0218,2.573983,0.732992,0.734128,0.737994,0.732992


TrainOutput(global_step=31220, training_loss=0.13502684314612926, metrics={'train_runtime': 4494.4092, 'train_samples_per_second': 111.183, 'train_steps_per_second': 6.946, 'total_flos': 8270468031291264.0, 'train_loss': 0.13502684314612926, 'epoch': 19.990395389787096})

In [None]:
# Persist the fine-tuned model and its tokenizer side by side.
model_path = "./distilbert-finetuned-sentiment"

# Let every process reach this point before anything is written.
accelerator.wait_for_everyone()

# Save the underlying (unwrapped) model, using Accelerate's save function.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    model_path,
    save_function=accelerator.save,
)

# Store the tokenizer files in the same directory.
tokenizer.save_pretrained(model_path)

print("Training complete and model saved.")

Training complete and model saved.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted F1 / precision / recall.

    This redefinition shadows the earlier ``compute_metrics`` in the notebook;
    the only difference is that the keys are returned without the ``eval_``
    prefix.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_classes, average='weighted'
    )
    metrics = {'accuracy': accuracy_score(labels, predicted_classes)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics




In [None]:
# Build a fresh Trainer around the current model so the most recently defined
# compute_metrics is the one used for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,  # metric function applied during evaluation
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Evaluate the current model on the held-out dataset given to the Trainer and
# print the resulting metrics dict.
# In the recorded output the loss and accuracy equal the second row of the
# training log, consistent with load_best_model_at_end having restored the
# checkpoint with the highest eval_accuracy.
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.6331266164779663, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.7528413638546503, 'eval_f1': 0.7548644119323475, 'eval_precision': 0.7599379452180659, 'eval_recall': 0.7528413638546503, 'eval_runtime': 14.1925, 'eval_samples_per_second': 440.161, 'eval_steps_per_second': 110.058}
