In [None]:
!pip install transformers



In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, DataLoaderConfiguration
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# DataLoader options handed to Accelerate: batches are not split, batches are
# kept even, and the seedable sampler is enabled.
data_loader_config = DataLoaderConfiguration(split_batches=False, even_batches=True, use_seedable_sampler=True)

# Accelerator built around the DataLoader options above.
accelerator = Accelerator(dataloader_config=data_loader_config)

In [None]:
# Tokenizer and sequence-classification model both come from the same
# 'distilbert-base-uncased' checkpoint; the classifier is sized for 3 labels.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model there. The trailing expression is the cell's output.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Read the dataset from the CSV file and preview its first ten rows.
# NOTE(review): the path is absolute ('/content/...'); confirm it exists in the target runtime.
df = pd.read_csv('/content/Multi_Languages.csv')
df.head(n=10)

Unnamed: 0,id,text,label,sentiment,language
0,9536,"Cooking microwave pizzas, yummy",2,positive,English
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,English
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,English
3,14182,naw idk what ur talkin about,1,neutral,English
4,17840,That sucks to hear. I hate days like that,0,negative,English
5,3655,Umm yeah. That`s probably a pretty good note ...,2,positive,English
6,719,whatever do you mean?,1,neutral,English
7,22823,That would panic me a little! Maybe you can ...,0,negative,English
8,4869,Is sad when people`s phones are dead,0,negative,English
9,793,sad face.,0,negative,English


In [None]:
#replacing label values
# Any sentiment names found in the `label` column are mapped to their integer
# class ids; values that are not keys of the map are left as they are.
replacement_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['label'].replace(replacement_map)

# Fail fast if anything other than a known class id is still present (for
# example a missing value or an unmapped spelling). Otherwise the problem only
# shows up much later, when the labels are turned into tensors for training.
unexpected_labels = set(df['label'].unique()) - set(replacement_map.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'label' column: {sorted(unexpected_labels, key=str)}"
    )

In [None]:
# Show the first ten rows again, now that `label` has been passed through the replacement map.
df.head(10)

Unnamed: 0,id,text,label,sentiment,language
0,9536,"Cooking microwave pizzas, yummy",2,positive,English
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,English
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,English
3,14182,naw idk what ur talkin about,1,neutral,English
4,17840,That sucks to hear. I hate days like that,0,negative,English
5,3655,Umm yeah. That`s probably a pretty good note ...,2,positive,English
6,719,whatever do you mean?,1,neutral,English
7,22823,That would panic me a little! Maybe you can ...,0,negative,English
8,4869,Is sad when people`s phones are dead,0,negative,English
9,793,sad face.,0,negative,English


In [None]:

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable. Features are the `text` column, targets the `label` column.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 64 tokens
    (``padding="max_length"``, ``max_length=64``).

    Args:
        texts: The text input forwarded unchanged to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    encoded = tokenizer(texts, truncation=True, max_length=64, padding="max_length")
    return encoded

# Encode the training texts first and the test texts second, each converted
# to a plain Python list before tokenization.
train_encodings, test_encodings = (
    tokenize_function(split_texts.tolist()) for split_texts in (X_train, X_test)
)

In [None]:
class SentimentDataset(Dataset):
    """Dataset that pairs tokenizer output with class labels.

    ``encodings`` must expose ``.items()`` and each of its values must be
    indexable by example position. ``labels`` must be indexable and support
    ``len()``; it is expected to line up one-to-one with the encodings.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence decides how many examples the dataset reports.
        return len(self.labels)

    def __getitem__(self, idx):
        # torch.tensor(...) is called without a device argument here, so the
        # per-example tensors are created with torch's default placement.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # The target is stored under 'labels' as a 64-bit integer tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Wrap each split's encodings together with its labels (as a plain list).
train_dataset, test_dataset = (
    SentimentDataset(train_encodings, y_train.tolist()),
    SentimentDataset(test_encodings, y_test.tolist()),
)

In [None]:
# DataLoaders with pinned memory and four worker processes; only the training
# loader shuffles.
# NOTE(review): the Trainer cells further down are given the datasets, not
# these loaders. Confirm whether the loaders are still needed.
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    pin_memory=True,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    pin_memory=True,
    num_workers=4,
)



In [None]:
# Rebuild the DataLoader configuration and the Accelerator; both names are
# rebound to fresh objects with the same settings as the earlier setup cell.
data_loader_config = DataLoaderConfiguration(
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)
accelerator = Accelerator(dataloader_config=data_loader_config)

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``eval_accuracy``, ``eval_f1``, ``eval_precision``
        and ``eval_recall``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    # Result order is (precision, recall, f1, support).
    weighted_scores = precision_recall_fscore_support(labels, predicted_ids, average='weighted')
    metrics = {
        # Ensure this key matches the metric_for_best_model in TrainingArguments
        'eval_accuracy': accuracy_score(labels, predicted_ids),
        'eval_f1': weighted_scores[2],
        'eval_precision': weighted_scores[0],
        'eval_recall': weighted_scores[1],
    }
    return metrics

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    # Checkpoints are written on the same schedule as evaluation ("epoch").
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',  # This should match exactly with a key returned in compute_metrics
    fp16=True,
    # Four accumulation steps at a per-device batch of 4 give 16 samples per optimizer step per device.
    gradient_accumulation_steps=4,
    report_to="none"  # Optional: set to "none" to disable logging to any external entity
)



In [None]:
# A new Accelerator with default arguments; this rebinds the `accelerator`
# name that the earlier cells created with a DataLoaderConfiguration.
accelerator = Accelerator()

# Build the Adam optimizer over the model's parameters, then pass the model,
# optimizer and both datasets through `accelerator.prepare` in that order.
# NOTE(review): the Trainer cells below are not given this optimizer. Confirm
# whether preparing it here is still required.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
model, optimizer, train_dataset, test_dataset = accelerator.prepare(
    model, optimizer, train_dataset, test_dataset
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Trainer wired to the model, the training arguments, both dataset splits and
# the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
#Train the model
# Runs training as configured in `training_args`; the value returned by
# `train()` is the last expression, so it is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.6071,0.595836,0.748519,0.749843,0.753999,0.748519
1,0.5046,0.629357,0.753802,0.755661,0.759957,0.753802
2,0.3864,0.704406,0.745158,0.746819,0.750232,0.745158
4,0.1989,1.150365,0.735073,0.733213,0.734198,0.735073
5,0.1449,1.33915,0.737794,0.738715,0.74008,0.737794
6,0.1256,1.415935,0.731871,0.733411,0.736196,0.731871
8,0.0722,1.964597,0.735713,0.735814,0.735924,0.735713
9,0.0676,2.148607,0.737634,0.736654,0.736416,0.737634
10,0.0593,2.206077,0.739555,0.740025,0.740867,0.739555
12,0.028,2.467372,0.733632,0.733396,0.73399,0.733632


TrainOutput(global_step=31220, training_loss=0.13370873901723365, metrics={'train_runtime': 4430.1684, 'train_samples_per_second': 112.795, 'train_steps_per_second': 7.047, 'total_flos': 8270468031291264.0, 'train_loss': 0.13370873901723365, 'epoch': 19.990395389787096})

In [None]:
# Persist the fine-tuned model and its tokenizer into one directory.
model_path = "./distilbert-finetuned-sentiment"

# Wait on the accelerator before touching the filesystem.
accelerator.wait_for_everyone()

# Save the unwrapped model, delegating the actual write to `accelerator.save`.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    model_path,
    save_function=accelerator.save,
)

# The tokenizer files go into the same directory as the model.
tokenizer.save_pretrained(model_path)

print("Training complete and model saved.")

Training complete and model saved.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy plus weighted F1, precision and recall.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    # Take the arg-max over the last axis as the predicted class.
    predicted_ids = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted'
    )
    return dict(
        accuracy=accuracy_score(labels, predicted_ids),
        f1=f1,
        precision=precision,
        recall=recall,
    )




In [None]:
# Build a new Trainer from the same model, arguments and datasets. It uses
# whichever `compute_metrics` is currently bound, so evaluation metrics are included.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Evaluate with the trainer (it was constructed with eval_dataset=test_dataset)
# and print the metrics it returns.
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.6293569803237915, 'eval_model_preparation_time': 0.0021, 'eval_accuracy': 0.7538018248759405, 'eval_f1': 0.7556606682747917, 'eval_precision': 0.7599567481173649, 'eval_recall': 0.7538018248759405, 'eval_runtime': 13.8346, 'eval_samples_per_second': 451.55, 'eval_steps_per_second': 112.906}


In [None]:
#Define a function for testing individual inputs
def test_model(input_text):
    """Classify one piece of text and print the raw and normalised scores.

    The text is tokenized with the same settings used for the training data
    (padded/truncated to 64 tokens), run through the module-level ``model``
    without gradient tracking, and the arg-max class index is mapped back to
    a sentiment name.

    Args:
        input_text: The sentence to classify.

    Returns:
        None. The input text, logits, softmax probabilities and predicted
        sentiment are printed.
    """
    # Do not rely on whatever mode an earlier cell left the model in: eval
    # mode disables dropout so repeated calls give the same scores.
    model.eval()

    # Send the inputs to the device that actually holds the model's weights
    # instead of trusting a separately maintained device variable.
    model_device = next(model.parameters()).device

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    #Get model predictions
    with torch.no_grad():
        logits = model(**inputs).logits
        # torch.softmax is already in scope, so no per-call import is needed.
        probabilities = torch.softmax(logits, dim=1)
        # .item() requires a single prediction, i.e. one input text per call.
        predicted_class = torch.argmax(probabilities, dim=1).item()

    #Convert predicted class index back to sentiment label
    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}
    predicted_label = sentiment_map[predicted_class]

    print(f"Input Text: {input_text}")
    print(f"Logits: {logits.cpu().numpy()}")
    print(f"Probabilities: {probabilities.cpu().numpy()}")
    print(f"Predicted Sentiment: {predicted_label}\n")

#Sample inputs from the test set or custom sentences
# Hand-written sentences used as a quick manual check of the model.
# NOTE(review): " Not bad" starts with a space; confirm this is intentional.
sample_texts = [
    "I love this product! It works great.",
    "The service was okay, nothing special.",
    "I'm very disappointed with the quality.",
    " Not bad",
    "Better improve"
]

#Run the test on each sample input
# test_model prints its findings and returns nothing, so the loop only needs to call it.
for text in sample_texts:
    test_model(text)


Input Text: I love this product! It works great.
Logits: [[-2.1035156 -0.7421875  3.9511719]]
Probabilities: [[0.00232015 0.00905176 0.9886281 ]]
Predicted Sentiment: positive

Input Text: The service was okay, nothing special.
Logits: [[-0.60058594  1.0126953  -0.87890625]]
Probabilities: [[0.14757298 0.74070626 0.11172076]]
Predicted Sentiment: neutral

Input Text: I'm very disappointed with the quality.
Logits: [[ 2.4824219  -0.06011963 -2.4628906 ]]
Probabilities: [[0.92099446 0.07245115 0.00655443]]
Predicted Sentiment: negative

Input Text:  Not bad
Logits: [[-2.1210938   0.47753906  1.9404297 ]]
Probabilities: [[0.01379157 0.18543243 0.80077595]]
Predicted Sentiment: positive

Input Text: Better improve
Logits: [[-1.2685547   0.7133789   0.85839844]]
Probabilities: [[0.06007439 0.43594548 0.50398004]]
Predicted Sentiment: positive

