In [2]:
!pip install transformers==4.21.1 -q
!pip install datasets==2.12.0 -q

!pip install accelerate -U -q
# !pip install torch
# !pip install transformers[torch]

In [3]:
import gc
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Shell out to nvidia-smi to display the GPU, driver and current memory usage.
!nvidia-smi

Tue Apr  2 21:22:34 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   48C    P5    26W /  30W |    424MiB /  8192MiB |     20%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:



class MakeTorchData(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable collection holding one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label collection defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn every encoded field of the requested example into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample


class ClassificationAccuracyMeasurement:
    """Fine-tune Hugging Face checkpoints on a labelled CSV and record their micro-F1.

    The CSV must provide a ``text`` column (inputs) and a ``sentiment`` column
    (class labels). Rows are shuffled, truncated to ``n_rows`` and split 80/20
    into train and evaluation sets once, at construction time, so every model
    passed to :meth:`train` is trained and evaluated on the same split.
    """

    # Directory the per-model F1 lines are appended to when no report_dir is given.
    DEFAULT_REPORT_DIR = '/home/shihab/learning_projects/bangla-sentence-transformer/REPORT'

    def __init__(self, data_path, n_rows, seed=None, report_dir=None):
        """Load the CSV, build the train/eval split and set the hyper-parameters.

        Args:
            data_path: Path of the CSV file to read.
            n_rows: Number of rows kept after shuffling.
            seed: Optional random state for the shuffle and the split. ``None``
                (the default) keeps the previous non-deterministic behaviour.
            report_dir: Directory for the F1 report file. Defaults to
                ``DEFAULT_REPORT_DIR``.
        """
        self.data = shuffle(pd.read_csv(data_path), random_state=seed)[:n_rows]
        self.metrics_name = 'f1'
        self.report_dir = self.DEFAULT_REPORT_DIR if report_dir is None else report_dir

        X = self.data.text
        # factorize maps every distinct sentiment value to an integer code 0..k-1
        # and also returns the k distinct values.
        y, classes = pd.factorize(self.data.sentiment)

        # Load Metrics
        self.metric = load_metric(self.metrics_name)

        test_size = 0.2

        # Split Data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X.tolist(), y, test_size=test_size, random_state=seed
        )
        self.max_length = 512
        self.num_epochs = 2
        # Size the classification head from the labels actually present instead
        # of assuming a binary task; never go below 2 so the model stays a
        # classifier even if the sample happens to contain a single class.
        self.num_labels = max(2, len(classes))

    def calculate_accuracy(self, pred):
        """Return micro precision/recall/F1 and accuracy for a prediction object.

        Args:
            pred: Object exposing ``label_ids`` (true labels) and
                ``predictions`` (per-class scores, arg-maxed over the last axis).

        Returns:
            Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
        """
        from sklearn.metrics import accuracy_score, precision_recall_fscore_support

        # Labels may arrive as a column vector; the scorers expect them 1-D.
        labels = np.asarray(pred.label_ids).reshape(-1)
        preds = pred.predictions.argmax(-1)

        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
        acc = accuracy_score(labels, preds)
        return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'accuracy': acc,
        }

    # Create Metrics
    def compute_metrics(self, eval_pred):
        """Metric callback for the Trainer: micro-averaged F1 of arg-max predictions.

        Args:
            eval_pred: ``(predictions, labels)`` pair; predictions hold one
                score per class for every example.

        Returns:
            Whatever ``self.metric.compute`` returns for the flattened inputs.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        # Each dataset item carries its label with shape (1,), so batches reach
        # this callback as (N, 1); flatten to line up with the 1-D predictions.
        labels = np.asarray(labels).reshape(-1)

        # The `average` argument selects how per-class scores are combined
        # ('micro', 'macro', ...); 'micro' counts every example equally.
        return self.metric.compute(predictions=predictions, references=labels, average="micro")

    def train(self, model_name):
        """Fine-tune ``model_name`` on the stored split and append its F1 to the report.

        Args:
            model_name: Hub id or local path accepted by ``from_pretrained``.

        Returns:
            The metrics dict produced by ``trainer.evaluate()``.
        """
        # Call the Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

        # Encode the text
        train_encodings = tokenizer(self.X_train, truncation=True, padding=True, max_length=self.max_length)
        valid_encodings = tokenizer(self.X_test, truncation=True, padding=True, max_length=self.max_length)
        # convert our tokenized data into a torch Dataset
        train_dataset = MakeTorchData(train_encodings, self.y_train.ravel())
        valid_dataset = MakeTorchData(valid_encodings, self.y_test.ravel())

        model = None
        trainer = None
        try:
            # Call Model
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels).to(device)

            training_args = TrainingArguments(
                output_dir='./results',  # output directory
                num_train_epochs=self.num_epochs,  # total number of training epochs
                per_device_train_batch_size=8,  # batch size per device during training
                per_device_eval_batch_size=20,  # batch size for evaluation
                warmup_steps=500,  # number of warmup steps for learning rate scheduler
                weight_decay=0.01,  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
                load_best_model_at_end=True,  # reload the best checkpoint when training finishes
                metric_for_best_model=self.metrics_name,  # metric used to pick the best checkpoint
                logging_steps=200,  # log every 200 optimisation steps
                save_steps=200,  # not used while save_strategy is 'epoch'
                evaluation_strategy="epoch",  # evaluate at the end of every epoch
                save_strategy='epoch',  # checkpoint at the end of every epoch
                save_total_limit=1
            )

            # Call the Trainer
            trainer = Trainer(
                model=model,  # the instantiated Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # training dataset
                eval_dataset=valid_dataset,  # evaluation dataset
                compute_metrics=self.compute_metrics,  # the callback that computes metrics of interest
            )

            # Train the model
            trainer.train()

            # Call the summary
            accuracy = trainer.evaluate()
        finally:
            # Drop this frame's references to the model and trainer and return
            # cached CUDA blocks, so the next model in a loop (or a re-run after
            # a failure) does not start with the GPU already full.
            del trainer, model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        print(accuracy)
        # Make sure the report directory exists before appending to the file.
        os.makedirs(self.report_dir, exist_ok=True)
        report_path = os.path.join(
            self.report_dir, f'classification_report_news_article_{self.num_epochs}999.txt'
        )
        with open(report_path, 'a+', encoding='utf-8') as tgt_file:
            tgt_file.write(f"{model_name} : F1: {accuracy['eval_f1']}\n")
        print(f"{model_name} : {accuracy['eval_f1']}")
        return accuracy




In [11]:
# Collect unreachable Python objects first: empty_cache() can only hand back
# cached blocks whose tensors are no longer referenced. torch.no_grad() has no
# influence on the allocator, so it is not needed here.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [12]:
# Shell out to nvidia-smi again to check GPU memory usage after clearing the cache.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Apr  2 21:26:21 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   49C    P5    30W /  30W |   7878MiB /  8192MiB |     31%      Default |
|                               |            

In [7]:
if __name__ == '__main__':
    # Number of CSV rows handed to the classifier for its train/eval split.
    n_rows = 2882
    classifier = ClassificationAccuracyMeasurement('/home/shihab/learning_projects/bangla-sentence-transformer/DATA/bn_sentiment.csv', n_rows)

    # Checkpoints fine-tuned in this run. Other candidates previously listed in
    # this cell (the first three were assigned and then immediately overwritten):
    #   sentence-transformers/distiluse-base-multilingual-cased-v2
    #   sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
    #   sentence-transformers/quora-distilbert-multilingual
    #   sentence-transformers/all-mpnet-base-v2
    #   sentence-transformers/stsb-xlm-r-multilingual
    #   sentence-transformers/clip-ViT-B-32-multilingual-v1
    #   sentence-transformers/LaBSE
    #   sentence-transformers/distiluse-base-multilingual-cased-v1
    #   intfloat/multilingual-e5-large
    #   l3cube-pune/indic-sentence-similarity-sbert
    sen_transformers = ['bangla_snt']
    for sen_trans in sen_transformers:
        print(f'_____________________________\n{sen_trans}\n________________________________________')
        classifier.train(sen_trans)

_____________________________
bangla_snt
________________________________________


Some weights of the model checkpoint at bangla_snt were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at bangla_snt and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it

OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB. GPU 0 has a total capacity of 7.78 GiB of which 85.75 MiB is free. Including non-PyTorch memory, this process has 7.31 GiB memory in use. Of the allocated memory 6.38 GiB is allocated by PyTorch, and 230.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Scratch cell: a bare model id; the trailing comma makes this expression a 1-tuple.
"sentence-transformers/quora-distilbert-multilingual",

In [None]:
if __name__ == '__main__':
    # Small-sample run (100 rows) over several checkpoints.
    n_rows = 100
    classifier = ClassificationAccuracyMeasurement(
        '/home/shihab/learning_projects/bangla-sentence-transformer/DATA/bn_sentiment.csv',
        n_rows,
    )

    # One checkpoint per line; each is fine-tuned in turn by the loop below.
    sen_transformers = [
        '/content/drive/MyDrive/Bangla_NLP/transformer/bangla_snt',
        'sentence-transformers/LaBSE',
        'sentence-transformers/clip-ViT-B-32-multilingual-v1',
        'sentence-transformers/all-MiniLM-L6-v2',
        'sentence-transformers/all-mpnet-base-v2',
        'sentence-transformers/stsb-xlm-r-multilingual',
        'l3cube-pune/indic-sentence-bert-nli',
        'l3cube-pune/indic-sentence-similarity-sbert',
    ]
    for sen_trans in sen_transformers:
        print(f'_____________________________{sen_trans}________________________________________')
        classifier.train(sen_trans)


    # sbert_trainer, sbert_model = TextClassification_with_Transformer(
    #     model_name='sentence-transformers/all-mpnet-base-v2',
    #     Data=df.text,
    #     Target=df.category,
    #     test_size=0.33,
    #     max_length=512,
    #     num_labels=5,
    #     num_epochs=3,
    #     metrics_name='f1')
    # print(0)

In [None]:
# google.colab only exists inside a Colab runtime; when the notebook runs
# elsewhere, skip the mount instead of failing on the import.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')