In [1]:
!pip install transformers -U -q
!pip install accelerate -U -q
!pip install datasets -q
# !pip freeze | grep accelerate

In [2]:
!pip install scikit-learn -q
# sentence-transformers==2.2.2
# transformers==4.31.0
# !pip freeze | grep tokenizer

In [3]:
import torch
import random

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
!nvidia-smi

Sun Apr 21 20:16:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04             Driver Version: 535.171.04   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2070 ...    Off | 00000000:01:00.0  On |                  N/A |
| N/A   58C    P8              18W /  80W |    580MiB /  8192MiB |     38%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# class CustomMetricsCallback:
#     def __init__(self, num_labels):
#         self.num_labels = num_labels

#     def compute_metrics(self, pred):
#         labels = pred.label_ids
#         preds = pred.predictions.argmax(-1)

#         precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
#         acc = accuracy_score(labels, preds)

#         return {
#             'precision': precision,
#             'recall': recall,
#             'f1': f1,
#             'accuracy': acc,
#         }



class ClassificationAccuracyMeasurement:
    def __init__(self, data_path, n_rows):
        # self.data = pd.read_json(data_path)
        self.n_rows = n_rows
        self.data = pd.read_csv(data_path)
        from sklearn.utils import shuffle
        self.data.rename(columns={'text': 'clean_raw_text','sentiment':'category'}, inplace=True)
        self.data = shuffle(self.data)
        # self.data.sample(frac=1)
        self.data = self.data.groupby('category').head(1441).reset_index(drop=True)

        print(self.data['category'].value_counts())

        # self.data = self.data[:n_rows]
        # le = LabelEncoder()
        self.le = LabelEncoder()
        self.data['category'] = self.le.fit_transform(self.data['category'])
        self.metrics_name = 'f1'

        X = self.data.clean_raw_text
        y = self.data.category
        y = pd.factorize(y)[0]
        print(y)

        # Load Metrics
        self.metric = load_metric(self.metrics_name)

        test_size = 0.2

        # Split Data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X.tolist(), y, test_size=test_size)
        self.max_length = 512
        self.num_epochs = 5
        self.num_labels = self.data['category'].nunique()

    def calculate_accuracy(self,pred):
      from sklearn.metrics import accuracy_score, precision_recall_fscore_support

      labels = pred.label_ids
      preds = pred.predictions.argmax(-1)

      precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
      acc = accuracy_score(labels, preds)
      return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'accuracy': acc,
        }

    # Create Metrics
    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)


        # 'micro', 'macro', etc. are for multi-label classification. If you are running a binary classification, leave it as default or specify "binary" for average
        return self.metric.compute(predictions=predictions, references=labels, average="micro")

    def train(self, model_name):
        # Call the Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

        # Encode the text
        train_encodings = tokenizer(self.X_train, truncation=True, padding=True, max_length=self.max_length)
        valid_encodings = tokenizer(self.X_test, truncation=True, padding=True, max_length=self.max_length)
        # convert our tokenized data into a torch Dataset
        train_dataset = MakeTorchData(train_encodings, self.y_train.ravel())
        valid_dataset = MakeTorchData(valid_encodings, self.y_test.ravel())

        # Call Model
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels).to(device)

        training_args = TrainingArguments(
            output_dir='racism_results',  # output directory
            num_train_epochs=self.num_epochs,  # total number of training epochs
            per_device_train_batch_size=8,  # batch size per device during training
            per_device_eval_batch_size=20,  # batch size for evaluation
            warmup_steps=500,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='/content/drive/MyDrive/Bangla_NLP/transformer/MODELS/logs',  # directory for storing logs
            load_best_model_at_end=True,  # load the best model when finished training (default metric is loss)
            metric_for_best_model=self.metrics_name,  # select the base metrics
            logging_steps=200,  # log & save weights each logging_steps
            evaluation_strategy="epoch",  # evaluate each `logging_steps`
            save_strategy='epoch',
            save_total_limit = 1
        )

        # Call the Trainer
        trainer = Trainer(
            model=model,  # the instantiated Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=valid_dataset,  # evaluation dataset
            compute_metrics=self.calculate_accuracy,  # the callback that computes metrics of interest
        )

        # Train the model
        trainer.train()

        # Call the summary
        accuracy = trainer.evaluate()
        # print(res)
        # accuracy = self.calculate_accuracy(valid_dataset)
        # print()
        with open(f'/home/shihab/learning_projects/bangla-sentence-transformer/REPORT/classification_report_news_article_{self.num_epochs}_{self.n_rows}09.txt', 'a+') as tgt_file:
          tgt_file.write(f"{model_name} : Precision: {accuracy['eval_precision']}, Recall: {accuracy['eval_recall']}, F1: {accuracy['eval_f1']}, Accuracy: {accuracy['eval_accuracy']}\n")
        # print(f"{model_name} : Precision: {accuracy['precision']}, Recall: {accuracy['recall']}, F1: {accuracy['f1']}, Accuracy: {accuracy['accuracy']}")
        # with open('/content/drive/MyDrive/Bangla_NLP/LDA/REPORT/classification_report_racism.txt', 'a+') as tgt_file:
        #     tgt_file.write(f"{model_name} : {res['eval_f1']}\n")
        # print(f"{model_name} : {res['eval_f1']}")





In [7]:
if __name__ == '__main__':
    # df = pd.read_csv("DATA/bbc-text.csv")[:10]
    n_rows = 2882
    # n_rows = 100
    classifier = ClassificationAccuracyMeasurement('DATA/bn_sentiment.csv', n_rows)
    # sen_transformers = ['sentence-transformers/all-mpnet-base-v2',
                        # 'sentence-transformers/stsb-xlm-r-multilingual',
                        # 'l3cube-pune/indic-sentence-similarity-sbert']

    # sen_transformers = ['sentence-transformers/stsb-xlm-r-multilingual','/content/drive/MyDrive/Bangla_NLP/transformer/bangla_snt','intfloat/multilingual-e5-large','sentence-transformers/LaBSE','sentence-transformers/distiluse-base-multilingual-cased-v1']
    sen_transformers = ['sartifyllc/AviLaBSE','sentence-transformers/clip-ViT-B-32-multilingual-v1','sentence-transformers/LaBSE','sentence-transformers/distiluse-base-multilingual-cased-v2','sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2']
    sen_transformers = ['/home/shihab/learning_projects/bangla-sentence-transformer/transformer_model_2']
    for sen_trans in sen_transformers:
        print(f'_____________________________{sen_trans}________________________________________')
        classifier.train(sen_trans)

category
1    1441
0    1441
Name: count, dtype: int64
[0 1 1 ... 0 0 0]


_____________________________/home/shihab/learning_projects/bangla-sentence-transformer/transformer_model_2________________________________________


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /home/shihab/learning_projects/bangla-sentence-transformer/transformer_model_2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/1445 [00:01<32:53,  1.37s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 7.78 GiB of which 49.38 MiB is free. Including non-PyTorch memory, this process has 7.21 GiB memory in use. Of the allocated memory 6.85 GiB is allocated by PyTorch, and 238.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
data_json = pd.read_json('/content/drive/MyDrive/Bangla_NLP/LDA/DATA/bangla_news_article.json')
data_json.head()

FileNotFoundError: File /content/drive/MyDrive/Bangla_NLP/LDA/DATA/bangla_news_article.json does not exist

In [None]:
data_json['category'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_json['category'] = le.fit_transform(data_json['category'])
le.classes_

In [None]:
data_json.head()