In [2]:
!pip install transformers -U -q
!pip install accelerate -U -q
!pip install datasets -q
# !pip freeze | grep accelerate

In [5]:
!pip install scikit-learn -q
# sentence-transformers==2.2.2
# transformers==4.31.0
# !pip freeze | grep tokenizer

In [6]:
import torch
import random

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [7]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [8]:
# Shell out to `nvidia-smi` to show the GPU, driver/CUDA version and current memory usage.
!nvidia-smi

Mon Apr  1 21:04:04 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   52C    P8    13W /  80W |    406MiB /  8192MiB |     15%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [14]:
class MakeTorchData(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping of field name to a per-sample sequence of values
            (anything supporting ``.items()`` and integer indexing of values).
        labels: Indexable, sized collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor; the label is added
        under ``"labels"`` as a one-element tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# class CustomMetricsCallback:
#     def __init__(self, num_labels):
#         self.num_labels = num_labels

#     def compute_metrics(self, pred):
#         labels = pred.label_ids
#         preds = pred.predictions.argmax(-1)

#         precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
#         acc = accuracy_score(labels, preds)

#         return {
#             'precision': precision,
#             'recall': recall,
#             'f1': f1,
#             'accuracy': acc,
#         }



class ClassificationAccuracyMeasurement:
    """Fine-tune sequence-classification checkpoints on one labelled CSV and log their scores.

    The constructor loads the CSV, caps the number of rows per category,
    label-encodes the categories and makes a single train/test split.
    ``train`` can then be called once per checkpoint name; every call
    fine-tunes on the same split and appends one line to a report file.
    """

    def __init__(self, data_path, n_rows, text_column='clean_raw_text',
                 label_column='category', max_per_category=500,
                 test_size=0.2, seed=42):
        """Load, balance, encode and split the dataset.

        Args:
            data_path: Path of the CSV file to read.
            n_rows: Accepted for backward compatibility; currently unused.
            text_column: Name of the column holding the input text.
            label_column: Name of the column holding the class label.
            max_per_category: Maximum number of rows kept per label.
            test_size: Fraction of rows held out for evaluation.
            seed: Seed for shuffling and splitting; ``None`` gives a
                different shuffle/split on every run.

        Raises:
            KeyError: If ``text_column`` or ``label_column`` is not a column
                of the CSV.
        """
        self.data = pd.read_csv(data_path)

        # Fail early with an actionable message instead of a bare
        # KeyError('category') raised from deep inside pandas.
        missing = [col for col in (text_column, label_column) if col not in self.data.columns]
        if missing:
            raise KeyError(
                f"{data_path} is missing required column(s) {missing}; "
                f"available columns: {list(self.data.columns)}"
            )

        self.text_column = text_column
        self.label_column = label_column
        self.seed = seed

        # Rows without text or label cannot be tokenized/encoded.
        self.data = self.data.dropna(subset=[text_column, label_column])
        # Shuffle first so that the per-category cap keeps a random subset.
        self.data = self.data.sample(frac=1, random_state=seed)
        self.data = self.data.groupby(label_column).head(max_per_category).reset_index(drop=True)

        print(self.data[label_column].value_counts())

        # The label column is overwritten with integer ids; `self.le.classes_`
        # maps those ids back to the original category names.
        self.le = LabelEncoder()
        self.data[label_column] = self.le.fit_transform(self.data[label_column])
        self.metrics_name = 'f1'

        X = self.data[text_column].astype(str)
        # Use the LabelEncoder ids directly. Re-encoding them (e.g. with
        # pd.factorize) would renumber the classes by order of appearance and
        # break the correspondence with `self.le.classes_`.
        y = self.data[label_column].to_numpy()

        # Load Metrics
        self.metric = load_metric(self.metrics_name)

        # Split Data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X.tolist(), y, test_size=test_size, random_state=seed
        )
        self.max_length = 512
        self.num_epochs = 2
        self.num_labels = len(self.le.classes_)

    def calculate_accuracy(self, pred):
        """Compute precision, recall, F1 and accuracy for a Trainer prediction.

        Args:
            pred: Object exposing ``label_ids`` and ``predictions`` (logits
                with the class scores on the last axis).

        Returns:
            dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
            Precision/recall/F1 are micro-averaged; for single-label data
            where every class is counted, micro-averaged scores coincide
            with accuracy.
        """
        from sklearn.metrics import accuracy_score, precision_recall_fscore_support

        # The dataset wraps each label in a one-element tensor, so label_ids
        # can arrive as an (N, 1) column; flatten it to (N,).
        labels = np.asarray(pred.label_ids).ravel()
        preds = pred.predictions.argmax(-1)

        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
        acc = accuracy_score(labels, preds)
        return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'accuracy': acc,
        }

    # Create Metrics
    def compute_metrics(self, eval_pred):
        """Score ``(logits, labels)`` with the metric loaded in ``__init__``.

        Args:
            eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
                the class scores along axis 1.

        Returns:
            Whatever ``self.metric.compute`` returns for the micro average.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        # Flatten (N, 1) label columns so references line up with predictions.
        labels = np.asarray(labels).ravel()

        # 'micro', 'macro', etc. select how per-class scores are averaged for
        # multiclass/multilabel data. For binary classification leave the
        # default or pass "binary" for average.
        return self.metric.compute(predictions=predictions, references=labels, average="micro")

    def train(self, model_name):
        """Fine-tune ``model_name`` on the stored split and append its scores to the report.

        Args:
            model_name: Checkpoint name or path understood by
                ``AutoTokenizer``/``AutoModelForSequenceClassification``.

        Returns:
            The metrics dict produced by ``trainer.evaluate()``.
        """
        # Call the Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

        # Encode the text
        train_encodings = tokenizer(self.X_train, truncation=True, padding=True, max_length=self.max_length)
        valid_encodings = tokenizer(self.X_test, truncation=True, padding=True, max_length=self.max_length)
        # convert our tokenized data into a torch Dataset
        train_dataset = MakeTorchData(train_encodings, self.y_train.ravel())
        valid_dataset = MakeTorchData(valid_encodings, self.y_test.ravel())

        # Call Model (`device` is the module-level torch.device chosen earlier in the notebook)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels).to(device)

        training_args = TrainingArguments(
            output_dir='/racism_results',  # output directory
            num_train_epochs=self.num_epochs,  # total number of training epochs
            per_device_train_batch_size=8,  # batch size per device during training
            per_device_eval_batch_size=20,  # batch size for evaluation
            # NOTE(review): 500 warmup steps may cover most of a short run;
            # confirm against the actual number of training steps.
            warmup_steps=500,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='/content/drive/MyDrive/Bangla_NLP/transformer/MODELS/logs',  # directory for storing logs
            load_best_model_at_end=True,  # reload the best checkpoint when training finishes
            metric_for_best_model=self.metrics_name,  # metric that decides which checkpoint is best
            logging_steps=200,  # log training progress every 200 steps
            evaluation_strategy="epoch",  # evaluate at the end of every epoch
            save_strategy='epoch',  # checkpoint every epoch, in step with evaluation
            save_total_limit=1,
        )

        # Call the Trainer
        trainer = Trainer(
            model=model,  # the instantiated Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=valid_dataset,  # evaluation dataset
            compute_metrics=self.calculate_accuracy,  # the callback that computes metrics of interest
        )

        # Train the model
        trainer.train()

        # Evaluate on the held-out split; Trainer prefixes metric names with "eval_".
        metrics = trainer.evaluate()

        # Append, so that successive checkpoints accumulate in one report file.
        with open(f'/home/shihab/learning_projects/bangla-sentence-transformer/REPORT/classification_report_news_article_{self.num_epochs}99.txt', 'a+', encoding='utf-8') as tgt_file:
            tgt_file.write(f"{model_name} : Precision: {metrics['eval_precision']}, Recall: {metrics['eval_recall']}, F1: {metrics['eval_f1']}, Accuracy: {metrics['eval_accuracy']}\n")

        return metrics





In [15]:
if __name__ == '__main__':
    # NOTE(review): `n_rows` is passed to the constructor, but the class never
    # reads it; the per-category cap inside the class decides how many rows
    # are used. (An earlier `n_rows = 2882` was immediately overwritten and
    # has been removed.)
    n_rows = 100

    # NOTE(review): the recorded run of this cell stopped with
    # KeyError: 'category', so this CSV presumably lacks the `category`
    # column the class groups by. Confirm the intended dataset/path.
    classifier = ClassificationAccuracyMeasurement('/home/shihab/learning_projects/bangla-sentence-transformer/DATA/bn_sentiment.csv', n_rows)

    # Checkpoint lists from earlier experiments, kept for reference:
    # sen_transformers = ['sentence-transformers/all-mpnet-base-v2',
                        # 'sentence-transformers/stsb-xlm-r-multilingual',
                        # 'l3cube-pune/indic-sentence-similarity-sbert']

    # sen_transformers = ['sentence-transformers/stsb-xlm-r-multilingual','/content/drive/MyDrive/Bangla_NLP/transformer/bangla_snt','intfloat/multilingual-e5-large','sentence-transformers/LaBSE','sentence-transformers/distiluse-base-multilingual-cased-v1']

    # Checkpoints compared in this run; each is fine-tuned on the same split.
    sen_transformers = [
        'sentence-transformers/clip-ViT-B-32-multilingual-v1',
        'sentence-transformers/LaBSE',
        'sentence-transformers/distiluse-base-multilingual-cased-v2',
        'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    ]
    for sen_trans in sen_transformers:
        print(f'_____________________________{sen_trans}________________________________________')
        classifier.train(sen_trans)

KeyError: 'category'

In [None]:
# Read the news-article JSON file from Google Drive into a DataFrame and preview the first rows.
data_json = pd.read_json(
    path_or_buf='/content/drive/MyDrive/Bangla_NLP/LDA/DATA/bangla_news_article.json',
)
data_json.head()

Unnamed: 0,category,raw_text,clean_raw_text
0,technology,"দরজা, দেয়াল, মাটিসহ আশপাশের জড়বস্তুকে জীবন্ত ...",দরজা দেয়াল মাটিসহ আশপাশের জড়বস্ত...
1,economy,জাতীয় সংসদে আজ মঙ্গলবার ২০১৫-১৬ অর্থবছরের বাজে...,জাতীয় সংসদে মঙ্গলবার অর্থবছরের ব...
2,sports,ম্যানচেস্টার সিটি কোচ পেপ গার্দিওলা জাতীয়তাবাদ...,ম্যানচেস্টার সিটি কোচ পেপ গার্দি...
3,technology,ফেসবুক! এই অনলাইনের যুগে ফেসবুকের সঙ্গে পরিচি...,ফেসবুক অনলাইনের যুগে ফেসবুকের পর...
4,রাজনীতি,চট্টগ্রামের হাটহাজারীতে সহিংসতার ঘটনায় হেফাজতে...,চট্টগ্রামের হাটহাজারীতে সহিংসতার ঘটনায় হেফাজতে...


In [None]:
# Number of rows per distinct value of the `category` column.
data_json['category'].value_counts()

In [None]:
# `LabelEncoder` is already imported in the notebook's import cell, so it is
# not imported again here.
le = LabelEncoder()
# NOTE: this overwrites `category` in place with integer ids. Re-running the
# cell would fit the encoder on those ids, and `le.classes_` would then no
# longer list the original category names; reload `data_json` before re-running.
data_json['category'] = le.fit_transform(data_json['category'])
# Category names, ordered by the integer id assigned to each.
le.classes_

In [None]:
# Preview the frame again; if the encoding cell above has been run, `category` now holds integer ids.
data_json.head()