In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Runtime dependencies: the csebuetnlp text normalizer (installed straight from
# GitHub) and Hugging Face transformers. Neither install is version-pinned.
!pip install git+https://github.com/csebuetnlp/normalizer
!pip install transformers

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

from transformers import AutoModelForPreTraining, AutoTokenizer, AutoModelForSequenceClassification
from normalizer import normalize

import numpy as np
import pandas as pd

import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Single seed shared by every random number generator the notebook uses.
rand_seed = 6

# Python, NumPy and PyTorch each keep their own generator, so seed all three.
random.seed(rand_seed)
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)

# On GPU runtimes also seed every CUDA device and force cuDNN onto
# deterministic kernels (autotuning disabled).
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(rand_seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Tokenizer published with the csebuetnlp/banglishbert checkpoint; it is shared
# by the length statistics and by every CustomDataset instance.
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglishbert")

Downloading (…)okenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/366k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Sentiment name -> integer class id used as the classification target.
label_map = {'neutral': 0, 'positive': 1, 'negative': 2}

# Raw spreadsheet; the 'Captions' and 'Label_Sentiment' columns are read below.
data = pd.read_excel('/content/drive/Shareddrives/TFQ/MemeSEN/multi-sent.xlsx')

# Parallel lists: caption text and its encoded label.
# A label missing from label_map raises KeyError here.
Xc = data['Captions'].tolist()
Y = [label_map[sentiment] for sentiment in data['Label_Sentiment']]

In [None]:
# Stratified 70 / 20 / 10 split into train / test / validation.
# Step 1 holds out 30% of the data; step 2 carves one third of that hold-out
# (10% of the full data) into the validation set and leaves the rest as test.
# Both steps reuse rand_seed so the split follows the notebook-wide seed
# instead of a second hard-coded copy of it.
Xc_train, Xc_holdout, Y_train, Y_holdout = train_test_split(
    Xc, Y, test_size=0.3, random_state=rand_seed, stratify=Y
)
Xc_test, Xc_valid, Y_test, Y_valid = train_test_split(
    Xc_holdout, Y_holdout, test_size=1/3, random_state=rand_seed, stratify=Y_holdout
)

In [None]:
# Token-length statistics used to choose max_length for the datasets.
# Each caption goes through the same str() + normalize() preprocessing that
# CustomDataset.__getitem__ applies, so the counts describe the text the model
# actually sees. tokenizer.tokenize() is used here, so any special tokens added
# at encoding time (add_special_tokens=True) are not included in these counts.
length = [
    len(tokenizer.tokenize(normalize(str(caption))))
    for caption in data['Captions']
]
print('100% Coverage Token Length:', max(length))
print('99.8% Coverage Token Length:', np.percentile(length, 99.8))

100% Coverage Token Length: 154
99.8% Coverage Token Length: 83.0


In [None]:
class Sampler:
    """Minimal sampler interface: subclasses yield dataset indices.

    Subclasses must override ``__iter__`` (an iterator over indices) and
    ``__len__`` (how many indices one pass yields).
    """

    def __init__(self, data_source):
        """Accept ``data_source`` for interface compatibility; it is not stored."""

    def __iter__(self):
        """Return an iterator over sample indices (abstract)."""
        raise NotImplementedError()

    def __len__(self):
        """Return the number of indices produced per pass (abstract)."""
        raise NotImplementedError()

class StratifiedSampler(Sampler):
    """Sampler that orders indices using a stratified shuffle of the labels.

    Args:
        class_vector: 1-D tensor of class labels, one per dataset item.
        batch_size: batch size of the consuming loader; only used to derive
            ``n_splits`` (number of whole batches in ``class_vector``).
    """

    def __init__(self, class_vector, batch_size):
        self.class_vector = class_vector
        self.n_splits = int(class_vector.size(0) / batch_size)

    def gen_sample_array(self):
        """Return one permutation of all indices as a NumPy array.

        The labels are split into two stratified halves (``test_size=0.5``)
        and the halves are concatenated. Only the first of the ``n_splits``
        configured splits is consumed.
        """
        n_items = self.class_vector.size(0)
        splitter = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        # Placeholder features: the splitter needs an X of matching length,
        # but only the labels drive the stratification.
        features = torch.randn(n_items, 2).numpy()
        targets = self.class_vector.numpy()
        splitter.get_n_splits(features, targets)

        first_half, second_half = next(splitter.split(features, targets))
        return np.hstack([first_half, second_half])

    def __iter__(self):
        """Generate a fresh index order on every pass."""
        order = self.gen_sample_array()
        return iter(order)

    def __len__(self):
        """Number of labels, i.e. the number of indices yielded per pass."""
        return len(self.class_vector)

In [None]:
class CustomDataset(Dataset):
    """Dataset that normalizes and tokenizes one text per item on access.

    Args:
        texts: indexable collection of raw texts.
        labels: indexable collection of labels, parallel to ``texts``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_length: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode item ``idx``.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and the untouched ``'label'`` for that index.
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Normalize first, then pad/truncate to exactly max_length tokens.
        encoded = self.tokenizer.encode_plus(
            normalize(raw_text),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_overflowing_tokens=False,
            return_special_tokens_mask=False,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        input_ids = encoded['input_ids'].flatten()
        attention_mask = encoded['attention_mask'].flatten()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': target
        }


In [None]:
batch_size = 128

# One dataset per split; all share the tokenizer and the padded length of 85.
train_dataset = CustomDataset(Xc_train, Y_train, tokenizer, max_length=85)
valid_dataset = CustomDataset(Xc_valid, Y_valid, tokenizer, max_length=85)
test_dataset = CustomDataset(Xc_test, Y_test, tokenizer, max_length=85)

# Training order comes from the stratified sampler; validation and test keep
# their original order.
sampler = StratifiedSampler(class_vector=torch.tensor(Y_train), batch_size=batch_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class LSTMModel(nn.Module):
    """Bidirectional LSTM classifier over token ids.

    Token ids are embedded, passed through a stacked bidirectional LSTM, and
    the LSTM output at the final time step is projected to ``output_dim``
    logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        output_dim: number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        # Forward and backward directions are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, x):
        """Compute logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, batch-first (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_dim) with unnormalized logits.

        Note:
            No attention mask is taken, so the final position is used as the
            sequence summary regardless of any padding in ``x``.
        """
        x = self.embedding(x)

        # Build the zero initial states on the same device/dtype as the
        # activations instead of relying on a notebook-level `device` global,
        # so the module works wherever it (and its input) has been moved.
        state_shape = (self.num_layers * 2, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        output, _ = self.lstm(x, (h0, c0))
        logits = self.fc(output[:, -1, :])
        return logits

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU. The model
# and every batch are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# BanglishBERT with a 3-way sequence-classification head
# (label_map defines the three classes: neutral / positive / negative).
model = AutoModelForSequenceClassification.from_pretrained('csebuetnlp/banglishbert', num_labels=3)
# Alternative kept for reference: the LSTM classifier defined earlier.
# model = LSTMModel(tokenizer.vocab_size, 128, 128, 2, 3)
model.to(device)

In [None]:
# Loss and optimizer used by train_model (both are read there as globals).
# CrossEntropyLoss is applied to the raw logits and the integer class labels.
criterion = torch.nn.CrossEntropyLoss()
# NOTE(review): betas[1]=0.9999 and eps=1e-09 differ from the AdamW defaults
# (0.999, 1e-08) — confirm these values are intentional.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=0.00001,
                              betas=(0.9, 0.9999),
                              eps=1e-09,
                              weight_decay=0.08)
# Alternative optimizer kept for reference:
# optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

In [None]:
path = '/content/drive/Shareddrives/TFQ/Model_Checkpoints/BanglishBERT/'
def train_model(model, start, end, name):
    """Train ``model`` for epochs ``start`` .. ``end - 1``, validating each epoch.

    Relies on the notebook-level ``train_loader``, ``valid_loader``,
    ``optimizer``, ``criterion``, ``device`` and ``path``. After every epoch
    the model's ``state_dict`` is saved to ``path + f'{name}_{epoch + 1}.pkl'``
    and a one-line summary is printed.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        start: zero-based index of the first epoch to run.
        end: zero-based index one past the last epoch; also printed as the
            epoch total.
        name: file-name prefix for the saved checkpoints.
    """
    for epoch in range(start, end):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        num_batches = 0
        for inputs in train_loader:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = inputs['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            # Accumulate so the reported loss is the epoch mean, not just the
            # value of whichever batch happened to come last.
            running_loss += loss.item()
            num_batches += 1
        epoch_loss = running_loss / num_batches if num_batches else float('nan')

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs in valid_loader:
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                labels = inputs['label'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                _, predicted = torch.max(outputs.logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty validation loader yields NaN instead of a ZeroDivisionError.
        val_accuracy = correct / total if total else float('nan')
        torch.save(model.state_dict(), path + f'{name}_{epoch + 1}.pkl')
        print(f"Epoch {epoch + 1}/{end}, Loss: {epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

In [None]:
# Run epochs 1-20; each epoch writes a checkpoint named BanglishBERT_<epoch>.pkl
# into the checkpoint directory held in `path`.
train_model(model, 0, 20, 'BanglishBERT')

Epoch 1/20, Loss: 0.8214, Validation Accuracy: 0.6247
Epoch 2/20, Loss: 0.7585, Validation Accuracy: 0.6888
Epoch 3/20, Loss: 0.7035, Validation Accuracy: 0.6865
Epoch 4/20, Loss: 0.7615, Validation Accuracy: 0.6201
Epoch 5/20, Loss: 0.5441, Validation Accuracy: 0.6865
Epoch 6/20, Loss: 0.4207, Validation Accuracy: 0.6957
Epoch 7/20, Loss: 0.4625, Validation Accuracy: 0.6522
Epoch 8/20, Loss: 0.2875, Validation Accuracy: 0.6888
Epoch 9/20, Loss: 0.3562, Validation Accuracy: 0.6842
Epoch 10/20, Loss: 0.3036, Validation Accuracy: 0.6796
Epoch 11/20, Loss: 0.3715, Validation Accuracy: 0.6934
Epoch 12/20, Loss: 0.2190, Validation Accuracy: 0.6270
Epoch 13/20, Loss: 0.0585, Validation Accuracy: 0.6087
Epoch 14/20, Loss: 0.0927, Validation Accuracy: 0.6453
Epoch 15/20, Loss: 0.0798, Validation Accuracy: 0.6407
Epoch 16/20, Loss: 0.0960, Validation Accuracy: 0.6476
Epoch 17/20, Loss: 0.0459, Validation Accuracy: 0.5995
Epoch 18/20, Loss: 0.0996, Validation Accuracy: 0.6407
Epoch 19/20, Loss: 

In [None]:
def get_report(model, weight):
    """Load a checkpoint into ``model`` and evaluate it on the test loader.

    Relies on the notebook-level ``test_loader`` and ``device``. The weights
    are loaded in place, so ``model`` keeps them after the call.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        weight: path of a ``state_dict`` file written by ``torch.save``.

    Returns:
        A string holding the confusion matrix, a newline, and the
        scikit-learn classification report.
    """
    # map_location makes checkpoints saved on a GPU loadable on whichever
    # device this runtime uses (e.g. a CPU-only session).
    # torch.load unpickles the file, so only load checkpoints you trust.
    state_dict = torch.load(weight, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs in test_loader:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = inputs['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

            y_true.extend(labels.tolist())
            y_pred.extend(predicted.tolist())
    return f'{confusion_matrix(y_true, y_pred)}\n{classification_report(y_true, y_pred)}'

In [None]:
# Test-set confusion matrix and classification report for the epoch-19
# checkpoint. get_report loads these weights into `model` in place.
print(get_report(model, '/content/drive/Shareddrives/TFQ/Model_Checkpoints/BanglishBERT/BanglishBERT_19.pkl'))

[[  6  17  35]
 [  8 169  93]
 [ 28 114 404]]
              precision    recall  f1-score   support

           0       0.14      0.10      0.12        58
           1       0.56      0.63      0.59       270
           2       0.76      0.74      0.75       546

    accuracy                           0.66       874
   macro avg       0.49      0.49      0.49       874
weighted avg       0.66      0.66      0.66       874



In [None]:
# Same report for the epoch-11 checkpoint. After this call `model` holds the
# epoch-11 weights.
print(get_report(model, '/content/drive/Shareddrives/TFQ/Model_Checkpoints/BanglishBERT/BanglishBERT_11.pkl'))

[[  1  15  42]
 [  7 168  95]
 [ 19 112 415]]
              precision    recall  f1-score   support

           0       0.04      0.02      0.02        58
           1       0.57      0.62      0.59       270
           2       0.75      0.76      0.76       546

    accuracy                           0.67       874
   macro avg       0.45      0.47      0.46       874
weighted avg       0.65      0.67      0.66       874

