In [None]:
import torch
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
from nltk.tokenize import WhitespaceTokenizer
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
drive.mount('/content/drive/')
import string
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import copy

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!ls drive/MyDrive/others/Data/Validation
! ls drive/MyDrive/others/Test/bangla

valid_Bangla.csv  valid_Hindi.csv  valid_Magahi.csv
test.csv


## Data Loading And Preprocessing

In [None]:
# Load the Bangla train / validation / test splits and lowercase the text
# column of each one.
train_df = pd.read_csv('drive/MyDrive/others/Data/Training/train_Bangla.csv')
train_df['sentence'] = train_df['sentence'].str.lower()
valid_df = pd.read_csv('drive/MyDrive/others/Data/Validation/valid_Bangla.csv')
valid_df['sentence'] = valid_df['sentence'].str.lower()
test_df = pd.read_csv('drive/MyDrive/others/Test/bangla/test.csv')
# Normalise the test split from its OWN column. Assigning valid_df's column
# here would (index-aligned) replace the test sentences with validation text
# while keeping the test labels, making every test metric meaningless.
test_df['sentence'] = test_df['sentence'].str.lower()

# Whitespace tokenizer used wherever sentences are split into tokens.
tokenizer = WhitespaceTokenizer()

punct = string.punctuation


def remove_punct_one(txt):
  """Return `txt` with every character found in `punct` deleted.

  `punct` is the module-level `string.punctuation` string; characters that
  are not in it are left untouched.
  """
  # A translation table that maps each punctuation character to None deletes
  # all of them in a single pass over the string.
  deletion_table = str.maketrans("", "", punct)
  return txt.translate(deletion_table)

# Strip punctuation from the text column of every split.
for split_df in (train_df, valid_df, test_df):
  split_df['sentence'] = split_df['sentence'].apply(remove_punct_one)

### Vocab implementation on pre-processed dataset

In [None]:
def prepare_datasets_and_vocab(data_frame):
    """Build a vocabulary from the `sentence` column of `data_frame`.

    Every sentence is split with the module-level `tokenizer` and the
    resulting token lists are passed to `build_vocab_from_iterator`.
    "<unk>" is registered as the only special token and is set as the
    default index, so tokens absent from the vocabulary resolve to it.

    Returns:
        The vocabulary object produced by `build_vocab_from_iterator`.
    """
    tokenized_sentences = [tokenizer.tokenize(text) for text in data_frame['sentence']]
    vocabulary = build_vocab_from_iterator(tokenized_sentences, specials=["<unk>"])
    unk_index = vocabulary['<unk>']
    vocabulary.set_default_index(unk_index)
    return vocabulary

In [None]:
# Build the vocabulary from the training split only; the reported size counts
# the "<unk>" special token as well as the training tokens.
vocab = prepare_datasets_and_vocab(train_df)
print(f"Unique Words: {len(vocab)}")

Unique Words: 14287


In [None]:
# Class balance of the training labels (last expression is displayed).
print('Sentiment Labels:')
train_df['sentiment'].value_counts()

Senitment Labels:


Positive    293
Negative    247
Neutral     163
Name: sentiment, dtype: int64

## Model Architecture
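Embedding size: 150

BI-LSTM: 3 layers, hidden size 64 per direction (64*2 = 128 output features)

Fully Connected: 128 -> 64

Fully Connected: 64 -> 3 (Negative / Neutral / Positive)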

In [None]:
class SentimentModel(torch.nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size=64, num_layers=3, num_classes=3):
        '''Embedding -> stacked bidirectional LSTM -> two linear layers.

        Args:
            vocab_size: number of rows in the embedding table.
            emb_size: dimensionality of each token embedding.
            hidden_size: LSTM hidden units per direction (default 64).
            num_layers: number of stacked LSTM layers (default 3).
            num_classes: number of output logits (default 3).
        '''
        super(SentimentModel, self).__init__()

        self.gen_embedding = torch.nn.Embedding(vocab_size, emb_size)

        # batch_first=True: the LSTM consumes (batch, seq_len, emb_size) and
        # each direction produces hidden_size features.
        self.bilstm = torch.nn.LSTM(emb_size, hidden_size,
                                    num_layers=num_layers, batch_first=True, bidirectional=True)
        # 2 * hidden_size because the forward and backward final states are
        # concatenated before this layer.
        self.fc1 = torch.nn.Linear(2 * hidden_size, hidden_size)

        self.class_layer = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, X):
        '''Return unnormalised class logits of shape (batch, num_classes).

        Args:
            X: integer token ids of shape (batch, seq_len).
        '''
        emb = self.gen_embedding(X)

        _, (h_n, _) = self.bilstm(emb)
        # h_n is (num_layers * 2, batch, hidden_size); its last two entries are
        # the top layer's final forward and final backward hidden states.
        # Indexing the per-step output at the last timestep instead would pick
        # a backward state that has read only the final token of the sequence.
        sentence_repr = torch.cat((h_n[-2], h_n[-1]), dim=1)
        # NOTE(review): there is no nonlinearity between fc1 and class_layer,
        # so together they are equivalent to a single linear map.
        fc1 = self.fc1(sentence_repr)
        classout = self.class_layer(fc1)
        return classout



In [None]:
class SentDataset(Dataset):
  """Map-style dataset of fixed-length token-id tensors and integer labels.

  Args:
    data_df: frame whose rows unpack positionally as (sentence, label).
    tokenizer: object exposing `tokenize(str) -> list[str]`.
    vocab: callable mapping a list of tokens to a list of integer ids.
    maxlen: length every encoded sentence is truncated / right-padded to.

  Raises:
    KeyError: if a label is not a key of `label_encoding`.
  """

  def __init__(self, data_df, tokenizer, vocab, maxlen=128):
    self.tokenizer = tokenizer
    self.vocab = vocab
    self.data_df = data_df
    self.maxlen = maxlen
    self.label_encoding = { # add more if more sentiment labels are present
        "Negative": 0,
        "Neutral": 1,
        "Positive":2,
    }
    self.punct = string.punctuation
    self.encoded_data = []
    self._build()

  def _build(self):
    """Encode every row once, up front, into `self.encoded_data`."""
    for sentence, label in self.data_df.values:
      tokens = self.tokenizer.tokenize(sentence)
      # Truncate to maxlen (not a hard-coded 128) so that every tensor has
      # exactly `maxlen` entries whatever value the caller passed.
      token_ids = list(self.vocab(tokens[:self.maxlen]))
      # Right-pad with id 0.
      # NOTE(review): if the vocab assigns index 0 to "<unk>", padding and
      # unknown tokens are indistinguishable to the model - confirm intended.
      enc_tokens = token_ids + [0] * (self.maxlen - len(token_ids))
      lab = self.label_encoding[label]
      self.encoded_data.append([torch.tensor(enc_tokens), torch.tensor(lab)])

  def __getitem__(self, index):
    """Return the `[token_ids, label]` pair stored at `index`."""
    return self.encoded_data[index]

  def __len__(self):
    """Return the number of encoded rows."""
    return len(self.encoded_data)


In [None]:
# Encode each split with the shared tokenizer and training vocabulary.
train_dataset, valid_dataset, test_dataset = (
    SentDataset(split_frame, tokenizer, vocab)
    for split_frame in (train_df, valid_df, test_df)
)

In [None]:
# Shuffle only the training batches: without it every epoch visits the rows in
# file order, so batch composition never changes between epochs. Validation
# order does not affect the metrics and is left fixed.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32)
# Embedding size 150; the remaining layer sizes use the model's defaults.
model = SentimentModel(len(vocab), 150)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
# Set by the training loop to a copy of the model with the best validation F1.
best_model = None


In [None]:
def eval_loop(pred_model, dataloader):
  """Run `pred_model` over `dataloader` and return macro-averaged metrics.

  The model is switched to eval mode and run without gradient tracking. The
  predicted class of each example is the argmax over the last dimension of
  the model output.

  Returns:
    dict with keys 'macro_precision', 'macro_recall' and 'macro_f1'.
  """
  progress = tqdm(dataloader)
  predicted_labels, gold_labels = [], []
  pred_model.eval()
  with torch.no_grad():
    for inputs, targets in progress:
      logits = pred_model(inputs)
      predicted_labels.extend(logits.argmax(dim=-1).tolist())
      gold_labels.extend(targets.tolist())

  # zero_division=0: a class that is never predicted scores 0 instead of
  # triggering a warning.
  metric_options = dict(average='macro', zero_division=0)
  return {
      'macro_precision': precision_score(gold_labels, predicted_labels, **metric_options),
      'macro_recall': recall_score(gold_labels, predicted_labels, **metric_options),
      'macro_f1': f1_score(gold_labels, predicted_labels, **metric_options),
  }

In [None]:
N_EPOCHS = 64
best_f1 = -1
for epoch in range(N_EPOCHS):
  # ---- training pass ----
  train_epoch_loss = 0
  model.train()
  train_ite = tqdm(train_dataloader)
  for batch in train_ite:
    x, l = batch
    output = model(x)
    loss = loss_fn(output, l)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_ite.set_postfix({'training loss': loss.item()})
    # loss is the mean over the batch; weight it by the batch size so that
    # dividing by the dataset size below yields a true per-sample average.
    train_epoch_loss += loss.item() * x.size(0)

  # ---- validation pass ----
  validation_epoch_loss = 0
  valid_ite = tqdm(valid_dataloader)
  preds = []
  golds = []
  model.eval()
  with torch.no_grad():
    for batch in valid_ite:
      x, y = batch
      pred_out = model(x)
      preds += pred_out.argmax(dim=-1).tolist()
      golds += y.tolist()
      val_loss = loss_fn(pred_out, y)
      valid_ite.set_postfix({'validation loss': val_loss.item()})
      # Accumulate over all batches (plain assignment would keep only the
      # last batch's loss), weighted by batch size as in the training pass.
      validation_epoch_loss += val_loss.item() * x.size(0)

  print()
  print('--'*20, ' Validation Scores ', '--'*20)
  precise_macro = precision_score(golds, preds, average='macro', zero_division=0)
  recall_macro = recall_score(golds, preds, average='macro', zero_division=0)
  f1_macro = f1_score(golds, preds, average='macro', zero_division=0)
  # Keep a frozen copy of the weights with the best validation macro-F1.
  if f1_macro > best_f1:
    best_model = copy.deepcopy(model)
    best_f1 = f1_macro
  print(f'EPOCH: {epoch}')
  print(f'avg training loss: {(train_epoch_loss/len(train_dataset))}, avg validation loss:{(validation_epoch_loss/len(valid_dataset))}')
  print(f'precision:{precise_macro}, recall:{recall_macro}, f1:{f1_macro}')



## TESTING

In [None]:
# Evaluate `best_model` (the copy kept at the best validation macro-F1) on the
# test split; the returned metrics dict is displayed as the cell output.
test_dataloader = DataLoader(test_dataset, batch_size=16)
eval_loop(best_model, test_dataloader)

100%|██████████| 10/10 [00:00<00:00, 19.11it/s]


{'macro_precision': 0.3600362823831331,
 'macro_recall': 0.3631226914245782,
 'macro_f1': 0.348537452885279}