## Task 3: Hindi/Bengali BiLSTM Binary Classifier with Joint Dual-Input Learning, Self-Attention, and Transfer Learning

In [1]:
from bengali_data import BengaliData
from hindi_data import HASOCData
from model import SentimentNet
from train import train_model 
from hindi_eval import evaluate_hindi_test_set
from bengali_eval import evaluate_bengali_test_set
import torch
import pickle
from config import config_dict
from torch import nn

In [2]:

def _load_embedding_weights(embeddings_path, vocab_size, embedding_size, language):
    """Load a pickled pre-trained embedding matrix and validate its shape.

    Args:
        embeddings_path: path of the pickle file that stores the embedding matrix.
        vocab_size: number of entries in the training vocabulary.
        embedding_size: embedding dimension expected by the embedding layer.
        language: language name inserted into the failure message (e.g. "Bengali").

    Returns:
        The unpickled weights object, exactly as stored (it is NOT transposed here).

    Raises:
        AssertionError: if the transposed matrix is not of shape
            (vocab_size, embedding_size).
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only point the embeddings path at files from a trusted source.
    with open(embeddings_path, 'rb') as f:
        weights = pickle.load(f)

    ## check whether the pre-trained embeddings are the same shape as of train vocabulary
    expected_shape = (vocab_size, embedding_size)
    actual_shape = weights.T.shape
    assert actual_shape == expected_shape, (
        "Pre-trained {} embeddings size not equal to size of embedding layer "
        "(got {}, expected {})".format(language, actual_shape, expected_shape)
    )
    return weights


## build the dataset objects for both languages from the configured file paths
bengali_data = BengaliData(config_dict['bengali_file_paths'])
hindi_data = HASOCData(config_dict['hindi_file_paths'])

## load the pre-trained embeddings of each language and validate them against its vocabulary
bengali_embedding_weights = _load_embedding_weights(
    config_dict['bengali_file_paths']['embeddings_path'],
    len(bengali_data.vocab), config_dict['embedding_size'], "Bengali")
hindi_embedding_weights = _load_embedding_weights(
    config_dict['hindi_file_paths']['embeddings_path'],
    len(hindi_data.vocab), config_dict['embedding_size'], "Hindi")

## wrap the transposed pre-trained embedding matrices as float tensors
hindi_weights_tensor = torch.FloatTensor(hindi_embedding_weights.T)
bengali_weights_tensor = torch.FloatTensor(bengali_embedding_weights.T)

## instantiate the network; every hyper-parameter is read from the config file
model = SentimentNet(
    batch_size=config_dict['batch_size'],
    output_size=config_dict['num_classes'],
    bengali_vocab_size=len(bengali_data.vocab),
    hidden_size=config_dict['hidden_size'],
    embedding_size=config_dict['embedding_size'],
    hindi_weights=hindi_weights_tensor,
    bengali_weights=bengali_weights_tensor,
    lstm_layers=config_dict['lstm_layers'],
    device=config_dict['device'],
    hindi_vocab_size=len(hindi_data.vocab),
    bidirectional=config_dict['is_bi_lstm'],
    pretrained_path=config_dict['bengali_file_paths']['pretrained_path'],
    self_attention_config=config_dict['self_attention_config'],
    fc_hidden_size=config_dict['fc_hidden_size'],
)
## move the model onto the configured device
model = model.to(config_dict['device'])

## when pre-training is enabled in the config, initialise layers from the pre-trained checkpoint
if config_dict['pretraining']:
    model.load_pretrained_layers()

## build one data loader per language, both using the configured batch size
loader_batch_size = config_dict['batch_size']
bengali_dataloader = bengali_data.get_data_loader(batch_size=loader_batch_size)
hindi_dataloader = hindi_data.get_data_loader(batch_size=loader_batch_size)

## hand Adam only the parameters that still require gradients; frozen weights
## (the embedding weights, which won't be optimized) are left out
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.Adam(trainable_parameters)



## training the model on train set
## Training is disabled by default, exactly as before (the call used to be wrapped in a
## string literal, which also made the cell display that string as its output).
## Set RUN_TRAINING to True to retrain the model from this notebook.
RUN_TRAINING = False
if RUN_TRAINING:
    train_model(model, optimizer, bengali_dataloader=bengali_dataloader, hindi_dataloader=hindi_dataloader,
                hindi_data=hindi_data, bengali_data=bengali_data, max_epochs=config_dict['epochs'], config_dict=config_dict)



INFO:root:loading and preprocessing data...
INFO:root:reading and preprocessing data completed...
INFO:root:creating vocabulary...
INFO:root:creating vocabulary completed...
INFO:root:loading and preprocessing data...
INFO:root:reading and preprocessing data completed...
INFO:root:creating vocabulary...
INFO:root:creating vocabulary completed...


artefacts/pre_trained_hindi/hindi_classifier_attention_h32_l8_p0_0_r10.pth
Loading pretrained weights...
Skipping the following layer(s): word_embeddings.weight
Skipping the following layer(s): self_attention.layer1.weight
Skipping the following layer(s): self_attention.layer2.weight


"train_model(model, optimizer, bengali_dataloader=bengali_dataloader, hindi_dataloader=hindi_dataloader, \n            hindi_data=hindi_data, bengali_data=bengali_data, max_epochs=config_dict['epochs'], config_dict=config_dict)\n"

In [3]:
## run the Bengali evaluation: reports test accuracy and a classification report
## NOTE(review): the data loader passed here is the one built for training — confirm that
## evaluate_bengali_test_set selects the test split itself
evaluate_bengali_test_set(
    model,
    config_dict['bengali_model_name'],
    bengali_data,
    bengali_dataloader,
    device=config_dict['device'],
)

INFO:root:Evaluating accuracy on Bengali test set


best bengali model loaded...
Bengali Test acc: 0.7344689378757515
Bengali Classification Report


                 precision    recall  f1-score   support

non hate speech       0.69      0.79      0.74       472
    hate speech       0.78      0.69      0.73       526

       accuracy                           0.73       998
      macro avg       0.74      0.74      0.73       998
   weighted avg       0.74      0.73      0.73       998



In [4]:
## run the Hindi evaluation: reports test accuracy and a classification report
## NOTE(review): the data loader passed here is the one built for training — confirm that
## evaluate_hindi_test_set selects the test split itself
evaluate_hindi_test_set(
    model,
    config_dict['hindi_model_name'],
    hindi_data,
    hindi_dataloader,
    device=config_dict['device'],
)

INFO:root:Evaluating accuracy on Hindi test set


best hindi model loaded...
Hindi Test acc: 0.7457081545064378
Hindi Classification Report


                 precision    recall  f1-score   support

non hate speech       0.71      0.78      0.74       435
    hate speech       0.79      0.72      0.75       497

       accuracy                           0.75       932
      macro avg       0.75      0.75      0.75       932
   weighted avg       0.75      0.75      0.75       932

