## Task 2: Bengali

In [1]:
#imports
from data import BengaliData
from model import BengaliLSTMClassifier
from train import train_model 
from eval import evaluate_test_set
import torch
import pickle
from config import config_dict
from torch import nn

In [2]:
## load the Bengali data / vocabulary from the configured file paths
data = BengaliData(config_dict['file_paths'])

## read the pickled pre-trained embedding weights
## NOTE(review): pickle can execute arbitrary code while loading -- only point this at trusted files
embeddings_path = config_dict['file_paths']['embeddings_path']
with open(embeddings_path, 'rb') as embeddings_file:
    embedding_weights = pickle.load(embeddings_file)


INFO:root:loading and preprocessing data...
INFO:root:reading and preprocessing data completed...
INFO:root:creating vocabulary...
INFO:root:creating vocabulary completed...


In [3]:
## sanity check: the transposed pre-trained embedding matrix must have shape
## (size of the training vocabulary, configured embedding size)
expected_shape = (len(data.vocab), config_dict['embedding_size'])
assert embedding_weights.T.shape == expected_shape, "Pre-trained embeddings size not equal to size of embedding layer"


In [4]:
## build the classifier; every hyperparameter below is read from the config file
model = BengaliLSTMClassifier(
    pretrained_state_dict_path=config_dict['file_paths']['pretrained_path'],
    batch_size=config_dict['batch_size'],
    output_size=config_dict['num_classes'],
    vocab_size=len(data.vocab),
    hidden_size=config_dict['hidden_size'],
    embedding_size=config_dict['embedding_size'],
    # the pickled matrix is transposed before being handed over as the embedding weights
    weights=torch.FloatTensor(embedding_weights.T),
    lstm_layers=config_dict['lstm_layers'],
    device=config_dict['device'],
)
## move the model onto the configured device
model = model.to(config_dict['device'])



## Loading pretrained LSTM & FC weights from the Hindi classifier

In [5]:
## load pretrained weights into the model (path was given at construction as pretrained_state_dict_path)
## per the logged output of this cell, word_embeddings.weight is skipped
## NOTE(review): presumably so the Bengali embeddings passed as `weights` are kept -- confirm in model.py
model.load_pretrained_layers()

Loading pretrained weights...
Skipping the following layer(s): word_embeddings.weight


In [6]:

## get dataloaders for train and test set, using the configured batch size
bengali_dataloader = data.get_data_loader(batch_size=config_dict['batch_size'])

## give Adam only the parameters that still require gradients
## (the embedding weights are left out since they won't be optimized)
optimizer = torch.optim.Adam([param for param in model.parameters() if param.requires_grad])



In [7]:
## training the model on train set
## NOTE: training is disabled in this notebook run; the following cell loads a checkpoint
## named '<model_name>.pth' from disk instead, so that file must already exist.
## Uncomment the line below to retrain (TODO confirm that train_model writes that checkpoint).
#model = train_model(model, optimizer, bengali_dataloader, data, max_epochs=config_dict['epochs'],config_dict=config_dict)

In [8]:
## restore the best model saved during training, mapping its tensors onto the configured device
checkpoint_path = '{}.pth'.format(config_dict['model_name'])
model.load_state_dict(
    torch.load(checkpoint_path, map_location=torch.device(config_dict['device']))
)

## evaluate the restored model on the test set
## (the cell output shows the test accuracy and a per-class precision/recall/f1 report)
evaluate_test_set(model, data, bengali_dataloader, device=config_dict['device'])

INFO:root:Evaluating accuracy on test set


Test acc: 0.6943887775551102



                 precision    recall  f1-score   support

non hate speech       0.70      0.61      0.66       472
    hate speech       0.69      0.77      0.73       526

       accuracy                           0.69       998
      macro avg       0.70      0.69      0.69       998
   weighted avg       0.70      0.69      0.69       998

