In [1]:
!pip install transformers -q
!pip install sentencepiece -q

In [2]:
import os
import shutil

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

from official.nlp import optimization

import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer, AdamW

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

device = torch.device("cuda")
cpu = torch.device("cpu")

In [3]:
data = pd.read_json('/content/drive/MyDrive/mp3_data_w_vectors.json')
#data = pd.read_json('https://raw.githubusercontent.com/pradeep-miriyala/multi-modal-bert-models/main/data/song_lyric_map.json?token=ADXRNFRS46PTRG46WUZLXHDBKH7HY')
data['iGenre'] = data.apply(lambda x:int(x.Genre=='Devotional'),axis=1)

In [4]:
indic_model = AutoModel.from_pretrained("ai4bharat/indic-bert")

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.decoder.weight', 'sop_classifier.classifier.weight', 'sop_classifier.classifier.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [6]:
txt = list(data.apply(lambda x:x.Lyric,axis=1))

In [7]:
sent_id = tokenizer.batch_encode_plus(txt, padding=True, return_token_type_ids=False)

In [8]:
max_seq_len = 25
all_tokens = tokenizer.batch_encode_plus(txt, max_length=max_seq_len,padding='longest', truncation=True, return_token_type_ids=False)
all_seq = torch.tensor(all_tokens['input_ids'])
all_mask = torch.tensor(all_tokens['attention_mask'])
all_y = torch.tensor(data['iGenre'].tolist())

In [9]:
def get_data_loader(seq, mask, y, mfcc_data=None,batch_size = 16):
  data = TensorDataset(seq, mask, mfcc_data, y)
  sampler = RandomSampler(data)
  dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
  return (data,sampler,dataloader)

In [10]:
for param in indic_model.parameters():
    param.requires_grad = True

In [11]:
class BERT_Arch(nn.Module):
    def __init__(self, bert, fusion=False):      
      super(BERT_Arch, self).__init__()
      self.bert = bert
      self.dropout = nn.Dropout(0.1)
      self.relu =  nn.ReLU()
      self.fusion = fusion
      self.fc1 = nn.Linear(768,512)
      if self.fusion:
        self.fc2 = nn.Linear(512,512)
        self.fca1 = nn.Linear(14,64)
        self.fca2 = nn.Linear(64,128)
        self.fusion1 = nn.Linear(640,512) # 512 + 128
        self.fusion2 = nn.Linear(512,2)
      else:
        self.fc2 = nn.Linear(512,2)
      self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask, mfcc_data):
      _, cls_hs = self.bert(sent_id, attention_mask=mask,return_dict=False)
      x = self.fc1(cls_hs)
      x = self.relu(x)
      x = self.dropout(x)
      x = self.fc2(x)
      if self.fusion:
        a1 = self.fca1(mfcc_data)
        a1 = self.relu(a1)
        a1 = self.dropout(a1)
        a1 = self.fca2(a1)
        x = self.relu(x) # Activation for output from text features
        x = torch.cat((x,a1),dim=1) # Fusion Layer
        x = self.fusion1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fusion2(x)
      x = self.softmax(x)
      return x

In [17]:
def train(model,train_dataloader,loss_fcn,optimizer):
  model.train()
  total_loss, total_accuracy = 0, 0
  # empty list to save model predictions
  total_preds=[]  
  # iterate over batches
  for step,batch in enumerate(train_dataloader):    
    if step % 20 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
    # push the batch to gpu
    batch = [r.to(device) for r in batch]
    sent_id, mask, mfcc_means, labels = batch
    # clear previously calculated gradients 
    model.zero_grad()
    # get model predictions for the current batch
    preds = model(sent_id, mask, mfcc_means)
    # compute the loss between actual and predicted values
    loss = loss_fcn(preds, labels)
    # add on to the total loss
    total_loss = total_loss + loss.item()
    # backward pass to calculate the gradients
    loss.backward()
    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # update parameters
    optimizer.step()
    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()
    # append the model predictions
    total_preds.append(preds)
    del batch
  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)
  #returns the loss and predictions
  return avg_loss, total_preds

In [18]:
def evaluate(model,val_dataloader,loss_fcn):
  print("\nEvaluating...")
  # deactivate dropout layers
  model.eval()
  total_loss, total_accuracy = 0, 0  
  # empty list to save the model predictions
  total_preds = []
  # iterate over batches
  for step,batch in enumerate(val_dataloader):    
    if step % 20 == 0 and not step == 0:      
      # Calculate elapsed time in minutes.
      elapsed = format_time(time.time() - t0)            
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
    # push the batch to gpu
    batch = [t.to(device) for t in batch]
    sent_id, mask, mfcc_means, labels = batch
    preds = model(sent_id, mask, mfcc_means)
    # compute the validation loss between actual and predicted values
    loss = loss_fcn(preds,labels)
    total_loss = total_loss + loss.item()
    preds = preds.detach().cpu().numpy()
    total_preds.append(preds)
    del batch
  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader) 
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)
  return avg_loss, total_preds

In [20]:
fusion = True
k_folds = 5
# number of training epochs
epochs = 5
torch.manual_seed(42)
def run_models(fusion):
  kfold = KFold(n_splits=k_folds, shuffle=True,random_state=42)
  models = [BERT_Arch(indic_model,fusion) for x in range(k_folds)]
  for fold, (train_ids, test_ids) in enumerate(kfold.split(data)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_mfcc = torch.tensor([[_ for _ in data['mfcc_mean'].iloc[x]] for x in train_ids])
    test_mfcc = torch.tensor([[_ for _ in data['mfcc_mean'].iloc[x]] for x in test_ids])
    train_data, train_sampler, train_dataloader = get_data_loader(all_seq[train_ids],all_mask[train_ids],all_y[train_ids],train_mfcc)
    test_data, test_sampler, test_dataloader = get_data_loader(all_seq[test_ids],all_mask[test_ids],all_y[test_ids],test_mfcc)
    best_valid_loss = float('inf')
    models[fold].to(device)
    class_wts = compute_class_weight('balanced', np.unique(all_y[train_ids].tolist()), all_y[train_ids].tolist())
    print(class_wts)
    # convert class weights to tensor
    weights= torch.tensor(class_wts,dtype=torch.float)
    weights = weights.to(device)
    # loss function
    loss_fcn  = nn.NLLLoss(weight=weights)
    # empty lists to store training and validation loss of each epoch
    train_losses=[]
    valid_losses=[]
    # define the optimizer
    optimizer = AdamW(models[fold].parameters(), lr = 1e-5)
    #for each epoch
    for epoch in range(epochs):     
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
        #train model
        train_loss, _ = train(models[fold],train_dataloader,loss_fcn,optimizer)    
        #evaluate model
        valid_loss, _ = evaluate(models[fold],test_dataloader,loss_fcn)    
        #save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(models[fold].state_dict(), 'saved_weights.pt')      
        # append training and validation loss
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)    
        print(f'\nTraining Loss: {train_loss:.3f}')
        print(f'Validation Loss: {valid_loss:.3f}')
        torch.cuda.empty_cache()
    models[fold].load_state_dict(torch.load('saved_weights.pt'))    
    preds = models[fold](all_seq[test_ids].to(device), all_mask[test_ids].to(device), test_mfcc.to(device))
    preds = preds.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)
    print('Test')
    print(classification_report(all_y[test_ids], preds))
    print(pd.crosstab(all_y[test_ids], preds))

In [21]:
# Fusion Model
run_models(True)

FOLD 0
--------------------------------
[1.47147651 0.75734024]

 Epoch 1 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 2 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.695
Validation Loss: 0.693

 Epoch 3 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.692

 Epoch 4 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.691

 Epoch 5 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.676
Validation Loss: 0.657
Test
              precision    recall  f1-score   support

           0       0.58      0.51      0.54        72
           1       0.78      0.82      0.80       148

    accuracy                           0.72       220
   macro avg       0.68      0.67      0.67       220
weighted avg       0.71      

In [22]:
# Text Only model
run_models(False)

FOLD 0
--------------------------------
[1.47147651 0.75734024]

 Epoch 1 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.482
Validation Loss: 0.347

 Epoch 2 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.320
Validation Loss: 0.316

 Epoch 3 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.260
Validation Loss: 0.346

 Epoch 4 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.235
Validation Loss: 0.376

 Epoch 5 / 5
  Batch    20  of     55.
  Batch    40  of     55.

Evaluating...

Training Loss: 0.197
Validation Loss: 0.374
Test
              precision    recall  f1-score   support

           0       0.84      0.89      0.86        72
           1       0.94      0.92      0.93       148

    accuracy                           0.91       220
   macro avg       0.89      0.90      0.90       220
weighted avg       0.91      