In [1]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt

# Prefer CUDA when it is available; otherwise fall back to the CPU so that
# every later `.to(gpu)` call still works on a CPU-only machine.
gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cpu = torch.device('cpu')

In [2]:
# Load the song table from JSON.
# Alternate path previously used (Google Drive mount):
#   '/content/drive/MyDrive/song_lyric_map.json'
data = pd.read_json('../input/annamayya-song-lyrical-map/song_lyric_map.json')
# Binary target: 1 where Genre is exactly 'Devotional', 0 otherwise.
# Vectorised comparison instead of a row-wise apply; int64 is requested
# explicitly so the labels convert to torch long tensors on every platform.
data['iGenre'] = (data['Genre'] == 'Devotional').astype('int64')

In [3]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['Lyric'],
    data['iGenre'],
    test_size=0.3,
    random_state=42,
)

In [4]:
def build_word_tokenizer(sentences, word_threshold=1):
    """Build a frequency-ranked vocabulary from whitespace-split sentences.

    Word ids start at 1. Id 0 is deliberately left unassigned because it is
    both the zero-padding value and the value ``defaultdict(int)`` yields for
    words that are not in the vocabulary; giving it to a real word would make
    that word indistinguishable from padding/unknown tokens.

    Args:
        sentences: iterable of strings; each one is split on whitespace.
        word_threshold: a word is kept only when it occurs strictly more
            than this many times across all sentences.

    Returns:
        A 4-tuple ``(vocabulary, word_sequences, word2idx, idx2word)``:
        ``vocabulary`` is the list of kept words ordered by descending
        frequency (ties keep first-occurrence order), ``word_sequences`` is
        the list of token lists (one per sentence, unfiltered), ``word2idx``
        is a ``defaultdict(int)`` mapping word -> id, and ``idx2word`` is a
        ``defaultdict(str)`` mapping id -> word.
    """
    word_sequences = [sentence.split() for sentence in sentences]
    # Count every token once, in a single pass over the split sentences.
    frequencies = Counter(word for words in word_sequences for word in words)
    kept = {word: count for word, count in frequencies.items() if count > word_threshold}
    # sorted() is stable even with reverse=True, so equally frequent words
    # stay in the order they were first seen.
    vocabulary = sorted(kept, key=kept.get, reverse=True)
    word2idx = defaultdict(int)
    idx2word = defaultdict(str)
    for idx, word in enumerate(vocabulary, start=1):
        word2idx[word] = idx
        idx2word[idx] = word
    return vocabulary, word_sequences, word2idx, idx2word


def pad_input(sent_sequence, seq_len):
    """Turn variable-length token lists into a fixed-width integer matrix.

    Sequences shorter than ``seq_len`` are right-aligned and padded with
    zeros on the left; longer sequences keep only their first ``seq_len``
    tokens. Empty sequences become all-zero rows.

    Args:
        sent_sequence: sequence of token-id sequences.
        seq_len: width of the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sent_sequence), seq_len)`` with
        integer dtype.
    """
    padded = np.zeros((len(sent_sequence), seq_len), dtype=int)
    # Each `row` is a view into `padded`, so writing to it fills the matrix.
    for row, tokens in zip(padded, sent_sequence):
        if len(tokens) == 0:
            continue
        kept = np.array(tokens)[:seq_len]
        row[seq_len - len(kept):] = kept
    return padded


def tokenize(sentences, word2idx, seq_len=None):
    """Map whitespace-split sentences to lists of integer word ids.

    Words missing from ``word2idx`` are encoded as 0. The lookup uses
    ``.get`` rather than ``word2idx[word]`` so that a ``defaultdict``
    vocabulary is not silently grown with every unknown word (which would
    change ``len(word2idx)`` as a side effect of tokenising).

    Args:
        sentences: iterable of strings.
        word2idx: mapping from word to integer id.
        seq_len: when truthy, the result is padded/truncated to this width
            with ``pad_input``; when ``None`` or 0 the ragged lists are
            returned as-is.

    Returns:
        A list of id lists, or the array produced by ``pad_input`` when
        ``seq_len`` is given.
    """
    token_matrix = [
        [word2idx.get(word, 0) for word in sentence.split()]
        for sentence in sentences
    ]
    if seq_len:
        token_matrix = pad_input(token_matrix, seq_len)
    return token_matrix

In [5]:
# Build the vocabulary from the training lyrics only, with a frequency
# threshold of 5.
word_counter, word_sequences, word2idx, idx2word = build_word_tokenizer(
    X_train,
    word_threshold=5,
)

In [100]:
# Every lyric is padded or truncated to this many token ids.
seq_len = 300
# Encode the training split first, then the test split, with the same vocabulary.
train_tokens, test_tokens = (
    tokenize(lyrics, word2idx, seq_len=seq_len) for lyrics in (X_train, X_test)
)

In [101]:
# Pair each token matrix with its label vector so a DataLoader can batch them
# together.
train_data = TensorDataset(
    torch.from_numpy(train_tokens),
    torch.from_numpy(y_train.to_numpy()),
)
test_data = TensorDataset(
    torch.from_numpy(test_tokens),
    torch.from_numpy(y_test.to_numpy()),
)

In [58]:
def plot_results(results, model_name):
    """Plot train vs. validation precision, recall and F1 side by side.

    Args:
        results: dict keyed by fold; each value is a dict holding per-epoch
            lists under 'train_precision', 'validation_precision',
            'train_recall', 'validation_recall', 'train_f1' and
            'validation_f1'. Fold 0 must be present because it is used to
            read the epoch count.
        model_name: text inserted into the figure's super-title.

    The per-epoch values of all folds are concatenated, in fold iteration
    order, into one curve per metric.
    """
    plt.figure(figsize=[20, 5])
    n_epochs = len(results[0]['train_precision'])
    axis_caption = f'{len(results)} Fold and {n_epochs} Epochs'
    curve_names = ['Train', 'Validation']
    # (train key, validation key, panel title) for each of the three panels.
    panels = (
        ('train_precision', 'validation_precision', 'Precision'),
        ('train_recall', 'validation_recall', 'Recall'),
        ('train_f1', 'validation_f1', 'F1'),
    )

    def across_folds(metric_key):
        # Chain the per-epoch values of every fold into a single list.
        return [value for fold_key in results for value in results[fold_key][metric_key]]

    for position, (train_key, valid_key, heading) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(across_folds(train_key))
        plt.plot(across_folds(valid_key))
        plt.grid()
        plt.xlabel(axis_caption)
        plt.title(heading)
        plt.legend(curve_names)
        plt.ylim([0, 1.1])
    plt.suptitle(f'Metrics for {model_name}')
    plt.show()

In [102]:
# Ref : https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/ 
class LstmModel(nn.Module):
    """Stacked-LSTM two-class classifier over sequences of integer token ids.

    Pipeline: embedding -> multi-layer LSTM (batch_first) -> dropout ->
    linear layer on the last time step -> ReLU -> log-softmax over the two
    classes. The output is meant to be fed to ``nn.NLLLoss``.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden-state size.
        drop_prob: dropout probability used between LSTM layers and on the
            LSTM output.
        n_layers: number of stacked LSTM layers (defaults to 6, the value
            that used to be hard-coded).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, drop_prob=0.1, n_layers=6):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, self.n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, 2)
        # Normalise over the class dimension (dim=1) of the (batch, 2) logits.
        # dim=0 would normalise across the samples of the batch instead, so
        # each row would no longer be a log-probability distribution.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Return ``(log_probs, hidden)`` for a batch of token-id sequences.

        Args:
            x: integer tensor whose first dimension is the batch
               (the LSTM is configured with ``batch_first=True``).
            hidden: ``(h_0, c_0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``log_probs`` of shape ``(batch, 2)`` and the LSTM's final
            ``(h_n, c_n)`` state.
        """
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        out = self.dropout(lstm_out)
        # Classify from the representation at the last time step only.
        out = self.fc(out[:, -1, :])
        # NOTE(review): a ReLU directly before log-softmax clamps negative
        # logits to zero; kept to preserve the existing architecture, but
        # worth confirming it is intentional.
        out = torch.relu(out)
        out = self.softmax(out)
        out = out.view(batch_size, -1)
        return out, hidden

    def init_hidden(self, batch_size, target_device):
        """Create zero-filled ``(h_0, c_0)`` LSTM states on ``target_device``.

        Both tensors have shape ``(n_layers, batch_size, hidden_dim)`` and
        the same dtype as the model's parameters.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape, device=target_device),
                weight.new_zeros(shape, device=target_device))

In [103]:
def run_model(model, data_loader, loss_fcn, optimizer, target_device, is_training, clip_at=10):
    """Run one full pass over ``data_loader``, either training or evaluating.

    Args:
        model: module exposing ``init_hidden(batch_size, device)`` and a
            forward call ``model(inputs, hidden)`` whose first output is the
            per-class score matrix.
        data_loader: sized iterable yielding ``(inputs, labels)`` tensor pairs.
        loss_fcn: callable ``loss_fcn(scores, labels)`` returning a scalar tensor.
        optimizer: optimizer stepped after every batch; only used when
            ``is_training`` is true.
        target_device: device the model and every batch are moved to.
        is_training: true -> train mode with backward pass and parameter
            updates; false -> eval mode under ``torch.no_grad()``.
        clip_at: maximum gradient norm applied before each optimizer step.

    Returns:
        ``(avg_loss, predictions, labels, model, optimizer)`` where
        ``avg_loss`` is the mean of the per-batch losses, ``predictions`` is
        the argmax class per sample and ``labels`` the matching targets, both
        as NumPy arrays in iteration order.
    """
    if not is_training:
        model.eval()
    else:
        model.train()
    model.to(target_device)

    running_loss = 0
    batch_scores, batch_targets = [], []
    for batch_idx, batch in enumerate(data_loader):
        # Progress line every 50 batches (never for the very first one).
        if batch_idx != 0 and batch_idx % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(batch_idx, len(data_loader)))
        inputs, targets = (tensor.to(target_device) for tensor in batch)
        # Fresh zero state per batch, sized to the actual batch length.
        hidden = model.init_hidden(len(targets), target_device)
        if is_training:
            model.zero_grad()
            scores = model(inputs, hidden)[0]
        else:
            with torch.no_grad():
                scores = model(inputs, hidden)[0]
        loss = loss_fcn(scores, targets)
        running_loss = running_loss + loss.item()
        if is_training:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_at)
            optimizer.step()
        batch_scores.append(scores.detach().cpu().numpy())
        batch_targets.append(targets.detach().cpu().numpy())

    avg_loss = running_loss / len(data_loader)
    # Stack the per-batch arrays into (n_samples, n_classes) / (n_samples,).
    all_scores = np.concatenate(batch_scores, axis=0)
    all_targets = np.concatenate(batch_targets, axis=0)
    predicted_classes = all_scores.argmax(axis=1)
    return avg_loss, predicted_classes, all_targets, model, optimizer

In [109]:
# Seed torch's global RNG so runs are repeatable.
torch.manual_seed(42)
# One embedding row more than there are entries in word2idx.
vocab_size = len(word2idx) + 1
embedding_dim = 256
hidden_dim = 128
lr = 3.5e-5
results = {x: {} for x in range(1)}
fold = 0
batch_size = 16
epochs = 15

# NLLLoss expects log-probabilities, which is what LstmModel outputs.
loss_fcn = nn.NLLLoss()
best_valid_loss = float('inf')
model = LstmModel(vocab_size, embedding_dim, hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
best_train_predictions, best_test_predictions, best_train_labels, best_test_labels = [], [], [], []

# One per-epoch history list per (split, metric) pair, e.g. 'train_precision'.
metric_fcns = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
for split_name in ('train', 'validation'):
    for metric_name in metric_fcns:
        results[fold][f'{split_name}_{metric_name}'] = []
train_losses, valid_losses = [], []

train_data_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Evaluation does not benefit from shuffling: keeping dataset order makes the
# stored predictions line up with the rows of the test split and avoids
# consuming global RNG state between training epochs.
test_data_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# NOTE(review): the held-out test split doubles as the validation set used to
# pick the best checkpoint, so the final test report is optimistically biased.
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    # Train for one epoch.
    train_loss, train_predictions, train_labels, model, optimizer = run_model(
        model, train_data_loader, loss_fcn, optimizer, gpu, True)
    # Evaluate on the held-out data without updating parameters.
    valid_loss, test_predictions, test_labels, model, optimizer = run_model(
        model, test_data_loader, loss_fcn, optimizer, gpu, False)
    # Keep the weights and predictions of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_train_predictions = train_predictions
        best_test_predictions = test_predictions
        best_train_labels = train_labels
        best_test_labels = test_labels
        torch.save(model.state_dict(), f'saved_weights_Fold{fold}.pt')
    # Record this epoch's losses and metrics.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'Losses - Train:{train_loss:.3f} / Validation:{valid_loss:.3f}')
    for metric_name, metric_fcn in metric_fcns.items():
        results[fold][f'train_{metric_name}'].append(metric_fcn(train_labels, train_predictions))
        results[fold][f'validation_{metric_name}'].append(metric_fcn(test_labels, test_predictions))
    torch.cuda.empty_cache()

# Reports are for the best (lowest validation loss) epoch, not the last one.
print('On Train Data')
print(classification_report(best_train_labels, best_train_predictions))
print('On Test Data')
print(classification_report(best_test_labels, best_test_predictions))
results[fold]['train_losses'] = train_losses
results[fold]['validation_losses'] = valid_losses

In [110]:
# Draw the per-epoch precision / recall / F1 curves collected in `results`.
plot_results(results, model_name='LSTM Text Classification')