In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm import tqdm
torch.manual_seed(13)
from news_classification_models import NewsClassification, NewsClassification_WithoutELMo
from news_classification_preprocessing import NewsClassificationDataset, EmbeddingDataset, WordLevelVocab
from sklearn.metrics import f1_score, precision_score, recall_score
import sys
sys.path.append('..')
from ELMO import ELMo
import wandb


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
train = pd.read_csv('hindi-train.csv', sep="\t", encoding='utf-8', header=None)
val = pd.read_csv('hindi-test.csv', sep="\t", encoding='utf-8', header=None)

In [4]:
# remove lines where values are nan
train = train.dropna()
val = val.dropna()

In [5]:
# load data (csv)
# train = pd.read_csv('indicnlp-news-articles/ta/ta-train.csv')
# test = pd.read_csv('indicnlp-news-articles/ta/ta-test.csv')
# val = pd.read_csv('indicnlp-news-articles/ta/ta-valid.csv')

In [6]:
# the data is the second column, labels are the first
train_data = train.iloc[:, 1].values
train_labels = train.iloc[:, 0].values
# test_data = test.iloc[:, 1].values
# test_labels = test.iloc[:, 0].values
val_data = val.iloc[:, 1].values
val_labels = val.iloc[:, 0].values

In [7]:
# conver the labels into values
label_map = {label: i for i, label in enumerate(set(train_labels))}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

In [8]:
char_vocab = torch.load('../ELMo/char_vocab_hindi.pt')
word_vocab = torch.load('../ELMo/word_vocab_hindi.pt')

In [9]:
elmo = ELMo(cnn_config = {'character_embedding_size': 16, 
                           'num_filters': 32, 
                           'kernel_size': 5, 
                           'max_word_length': 10, 
                           'char_vocab_size': char_vocab.num_chars}, 
             elmo_config = {'num_layers': 3,
                            'word_embedding_dim': 150,
                            'vocab_size': word_vocab.num_words}, 
             char_vocab_size = char_vocab.num_chars).to(device)

In [10]:
# load model
elmo.load_state_dict(torch.load('../ELMo/elmo_hindi.pt'))

<All keys matched successfully>

In [11]:
num_classes = len(set(train_labels))
num_classes

14

In [12]:
def train_news(model, train_loader, val_loader, optimizer, criterion, epochs):
    model.train()
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        predictions = []
        targets = []
        predictions_val = []
        targets_val = []
        loss_val = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            sentences, labels = batch
            sentences = sentences.to(device)
            labels = labels.to(device)
            output = model(sentences)
            onehot_labels = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()
            loss = criterion(output, onehot_labels)
            loss.backward()
            optimizer.step()   
            total_loss += loss.item()
            predictions.extend(torch.argmax(output, dim=1).tolist())
            targets.extend(labels.tolist())
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                sentences, labels = batch
                sentences = sentences.to(device)
                labels = labels.to(device)
                output = model(sentences)
                onehot_labels = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()
                loss = criterion(output, onehot_labels)
                predictions_val.extend(torch.argmax(output, dim=1).tolist())
                targets_val.extend(labels.tolist())
                loss_val += loss.item()

        wandb.log({"Train Loss": total_loss/len(train_loader), 
                   "Train Accuracy": torch.sum(torch.tensor(predictions) == torch.tensor(targets)).item()/len(targets), 
                   "Val Accuracy": torch.sum(torch.tensor(predictions_val) == torch.tensor(targets_val)).item()/len(targets_val),
                   "Val Loss": loss_val/len(val_loader),
                   "Val F1": f1_score(targets_val, predictions_val, average='macro', zero_division=0),
                   "Val Precision": precision_score(targets_val, predictions_val, average='macro', zero_division=0),
                   "Val Recall": recall_score(targets_val, predictions_val, average='macro', zero_division=0)                     
                   })
        print(f"Epoch {epoch+1}")
        # print(f"Train Loss: {total_loss/len(train_loader)}")
        # print(f"Train Accuracy: {torch.sum(torch.tensor(predictions) == torch.tensor(targets)).item()/len(targets)}")  
        # print(f"Val Accuracy: {torch.sum(torch.tensor(predictions_val) == torch.tensor(targets_val)).item()/len(targets_val)}")
        # print(f"Val Loss: {loss_val/len(val_loader)}")


In [13]:
configs = [
    {'model': 'ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0005},
    # {'model': 'ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.0001},
    # {'model': 'ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.00005},
    {'model': 'ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.001}
]

In [28]:
for config in configs:
    wandb.init(project='INLP-Project-ELMo', group='News Classification', name='Hindi with ELMo', config=config)

    dataset = NewsClassificationDataset(train_data, train_labels, char_vocab)
    train_loader = DataLoader(dataset, batch_size=config['batch_size'], collate_fn=dataset.collate_fn)
    val_dataset = NewsClassificationDataset(val_data, val_labels, char_vocab)
    val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=val_dataset.collate_fn)
    news_classification = NewsClassification(elmo, 300, num_classes)
    wandb.watch(news_classification)

    optimizer = torch.optim.Adam(news_classification.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss()
    train_news(news_classification, train_loader, val_loader, optimizer, criterion, config['epochs'])
    wandb.join()

100%|██████████| 109/109 [00:23<00:00,  4.73it/s]


Epoch 1


100%|██████████| 109/109 [00:22<00:00,  4.93it/s]


Epoch 2


100%|██████████| 109/109 [00:21<00:00,  5.05it/s]


Epoch 3


100%|██████████| 109/109 [00:21<00:00,  4.99it/s]


Epoch 4


100%|██████████| 109/109 [00:21<00:00,  5.05it/s]


Epoch 5


100%|██████████| 109/109 [00:20<00:00,  5.27it/s]


Epoch 6


100%|██████████| 109/109 [00:21<00:00,  4.99it/s]


Epoch 7


100%|██████████| 109/109 [00:21<00:00,  5.17it/s]


Epoch 8


100%|██████████| 109/109 [00:21<00:00,  4.98it/s]


Epoch 9


100%|██████████| 109/109 [00:23<00:00,  4.60it/s]


Epoch 10


100%|██████████| 109/109 [00:22<00:00,  4.83it/s]


Epoch 11


100%|██████████| 109/109 [00:22<00:00,  4.80it/s]


Epoch 12


100%|██████████| 109/109 [00:23<00:00,  4.65it/s]


Epoch 13


100%|██████████| 109/109 [00:23<00:00,  4.57it/s]


Epoch 14


100%|██████████| 109/109 [00:24<00:00,  4.54it/s]


Epoch 15


[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113879726164871, max=1.0…

In [14]:
configs = [
    # {'model': 'No ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.00005},
    # {'model': 'No ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.0001},
    # {'model': 'No ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.00005},
    {'model': 'No ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0001},
    {'model': 'No ELMo', 'language': 'Hindi', 'dataset': 'BBC Hindi News', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0005}
]

In [15]:
for config in configs:
    wandb.init(project='INLP-Project-ELMo', group='News Classification', name='Hindi without ELMo', config=config)
    dataset = EmbeddingDataset(train_data, train_labels, word_vocab)
    train_loader = DataLoader(dataset, batch_size=config['batch_size'], collate_fn=dataset.collate_fn)
    val_dataset = EmbeddingDataset(val_data, val_labels, word_vocab)
    val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=val_dataset.collate_fn)
    news_classification = NewsClassification_WithoutELMo(150, num_classes, word_vocab)
    wandb.watch(news_classification)

    optimizer = torch.optim.Adam(news_classification.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss()
    train_news(news_classification, train_loader, val_loader, optimizer, criterion, config['epochs'])
    wandb.join()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msanika-damle[0m ([33mproject-ai-scream[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 109/109 [00:12<00:00,  8.78it/s]


Epoch 1


100%|██████████| 109/109 [00:12<00:00,  8.43it/s]


Epoch 2


100%|██████████| 109/109 [00:12<00:00,  8.91it/s]


Epoch 3


100%|██████████| 109/109 [00:12<00:00,  8.79it/s]


Epoch 4


100%|██████████| 109/109 [00:12<00:00,  8.79it/s]


Epoch 5


100%|██████████| 109/109 [00:12<00:00,  8.65it/s]


Epoch 6


100%|██████████| 109/109 [00:12<00:00,  8.77it/s]


Epoch 7


100%|██████████| 109/109 [00:12<00:00,  8.83it/s]


Epoch 8


100%|██████████| 109/109 [00:13<00:00,  8.37it/s]


Epoch 9


100%|██████████| 109/109 [00:12<00:00,  8.64it/s]


Epoch 10


100%|██████████| 109/109 [00:12<00:00,  8.73it/s]


Epoch 11


100%|██████████| 109/109 [00:12<00:00,  8.53it/s]


Epoch 12


100%|██████████| 109/109 [00:12<00:00,  9.05it/s]


Epoch 13


100%|██████████| 109/109 [00:12<00:00,  8.70it/s]


Epoch 14


100%|██████████| 109/109 [00:13<00:00,  8.17it/s]


Epoch 15




VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Accuracy,▁▅▅▅▅▅▅▅▅▅▅▅▇▆█
Train Loss,█▃▃▂▂▂▂▂▂▂▂▂▁▂▁
Val Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁█▁▅
Val F1,▁▁▁▁▁▁▁▁▁▁▁▁█▁▅
Val Loss,█▄▄▄▄▃▃▃▄▃▃▂▂▃▁
Val Precision,▁▁▁▁▁▁▁▁▁▁▁██▁▆
Val Recall,▁▁▁▁▁▁▁▁▁▁▁▁█▁▅

0,1
Train Accuracy,0.42573
Train Loss,1.60443
Val Accuracy,0.42725
Val F1,0.05473
Val Loss,1.55942
Val Precision,0.04791
Val Recall,0.07718


100%|██████████| 109/109 [00:12<00:00,  9.00it/s]


Epoch 1


100%|██████████| 109/109 [00:12<00:00,  8.59it/s]


Epoch 2


100%|██████████| 109/109 [00:11<00:00,  9.09it/s]


Epoch 3


100%|██████████| 109/109 [00:13<00:00,  8.29it/s]


Epoch 4


100%|██████████| 109/109 [00:12<00:00,  8.56it/s]


Epoch 5


100%|██████████| 109/109 [00:12<00:00,  8.75it/s]


Epoch 6


100%|██████████| 109/109 [00:13<00:00,  8.37it/s]


Epoch 7


100%|██████████| 109/109 [00:12<00:00,  8.65it/s]


Epoch 8


100%|██████████| 109/109 [00:12<00:00,  9.05it/s]


Epoch 9


100%|██████████| 109/109 [00:12<00:00,  8.42it/s]


Epoch 10


100%|██████████| 109/109 [00:12<00:00,  8.49it/s]


Epoch 11


100%|██████████| 109/109 [00:13<00:00,  8.27it/s]


Epoch 12


100%|██████████| 109/109 [00:12<00:00,  8.62it/s]


Epoch 13


100%|██████████| 109/109 [00:12<00:00,  8.59it/s]


Epoch 14


100%|██████████| 109/109 [00:12<00:00,  8.96it/s]


Epoch 15




VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Accuracy,▁▁▁▂▃▃▄▅▅▅▆▇▇██
Train Loss,█▇▇▇▆▆▅▄▄▄▃▃▂▂▁
Val Accuracy,▁▁▂▃▄▅▅▅▅▅▆▇███
Val F1,▁▁▂▃▃▃▃▃▃▄▇▆▇██
Val Loss,██▆▇▅▄▃▃▃▃▃▂▁▁▂
Val Precision,▁▁▂▂▂▃▃▂▃▄█▅▆▇▇
Val Recall,▁▁▂▂▂▃▃▃▃▃▆▇▇██

0,1
Train Accuracy,0.78569
Train Loss,0.72424
Val Accuracy,0.65589
Val F1,0.21352
Val Loss,1.24822
Val Precision,0.20644
Val Recall,0.23348
