In [22]:
import torch
import torch.nn as nn
from sentiment_preprocessing import SentimentAnalysisDataset, EmbeddingDataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
torch.random.manual_seed(13)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import sys
sys.path.append('..')
from ELMO import ELMo
from sentiment_models import SentimentAnalysis, SentimentAnalysis_WithoutELMo
import wandb
import pandas as pd

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [29]:
data = pd.read_csv('dataset/tamil_tweets.csv', encoding='utf-8')

In [30]:
# randomly split into train and val
train = data.sample(frac=0.8, random_state=13)
val = data.drop(train.index)

In [3]:
train = pd.read_csv('dataset/marathi_train_tweets.csv')
val = pd.read_csv('dataset/marathi_valid_tweets.csv')

In [31]:
train = train.dropna()
val = val.dropna()

In [33]:
# for marathi dataset
train_data = train.iloc[:, 0].values
train_labels = train.iloc[:, 1].values
val_data = val.iloc[:, 0].values
val_labels = val.iloc[:, 1].values

In [34]:
label_map = {label: i for i, label in enumerate(set(train_labels))}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

In [3]:
path = 'dataset/hindi_pos_train_movies.txt'
with open (path, "r", encoding='utf-8') as f:
    pos_train = f.readlines()
path = 'dataset/hindi_neg_train_movies.txt' 
with open (path, "r", encoding='utf-8') as f:
    neg_train = f.readlines()

# make a tensor of labels
y_pos = torch.ones(len(pos_train), dtype=torch.long)
y_neg = torch.zeros(len(neg_train), dtype=torch.long)
labels = torch.cat((y_pos, y_neg), dim=0)

data = pos_train + neg_train

In [35]:
char_vocab = torch.load('../ELMo/char_vocab_tamil.pt')
word_vocab = torch.load('../ELMo/word_vocab_tamil.pt')

In [36]:
elmo = ELMo(cnn_config = {'character_embedding_size': 16, 
                           'num_filters': 32, 
                           'kernel_size': 5, 
                           'max_word_length': 10, 
                           'char_vocab_size': char_vocab.num_chars}, 
             elmo_config = {'num_layers': 3,
                            'word_embedding_dim': 150,
                            'vocab_size': word_vocab.num_words}, 
             char_vocab_size = char_vocab.num_chars).to(device)

In [37]:
# load model
elmo.load_state_dict(torch.load('../ELMo/elmo_tamil.pt'))

<All keys matched successfully>

In [38]:
num_classes = len(set(train_labels))
num_classes

2

In [39]:
def train_sentiment(model, train_loader, val_loader, optimizer, criterion, epochs):
    model.train()
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        predictions = []
        targets = []
        predictions_val = []
        targets_val = []
        loss_val = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            sentences, labels = batch
            sentences = sentences.to(device)
            labels = labels.to(device)
            output = model(sentences)
            onehot_labels = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()
            loss = criterion(output, onehot_labels)
            loss.backward()
            optimizer.step()   
            total_loss += loss.item()
            predictions.extend(torch.argmax(output, dim=1).tolist())
            targets.extend(labels.tolist())
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                sentences, labels = batch
                sentences = sentences.to(device)
                labels = labels.to(device)
                output = model(sentences)
                onehot_labels = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()
                loss = criterion(output, onehot_labels)
                predictions_val.extend(torch.argmax(output, dim=1).tolist())
                targets_val.extend(labels.tolist())
                loss_val += loss.item()

        wandb.log({"Train Loss": total_loss/len(train_loader), 
                   "Train Accuracy": torch.sum(torch.tensor(predictions) == torch.tensor(targets)).item()/len(targets), 
                   "Val Accuracy": torch.sum(torch.tensor(predictions_val) == torch.tensor(targets_val)).item()/len(targets_val),
                   "Val Loss": loss_val/len(val_loader),
                   "Val F1": f1_score(targets_val, predictions_val, average='macro', zero_division=0),
                   "Val Precision": precision_score(targets_val, predictions_val, average='macro', zero_division=0),
                   "Val Recall": recall_score(targets_val, predictions_val, average='macro', zero_division=0)                     
                   })
        # print(f"Epoch {epoch+1}")
        # print(f"Train Loss: {total_loss/len(train_loader)}")
        # print(f"Train Accuracy: {torch.sum(torch.tensor(predictions) == torch.tensor(targets)).item()/len(targets)}")  
        # print(f"Val Accuracy: {torch.sum(torch.tensor(predictions_val) == torch.tensor(targets_val)).item()/len(targets_val)}")


In [None]:
# dataset = SentimentAnalysisDataset(data, labels, word_vocab, char_vocab)
# # split into train and val
# train_size = int(0.8 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [45]:
configs = [
    {'model': 'ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.00005},
    # {'model': 'ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.0001},
    {'model': 'ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.00005},
    # {'model': 'ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0001},
]

In [46]:
for config in configs:
    wandb.init(project='INLP-Project-ELMo', group="Sentiment Analysis", name="Tamil with ELMo", config=config)
    train_dataset = SentimentAnalysisDataset(train_data, train_labels, word_vocab, char_vocab)
    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=train_dataset.collate_fn)
    val_dataset = SentimentAnalysisDataset(val_data, val_labels, word_vocab, char_vocab)
    val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=val_dataset.collate_fn)
    sentiment_analysis = SentimentAnalysis(elmo, 300, num_classes)
    wandb.watch(sentiment_analysis)

    optimizer = torch.optim.Adam(sentiment_analysis.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss()
    train_sentiment(sentiment_analysis, train_dataloader, val_dataloader, optimizer, criterion, config['epochs'])
    wandb.join()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113573266710672, max=1.0…

100%|██████████| 26/26 [00:01<00:00, 20.88it/s]
100%|██████████| 26/26 [00:01<00:00, 23.21it/s]
100%|██████████| 26/26 [00:01<00:00, 23.12it/s]
100%|██████████| 26/26 [00:01<00:00, 23.13it/s]
100%|██████████| 26/26 [00:01<00:00, 22.94it/s]
100%|██████████| 26/26 [00:01<00:00, 21.78it/s]
100%|██████████| 26/26 [00:01<00:00, 24.12it/s]
100%|██████████| 26/26 [00:01<00:00, 22.96it/s]
100%|██████████| 26/26 [00:01<00:00, 23.00it/s]
100%|██████████| 26/26 [00:01<00:00, 22.18it/s]
100%|██████████| 26/26 [00:01<00:00, 20.40it/s]
100%|██████████| 26/26 [00:01<00:00, 22.88it/s]
100%|██████████| 26/26 [00:01<00:00, 22.76it/s]
100%|██████████| 26/26 [00:01<00:00, 19.13it/s]
100%|██████████| 26/26 [00:01<00:00, 23.17it/s]


VBox(children=(Label(value='0.005 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.4567343249239872, max=1.0…

0,1
Train Accuracy,▁▂▃▃▂▃▄▄▅▆▆▆▇██
Train Loss,████▇▇▇▆▆▅▄▄▃▂▁
Val Accuracy,▁▃▄▁▆▇▆▆▆▅▇▆▆▆█
Val F1,▁▃▄▁▆▇▇▆▆▄▇▆▆▆█
Val Loss,▅▅▄▄▂▂▂▂▄▁▄█▄▄▇
Val Precision,▁▃▁▂▅▇▆▅▇▃▇▇▅▅█
Val Recall,▁▃▁▁▅▇▆▆▆▃▇▆▅▅█

0,1
Train Accuracy,0.80443
Train Loss,0.42298
Val Accuracy,0.65347
Val F1,0.65179
Val Loss,0.70687
Val Precision,0.65235
Val Recall,0.65447


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113828744419152, max=1.0…

100%|██████████| 13/13 [00:00<00:00, 15.70it/s]
100%|██████████| 13/13 [00:00<00:00, 16.11it/s]
100%|██████████| 13/13 [00:00<00:00, 15.26it/s]
100%|██████████| 13/13 [00:00<00:00, 15.20it/s]
100%|██████████| 13/13 [00:00<00:00, 16.13it/s]
100%|██████████| 13/13 [00:00<00:00, 15.49it/s]
100%|██████████| 13/13 [00:00<00:00, 13.88it/s]
100%|██████████| 13/13 [00:00<00:00, 16.08it/s]
100%|██████████| 13/13 [00:00<00:00, 16.37it/s]
100%|██████████| 13/13 [00:00<00:00, 16.37it/s]
100%|██████████| 13/13 [00:00<00:00, 16.33it/s]
100%|██████████| 13/13 [00:00<00:00, 16.36it/s]
100%|██████████| 13/13 [00:00<00:00, 16.39it/s]
100%|██████████| 13/13 [00:00<00:00, 16.33it/s]
100%|██████████| 13/13 [00:00<00:00, 14.49it/s]


VBox(children=(Label(value='0.005 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.4568540433925049, max=1.0…

0,1
Train Accuracy,▁▃▄▅▅▅▅▆▆▇▇████
Train Loss,██▇▇▇▇▆▆▆▅▄▃▃▂▁
Val Accuracy,▂▁▅▆▅▆▆▇▇█▇▅█▅▇
Val F1,▂▁▆▆▆▆▆▇▇█▇▆█▄▆
Val Loss,█▇▇▆█▇▆▅▅▆▂▃▄▅▁
Val Precision,▁▁▅▅▄▄▅▆▆█▅▅█▃▅
Val Recall,▂▁▅▅▅▅▆▆▇█▆▅█▂▅

0,1
Train Accuracy,0.69619
Train Loss,0.57137
Val Accuracy,0.62376
Val F1,0.60125
Val Loss,0.62934
Val Precision,0.61752
Val Recall,0.60406


In [43]:
configs = [
    {'model': 'No ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0005},
    {'model': 'No ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.0001},
    {'model': 'No ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 64, 'lr': 0.0005},
    {'model': 'No ELMo', 'language': 'Tamil', 'dataset': 'Tweets', 'num_classes': num_classes, 'epochs': 15, 'batch_size': 32, 'lr': 0.0001},
]

In [None]:
dataset = EmbeddingDataset(data, labels, word_vocab)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [44]:
for config in configs:
    wandb.init(project='INLP-Project-ELMo', group="Sentiment Analysis", name="Tamil without ELMo", config=config)
    train_dataset = EmbeddingDataset(train_data, train_labels, word_vocab)
    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=train_dataset.collate_fn)
    val_dataset = EmbeddingDataset(val_data, val_labels, word_vocab)
    val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=val_dataset.collate_fn)
    sentiment_analysis = SentimentAnalysis_WithoutELMo(150, num_classes, word_vocab)
    wandb.watch(sentiment_analysis)

    optimizer = torch.optim.Adam(sentiment_analysis.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss()
    train_sentiment(sentiment_analysis, train_dataloader, val_dataloader, optimizer, criterion, config['epochs'])
    wandb.join()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112813277739204, max=1.0…

100%|██████████| 26/26 [00:00<00:00, 107.33it/s]
100%|██████████| 26/26 [00:00<00:00, 112.20it/s]
100%|██████████| 26/26 [00:00<00:00, 113.05it/s]
100%|██████████| 26/26 [00:00<00:00, 112.88it/s]
100%|██████████| 26/26 [00:00<00:00, 115.22it/s]
100%|██████████| 26/26 [00:00<00:00, 96.60it/s]
100%|██████████| 26/26 [00:00<00:00, 93.20it/s]
100%|██████████| 26/26 [00:00<00:00, 92.72it/s]
100%|██████████| 26/26 [00:00<00:00, 91.42it/s]
100%|██████████| 26/26 [00:00<00:00, 92.38it/s]
100%|██████████| 26/26 [00:00<00:00, 93.94it/s]
100%|██████████| 26/26 [00:00<00:00, 99.82it/s] 
100%|██████████| 26/26 [00:00<00:00, 92.14it/s]
100%|██████████| 26/26 [00:00<00:00, 89.37it/s]
100%|██████████| 26/26 [00:00<00:00, 91.41it/s]


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Accuracy,▁▂▁▂▂▂▂▃▄▅▆▆▇██
Train Loss,███████▇▇▅▄▄▂▁▁
Val Accuracy,▃▄▁▄▄▅▆▇▇▇▇█▇▆▇
Val F1,▃▃▁▄▅▃▅▆▇▆▇█▇▆▇
Val Loss,▇▇█▇▇▆▆▄▂▂▂▁▆▆█
Val Precision,▁▂▁▃▃▅▄▇▇▆▆█▇▅█
Val Recall,▂▂▁▃▃▃▄▅▇▆▆█▇▅█

0,1
Train Accuracy,0.78721
Train Loss,0.4537
Val Accuracy,0.64851
Val F1,0.6439
Val Loss,0.69983
Val Precision,0.69218
Val Recall,0.67152


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113109977789767, max=1.0…

100%|██████████| 13/13 [00:00<00:00, 63.41it/s]
100%|██████████| 13/13 [00:00<00:00, 64.08it/s]
100%|██████████| 13/13 [00:00<00:00, 66.96it/s]
100%|██████████| 13/13 [00:00<00:00, 68.29it/s]
100%|██████████| 13/13 [00:00<00:00, 67.36it/s]
100%|██████████| 13/13 [00:00<00:00, 68.46it/s]
100%|██████████| 13/13 [00:00<00:00, 66.90it/s]
100%|██████████| 13/13 [00:00<00:00, 67.37it/s]
100%|██████████| 13/13 [00:00<00:00, 59.44it/s]
100%|██████████| 13/13 [00:00<00:00, 64.58it/s]
100%|██████████| 13/13 [00:00<00:00, 67.53it/s]
100%|██████████| 13/13 [00:00<00:00, 66.57it/s]
100%|██████████| 13/13 [00:00<00:00, 66.53it/s]
100%|██████████| 13/13 [00:00<00:00, 66.91it/s]
100%|██████████| 13/13 [00:00<00:00, 68.36it/s]


VBox(children=(Label(value='0.005 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.4572744469117526, max=1.0…

0,1
Train Accuracy,▁▃▅▆▃▇▃▄▅▇▅▆▄▆█
Train Loss,█▆▅▄▅▃▄▄▂▃▂▂▃▂▁
Val Accuracy,▂▆▅▄▁▄▅▇▄▆▆▇▆▄█
Val F1,▃▆▅▃▁▃▆▇▄▇▆█▅▃█
Val Loss,▆▇▆▆█▅▃▅▆▅▁▄▅▃▁
Val Precision,▂█▆▃▁▅▅▅▃▇▄▆▄▇▆
Val Recall,▃█▇▃▁▅▆▆▄▇▅▆▄▇▆

0,1
Train Accuracy,0.58426
Train Loss,0.67949
Val Accuracy,0.55941
Val F1,0.54878
Val Loss,0.67154
Val Precision,0.55
Val Recall,0.54892


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113037944435038, max=1.0…

100%|██████████| 13/13 [00:00<00:00, 67.86it/s]
100%|██████████| 13/13 [00:00<00:00, 74.23it/s]
100%|██████████| 13/13 [00:00<00:00, 76.25it/s]
100%|██████████| 13/13 [00:00<00:00, 77.94it/s]
100%|██████████| 13/13 [00:00<00:00, 75.28it/s]
100%|██████████| 13/13 [00:00<00:00, 77.54it/s]
100%|██████████| 13/13 [00:00<00:00, 75.76it/s]
100%|██████████| 13/13 [00:00<00:00, 76.77it/s]
100%|██████████| 13/13 [00:00<00:00, 76.76it/s]
100%|██████████| 13/13 [00:00<00:00, 67.72it/s]
100%|██████████| 13/13 [00:00<00:00, 66.11it/s]
100%|██████████| 13/13 [00:00<00:00, 66.81it/s]
100%|██████████| 13/13 [00:00<00:00, 73.92it/s]
100%|██████████| 13/13 [00:00<00:00, 68.26it/s]
100%|██████████| 13/13 [00:00<00:00, 68.59it/s]


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Accuracy,▁▂▂▃▂▃▄▃▃▄▅▆▆▇█
Train Loss,███▇██▇▇▇▇▇▅▅▃▁
Val Accuracy,▂▁▄▃▄▂▃▂▆▄▅▅▇██
Val F1,▄▁▄▄▄▄▅▄▅▅▆▆██▆
Val Loss,▆█▆█▅▅▆▆▆▆▅▆▂▁▄
Val Precision,▁▂▂▃▂▁▂▂▄▃▃▇▅▇█
Val Recall,▁▁▂▃▁▁▂▃▄▄▄█▇█▆

0,1
Train Accuracy,0.70357
Train Loss,0.57068
Val Accuracy,0.63861
Val F1,0.57336
Val Loss,0.65124
Val Precision,0.67717
Val Recall,0.60063


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113320422252097, max=1.0…

100%|██████████| 26/26 [00:00<00:00, 104.90it/s]
100%|██████████| 26/26 [00:00<00:00, 116.27it/s]
100%|██████████| 26/26 [00:00<00:00, 117.03it/s]
100%|██████████| 26/26 [00:00<00:00, 92.12it/s]
100%|██████████| 26/26 [00:00<00:00, 80.80it/s]
100%|██████████| 26/26 [00:00<00:00, 89.88it/s]
100%|██████████| 26/26 [00:00<00:00, 105.59it/s]
100%|██████████| 26/26 [00:00<00:00, 90.10it/s]
100%|██████████| 26/26 [00:00<00:00, 93.06it/s]
100%|██████████| 26/26 [00:00<00:00, 91.74it/s]
100%|██████████| 26/26 [00:00<00:00, 87.33it/s]
100%|██████████| 26/26 [00:00<00:00, 77.15it/s]
100%|██████████| 26/26 [00:00<00:00, 83.13it/s]
100%|██████████| 26/26 [00:00<00:00, 93.88it/s]
100%|██████████| 26/26 [00:00<00:00, 96.63it/s]


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Accuracy,▁▅▃▅▅▆▄▆▄▆▇▇▅█▇
Train Loss,█▆▆▆▄▃▅▄▄▄▄▄▃▂▁
Val Accuracy,▅▁▅▄▅▄▃▅▃▅▄▄█▅▆
Val F1,▃▁▅▄▅▅▃▅▃▅▄▄█▆▆
Val Loss,▆▇▆▆▇▇█▅▅█▄▃▁▂▃
Val Precision,▄▁▆▇▅▅▅▅▃▅▄▅█▆▆
Val Recall,▃▁▆▆▅▅▅▅▃▅▄▅█▆▆

0,1
Train Accuracy,0.57934
Train Loss,0.67496
Val Accuracy,0.54455
Val F1,0.54005
Val Loss,0.68091
Val Precision,0.5401
Val Recall,0.54042
