In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
from ELMO import ELMo
import torch.nn.functional as F
from sts_loader import STSDataset, wo_ELMO_Dataset
from torch.utils.data import DataLoader



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)
print(device)

cuda


In [3]:
# Load vocabularies
vocab = torch.load('../hin_word_vocab.pt')
character_vocab = torch.load('../hin_char_vocab.pt')

In [4]:
# Get Sentences and Scores

path = '../sts-train-hi.tsv'
sts_dataset = STSDataset(path)
s1, s2, scores = sts_dataset.format(char_vocab=character_vocab)
scores = torch.tensor(scores, dtype=torch.float32)

  scores = torch.tensor(scores, dtype=torch.float32)


In [5]:
def create_dataloader(s1, s2, scores, batch_size):
    zipped = list(zip(s1, s2, scores))
    dataloader = DataLoader(zipped, batch_size=batch_size, shuffle=True)
    return dataloader

In [6]:
# make the dataloader
s1_val = s1[:1000]
s2_val = s2[:1000]
scores_val = scores[:1000]
batch_size = 64
val_dataloader = create_dataloader(s1_val, s2_val, scores_val, batch_size)
train_dataloader = create_dataloader(s1[1000:], s2[1000:], scores[1000:], batch_size)

In [7]:
# make another dataloader for the model without elmo using wo_ELMO_Dataset
wo_elmo_dataset = wo_ELMO_Dataset(path, vocab)

s1_wo_elmo, s2_wo_elmo, scores_wo_elmo = wo_elmo_dataset.format()
scores_wo_elmo = torch.tensor(scores_wo_elmo, dtype=torch.float32)
s1_wo_elmo_val = s1_wo_elmo[:1000]
s2_wo_elmo_val = s2_wo_elmo[:1000]
scores_wo_elmo_val = scores_wo_elmo[:1000]
wo_elmo_val_dataloader = create_dataloader(s1_wo_elmo_val, s2_wo_elmo_val, scores_wo_elmo_val, batch_size)
s1_wo_elmo_train = s1_wo_elmo[1000:]
s2_wo_elmo_train = s2_wo_elmo[1000:]
scores_wo_elmo_train = scores_wo_elmo[1000:]
wo_elmo_train_dataloader = create_dataloader(s1_wo_elmo_train, s2_wo_elmo_train, scores_wo_elmo_train, batch_size)


  scores_wo_elmo = torch.tensor(scores_wo_elmo, dtype=torch.float32)


In [8]:
model = ELMo(cnn_config = {'character_embedding_size': 16, 
                           'num_filters': 32, 
                           'kernel_size': 5, 
                           'max_word_length': 10, 
                           'char_vocab_size': character_vocab.num_chars}, 
             elmo_config = {'num_layers': 3,
                            'word_embedding_dim': 150,
                            'vocab_size': vocab.num_words}, 
             char_vocab_size = character_vocab.num_chars).to(device)


In [9]:
model.load_state_dict(torch.load('../model_elmo_hindi.pt'))

<All keys matched successfully>

In [10]:
word_embedding_dim = 300


class SimilarityModel_ELMO(nn.Module):
    def __init__(self, elmo):
        super(SimilarityModel_ELMO, self).__init__()
        self.elmo = elmo        
        self.lambdas = nn.Parameter(torch.randn(3))
        self.lstm = nn.LSTM(word_embedding_dim, word_embedding_dim//2, batch_first=False, bidirectional=True, num_layers=2)

        for param in self.elmo.parameters():
            param.requires_grad = False
        
    def forward(self, sentence1, sentence2):
        forward_output1, backward_output1, final_embeddings1 = self.elmo(sentence1)
        encoding1 = torch.zeros_like(final_embeddings1[0])
        for i in range(3):
            encoding1 += self.lambdas[i] * final_embeddings1[i]

        forward_output2, backward_output2, final_embeddings2 = self.elmo(sentence2)
        encoding2 = torch.zeros_like(final_embeddings2[0])
        for i in range(3):
            encoding2 += self.lambdas[i] * final_embeddings2[i]
        
        # print(final_embeddings1[0].shape, final_embeddings2[0].shape, encoding1.shape, encoding2.shape)
        # print the embeddings of the first sentence
        # print(sentence1.shape, sentence2.shape)

        lstm_out1, (hidden1, cell1) = self.lstm(encoding1)
        lstm_out2, (hidden2, cell2) = self.lstm(encoding2)
        # print(lstm_out1.shape, lstm_out2.shape)
        last_output1 = lstm_out1[:, -1, :]
        last_output2 = lstm_out2[:, -1, :]
        # print(lstm_out1.shape, lstm_out2.shape)
        lstm_out1 = lstm_out1.view(lstm_out1.size(0), -1)  
        lstm_out2 = lstm_out2.view(lstm_out2.size(0), -1)  

        # Compute the cosine similarity between the reshaped tensors
        cos_sim = (F.cosine_similarity(lstm_out1, lstm_out2, dim=1) + 1)*5/2
        # print((F.cosine_similarity(lstm_out1, lstm_out2, dim=0) + 1)*5/2)
        return cos_sim


In [16]:
word_embedding_dim = 300


class SimilarityModel_wo_ELMO(nn.Module):
    def __init__(self, vocab_size, num_layers=2, bidirectional=True):
        super(SimilarityModel_wo_ELMO, self).__init__()
        # Define the embedding layer
        self.embedding = nn.Embedding(vocab_size, word_embedding_dim)
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(word_embedding_dim, word_embedding_dim//2, num_layers=num_layers,
                            bidirectional=bidirectional, batch_first=True)
        
    def forward(self, sentence1, sentence2):
        # Convert sentence1 and sentence2 from lists of token IDs to tensors
        sentence1 = torch.tensor(sentence1, dtype=torch.long)
        sentence2 = torch.tensor(sentence2, dtype=torch.long)
        
        # Compute embeddings for each sentence
        embedding1 = self.embedding(sentence1)
        embedding2 = self.embedding(sentence2)
        # print(embedding1.shape, embedding2.shape)
        # Process embeddings with LSTM
        lstm_out1, _ = self.lstm(embedding1)
        lstm_out2, _ = self.lstm(embedding2)
        # print(lstm_out1.shape, lstm_out2.shape)
        # Use the final hidden state of each LSTM as sentence representations
        last_output1 = lstm_out1[:, -1, :]
        last_output2 = lstm_out2[:, -1, :]
        
        # Compute cosine similarity between the final outputs
        cos_sim = F.cosine_similarity(last_output1, last_output2, dim=1)
        
        # Optionally, transform the cosine similarity to a different range (you can adjust as needed)
        # cos_sim_transformed = (cos_sim + 1) * 5/2
        
        return cos_sim

In [17]:
# similarity_model = SimilarityModel(model)
similarity_model_ELMO = SimilarityModel_ELMO(model).to(device)
similarity_model_wo_ELMO = SimilarityModel_wo_ELMO(vocab.num_words).to(device)

In [13]:
num_epochs = 10
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(similarity_model_ELMO.parameters(), lr=0.001)


In [14]:
num_epochs = 20

# train the model
for epoch in range(num_epochs):
    similarity_model_ELMO.train()
    total_loss = 0
    for s1, s2, scores in tqdm(train_dataloader):
        s1 = s1
        s2 = s2
        scores = scores.to(device)
        optimizer.zero_grad()
        # print(len(s1[0]))
        # stack the sentences
        s1 = torch.stack(s1, dim=1).to(device)
        s2 = torch.stack(s2, dim=1).to(device)
        outputs = similarity_model_ELMO(s1, s2)
        # print(outputs.squeeze().shape, scores.shape)
        # print(outputs.shape, scores.shape)
        loss = criterion(outputs, scores)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} Loss: {total_loss}")

    similarity_model_ELMO.eval()
    with torch.no_grad():
        total_loss = 0
        # calculate mean difference between predicted and actual scores
        diff = 0

        for s1, s2, scores in val_dataloader:
            s1 = s1
            s2 = s2
            scores = scores.to(device)
            s1 = torch.stack(s1, dim=1).to(device)
            s2 = torch.stack(s2, dim=1).to(device)
            outputs = similarity_model_ELMO(s1, s2)
            loss = criterion(outputs, scores)

            total_loss += loss.item()
            diff += torch.abs(outputs - scores).sum().item()
            # calculate 

        print(f"Validation Loss: {total_loss}" + f" Mean Difference: {diff/len(s1_val)}")

 31%|███       | 23/75 [00:10<00:23,  2.24it/s]


KeyboardInterrupt: 

In [18]:
num_epochs = 20

# train the model
for epoch in range(num_epochs):
    similarity_model_wo_ELMO.train()
    total_loss = 0
    for s1, s2, scores in tqdm(wo_elmo_train_dataloader):
        s1 = s1.to(device)
        s2 = s2.to(device)
        scores = scores.to(device)
        optimizer.zero_grad()
        # print(len(s1[0]))
        # stack the sentences
        outputs = similarity_model_wo_ELMO(s1, s2)
        # print(outputs.squeeze().shape, scores.shape)
        # print(outputs.shape, scores.shape)
        loss = criterion(outputs, scores)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} Loss: {total_loss}")

    similarity_model_wo_ELMO.eval()
    with torch.no_grad():
        total_loss = 0
        # calculate mean difference between predicted and actual scores
        diff = 0

        for s1, s2, scores in wo_elmo_val_dataloader:
            s1 = s1.to(device)
            s2 = s2.to(device)
            # convert scores to float
            
            scores = scores.to(device)
            outputs = similarity_model_wo_ELMO(s1, s2)
            loss = criterion(outputs, scores)

            total_loss += loss.item()
            diff += torch.abs(outputs - scores).sum().item()
            # calculate 

        print(f"Validation Loss: {total_loss}" + f" Mean Difference: {diff/len(s1_val)}")

  sentence1 = torch.tensor(sentence1, dtype=torch.long)
  sentence2 = torch.tensor(sentence2, dtype=torch.long)
100%|██████████| 75/75 [00:02<00:00, 26.48it/s]


Epoch 0 Loss: 393.2680969238281
Validation Loss: 65.50843143463135 Mean Difference: 1.657077995300293


100%|██████████| 75/75 [00:02<00:00, 27.61it/s]


Epoch 1 Loss: 392.36709785461426
Validation Loss: 65.43762755393982 Mean Difference: 1.657077995300293


100%|██████████| 75/75 [00:02<00:00, 27.12it/s]


Epoch 2 Loss: 393.5488407611847
Validation Loss: 65.72825074195862 Mean Difference: 1.6570780029296874


100%|██████████| 75/75 [00:02<00:00, 27.05it/s]


Epoch 3 Loss: 393.09278297424316
Validation Loss: 65.44252419471741 Mean Difference: 1.6570779724121094


100%|██████████| 75/75 [00:02<00:00, 28.48it/s]


Epoch 4 Loss: 393.126344203949
Validation Loss: 66.07914996147156 Mean Difference: 1.657077995300293


100%|██████████| 75/75 [00:02<00:00, 26.59it/s]


Epoch 5 Loss: 393.05983805656433
Validation Loss: 65.79218029975891 Mean Difference: 1.6570779876708985


100%|██████████| 75/75 [00:02<00:00, 26.20it/s]


Epoch 6 Loss: 394.44569301605225
Validation Loss: 65.99908709526062 Mean Difference: 1.657078010559082


100%|██████████| 75/75 [00:02<00:00, 26.91it/s]


Epoch 7 Loss: 392.3798532485962
Validation Loss: 65.80016446113586 Mean Difference: 1.6570779876708985


 53%|█████▎    | 40/75 [00:01<00:01, 26.04it/s]


KeyboardInterrupt: 