In [86]:
import pandas as pd
import torch
import torch.nn.functional as F
# !pip install torchtext==0.10.0
import torchtext
import torchtext.legacy
import random

In [87]:
# emotions = ['worry','neutral','happiness','sadness','love','surprise']
TEXT = torchtext.legacy.data.Field(tokenize='spacy',tokenizer_language='en_core_web_sm',use_vocab=True, batch_first=True, include_lengths=True)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

In [88]:
fields = [('sentiment', LABEL),('lemma_str', TEXT)]
dataset = torchtext.legacy.data.TabularDataset(path='filename2.csv', format='csv',skip_header=True, fields=fields)

In [89]:
RANDOM_SEED=42
train_data, test_data = dataset.split(split_ratio=[0.8, 0.2],random_state=random.seed(RANDOM_SEED))
train_data, valid_data = train_data.split(split_ratio=[0.85, 0.15],random_state=random.seed(RANDOM_SEED))

In [90]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# train_y = le.fit_transform(train_y.values)
# test_y = le.transform(test_y.values)

In [92]:
VOCABULARY_SIZE=5000
vectors = torchtext.vocab.Vectors(name='glove.6B.50d.txt')
TEXT.build_vocab(train_data,vectors=vectors,min_freq=4,max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

In [109]:
print(list(LABEL.vocab.stoi.items()))

[('worry', 0), ('neutral', 1), ('sadness', 2), ('happiness', 3), ('love', 4), ('surprise', 5)]


In [94]:
import torch
import torchtext.vocab as vocab
glove = vocab.GloVe(name='6B', dim=50)

print('Loaded {} words'.format(len(glove.itos)))

def get_word(word):
    return glove.vectors[glove.stoi[word]]

Loaded 400000 words


In [95]:
# Didn't use this method but tried this
import numpy as np
matrix_len = VOCABULARY_SIZE
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(TEXT.vocab.stoi.items()):
    try: 
        weights_matrix[i] = get_word(word[0])
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))

print(weights_matrix.shape)

(5000, 50)


In [96]:
def embedding_layer(weights_matrix, non_trainable=False):
    num_embeddings = weights_matrix.shape[0]
    embedding_dim = weights_matrix.shape[1]
    emb_layer = torch.nn.Embedding.from_pretrained(torch.Tensor(weights_matrix))
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer

In [110]:
LEARNING_RATE = 0.005
BATCH_SIZE = 16
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
NUM_CLASSES = 6

In [111]:
train_loader, valid_loader, test_loader = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, sort_within_batch=False,
        sort_key=lambda x: len(x.lemma_str), device=DEVICE
)

In [119]:
import torch.nn as nn
class LSTM(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,n_layers):
        super().__init__()
        self.n_layers=n_layers
        self.embedding = torch.nn.Embedding(input_dim,embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim,n_layers,bidirectional=True,batch_first=True)        
        self.fc = torch.nn.Linear(hidden_dim*2, output_dim)
        
    def forward(self, text, batch_len):
        self.embedding.weight.data.copy_(TEXT.vocab.vectors)
        embedded = self.embedding(text)
        pack_out = nn.utils.rnn.pack_padded_sequence(
            embedded, batch_len,batch_first=True,enforce_sorted=False)
        output, (hidden, cell) = self.lstm(pack_out)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1)
        output = self.fc(hidden)
        return output

In [120]:
torch.manual_seed(RANDOM_SEED)
model = LSTM(input_dim=len(TEXT.vocab),embedding_dim=EMBEDDING_DIM,hidden_dim=HIDDEN_DIM,output_dim=NUM_CLASSES,n_layers=2)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [121]:
#Training
for epoch in range(NUM_EPOCHS):
    for batch_idx, batch in enumerate(train_loader):
        data, length = batch.lemma_str
        targets = batch.sentiment
        
        scores = model(data,length)
        loss = F.cross_entropy(scores, targets)

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

    print(f"Epoch {epoch} is done")

Epoch 0 is done
Epoch 1 is done
Epoch 2 is done
Epoch 3 is done
Epoch 4 is done
Epoch 5 is done
Epoch 6 is done
Epoch 7 is done
Epoch 8 is done
Epoch 9 is done
Epoch 10 is done
Epoch 11 is done
Epoch 12 is done
Epoch 13 is done
Epoch 14 is done


In [124]:
torch.save(model, "/Users/rames/Documents/IITB/Others/SOC 2022/final_model_with_5_emotions.pt")

In [117]:
# average_loss=0
# for batch_idx, batch in enumerate(valid_loader):
#     data_test = batch.lemma_str.to(device=DEVICE)
#     actual = batch.sentiment.to(device=DEVICE)

#     pred = model(data_test)
#     average_loss += F.cross_entropy(pred, actual)
#     print(pred,actual)
#     break

In [123]:
import numpy as np
import sklearn.metrics
val_loss = 0.

predlist=torch.zeros(0,dtype=torch.long, device='cpu')
lbllist=torch.zeros(0,dtype=torch.long, device='cpu')

for batch_idx, batch in enumerate(test_loader):
    data, length = batch.lemma_str
    y_pred = model(data,length)
    val_loss += F.cross_entropy(y_pred, batch.sentiment).item()

    val_preds =F.softmax(y_pred)
    val_preds = val_preds.argmax(axis=1)
#     print(val_preds)
#     break
    predlist=torch.cat([predlist,val_preds.view(-1).cpu()])
    lbllist=torch.cat([lbllist,batch.sentiment.view(-1).cpu()])
        
#     print(val_preds,batch.sentiment.numpy())

conf_mat=sklearn.metrics.confusion_matrix(lbllist.numpy(), predlist.numpy())
print(conf_mat)

# Per-class accuracy
class_accuracy=100*conf_mat.diagonal()/conf_mat.sum(1)
print(class_accuracy)

  val_preds =F.softmax(y_pred)


[[677 436 254 136  76  80]
 [383 758 115 223 104  69]
 [397 210 230  77  43  31]
 [202 276  57 372 125  56]
 [102 106  37 204 246  36]
 [105 158  32  84  40  32]]
[40.80771549 45.88377724 23.27935223 34.19117647 33.65253078  7.09534368]
