# Sentence Emotion Detection Model

This notebook contains code we used to train our model that uses sentiment analysis to predict the emotion of the use based on a journal like sentence used as input

## Dataset

Our datasets 

Install SpaCy and other libraries


In [1]:
!pip install --upgrade torch==1.7.1 torchtext==0.8.1 torchvision==0.8.2

Requirement already up-to-date: torch==1.7.1 in /usr/local/lib/python3.7/dist-packages (1.7.1)
Requirement already up-to-date: torchtext==0.8.1 in /usr/local/lib/python3.7/dist-packages (0.8.1)
Requirement already up-to-date: torchvision==0.8.2 in /usr/local/lib/python3.7/dist-packages (0.8.2)


In [2]:
import torch, torchtext
from torch import nn, optim, functional as F
import pandas as pd, csv
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pdb

Import tweets.csv from public dropbox link

In [3]:
!wget -O tweets.csv https://www.dropbox.com/s/i0nb9una5iba8u8/tweet_emotions_orig.csv?dl=0

--2021-04-05 22:17:22--  https://www.dropbox.com/s/i0nb9una5iba8u8/tweet_emotions_orig.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/i0nb9una5iba8u8/tweet_emotions_orig.csv [following]
--2021-04-05 22:17:22--  https://www.dropbox.com/s/raw/i0nb9una5iba8u8/tweet_emotions_orig.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc05f1024eb657e257e365b65a7e.dl.dropboxusercontent.com/cd/0/inline/BMGrglborT7azjtNJUlqETnIWjDxAR_fPgWKhnx1_xPpxRs4idMwID1U27c_kbN53QPvBdgEBVBMB_lXyFCdAG3UxAnGhT4qOMpRnl2J_6aMS-pcTX_abluqpb8YuGpOk-5tbWWq75u3PlVVhzLzDSny/file# [following]
--2021-04-05 22:17:23--  https://uc05f1024eb657e257e365b65a7e.dl.dropboxusercontent.com/cd/0/inline/BMGrglborT7azjtNJUlqETnIWjDxAR_fPgWKhnx1_xPpxRs4idMw

In [4]:
tweets = pd.read_csv('/content/tweets.csv')

In [5]:
tweets

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


Sentiments into an array for later use

In [6]:
tweets.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [7]:
sentiment = ['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger']

Define Dataset for tweets

In [8]:
class Sentences(torch.utils.data.Dataset):
    def __init__(self, fn):
        lengths = []
        convert = { u: n for n, u in enumerate(fn['sentiment'].unique()) }
        fn['sentiment'] = fn['sentiment'].apply(lambda u: convert[u])              # 12 unique words should be assigned integers starting from 0
        tokenizer = torchtext.data.utils.get_tokenizer('spacy', 'en_core_web_sm')  # tokenizer using spaCy
        for i in range(len(tweets['content'])):
          lengths.append(len(tokenizer(tweets['content'].iat[i])))                 # store the number of tokens in each tweet to beused in getitem
        string = ' '.join([tweets['content'].iat[i] 
                           for i in range(len(tweets['content']))])                # combine everything into one single string
        toks = tokenizer(string)                                                   # tokenize the single string

        self.vocab = torchtext.vocab.build_vocab_from_iterator([toks])
        self.sentiment = fn['sentiment'].values
        self.text = fn['content'].values
        self.length = lengths
        self.toks = torch.LongTensor([self.vocab[tok] for tok in toks])

    def __len__(self):
        return len(self.length)

    def __getitem__(self, i):
        sum = 0
        for x in range(i):
          sum += self.length[x]
        return (self.sentiment[i], self.toks[sum: sum + self.length[i]])          # return the sentiment and related tokns for a specific tweet

In [9]:
ds_full = Sentences(tweets)
n_train = int(0.8 * len(ds_full))
n_test = len(ds_full) - n_train
rng = torch.Generator().manual_seed(291)
ds_train, ds_test = torch.utils.data.random_split(ds_full, [n_train, n_test], rng)

1lines [00:00,  9.37lines/s]


Check outputs are correct

In [10]:
ds_test[100]

(4,
 tensor([  83,   20,    9,  140,   37,    7,  390,    8,  230, 3497,   24,   11,
          303,    3]))

In [11]:
' '.join([ds_full.vocab.itos[x] for x in ds_test[100][1]])

"It 's a sad day , found the first scratch on my car ."

In [12]:
sentiment[ds_test[100][0]]

'worry'

In [13]:
len(ds_full.toks)

648583

MODEL

Model with embedding and LSTM

In [14]:
class SentenceModel(nn.Module):
      def __init__(self, vocab_size, embedding_dim, lstm_dim, n_cats, n_layers = 2, drop_prob = 0.5):
        super().__init__()                                                      #constructor for parent class
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)          #use word embeddings 
        self.lstm = torch.nn.LSTM(embedding_dim, lstm_dim, n_layers,
                                  dropout=drop_prob, batch_first=True)          #LSTM layer
        self.linear = nn.Linear(lstm_dim, n_cats)
        nn.init.xavier_uniform_(self.embedding.weight.data)
        nn.init.xavier_uniform_(self.linear.weight.data)
        
      def forward(self, text):
        emb = self.embedding(text)
        lstm_out, _ = self.lstm(emb)
        out = self.linear(lstm_out)
        return torch.mean(out, dim=1)

In [15]:
device = torch.device('cpu')

def run_test(model, ds, crit):
    preds = []                                                                  # array to store predictions
    batch_size = 1                                                              # change batch size here
    model.eval()
    total_loss, total_acc = 0, 0
    ldr = torch.utils.data.DataLoader(ds)
    for labs, txts in ldr:                                                
        labs, txts = labs.to(device), txts.to(device)
        with torch.no_grad():
            outs = model(txts)
            loss = crit(outs, labs)
            total_loss += loss.item()
            total_acc += (outs.argmax(1) == labs).sum().item()
            preds.append(outs.argmax(1))                                        # append all the predictions to an array
    return total_loss / len(ds), total_acc / len(ds), preds, batch_size         # added array return value 'preds' and batchsize

def run_train(model, ds, crit, opt, sched):
    model.train()
    total_loss, total_acc = 0, 0
    ldr = torch.utils.data.DataLoader(ds)
    for labs, txts in tqdm(ldr, leave=False, desc='train iter'):          
        opt.zero_grad()
        labs, txts = labs.to(device), txts.to(device)
        outs = model(txts)
        loss = crit(outs, labs)
        loss.backward()
        opt.step()
        total_loss += loss.item()
        total_acc += (outs.argmax(1) == labs).sum().item()
    sched.step()
    return total_loss / len(ds), total_acc / len(ds)

def run_all(model, test_ds, train_ds, crit, opt, sched, n_epochs=10):
    for epoch in tqdm(range(n_epochs), desc='epochs'):
        train_loss, train_acc = run_train(model, train_ds, crit, opt, sched)
        test_loss, test_acc, _, _ = run_test(model, test_ds, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f} acc {train_acc:.4f}   test loss {test_loss:.6f} acc {test_acc:.4f}')   

In [16]:
model = SentenceModel(len(ds_full.vocab), 32, 1, len(tweets.sentiment.unique()))
model.to(device);
crit = nn.CrossEntropyLoss().to(device)
opt = optim.SGD(model.parameters(), lr=1.0)
sched = optim.lr_scheduler.StepLR(opt, 10, gamma=0.1)

In [None]:
run_all(model, ds_test, ds_train, crit, opt, sched, 10)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=10.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='train iter', max=32000.0, style=ProgressStyle(description…

epoch 0   train loss 2.373658 acc 0.1824   test loss 2.281572 acc 0.2111


HBox(children=(FloatProgress(value=0.0, description='train iter', max=32000.0, style=ProgressStyle(description…

epoch 1   train loss 2.366305 acc 0.1838   test loss 2.281168 acc 0.2190


HBox(children=(FloatProgress(value=0.0, description='train iter', max=32000.0, style=ProgressStyle(description…

epoch 2   train loss 2.366858 acc 0.1852   test loss 2.278407 acc 0.2155


HBox(children=(FloatProgress(value=0.0, description='train iter', max=32000.0, style=ProgressStyle(description…

epoch 3   train loss 2.365404 acc 0.1865   test loss 2.278868 acc 0.2144


HBox(children=(FloatProgress(value=0.0, description='train iter', max=32000.0, style=ProgressStyle(description…