<a href="https://colab.research.google.com/github/renkexinmay/LSTM-resposne-classification/blob/master/response_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Note:**

To better classify the responses, it would be helpful if the model can take into account the quantitative relationship between responses and questions.
Thus, the questions and reponses should be seperate in input set.


**Reference:**

LSTM sentiment analysis (Keras): https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e

RNN sentiment analysis (PyTorch): https://github.com/bentrevett/pytorch-sentiment-analysis

**Unsupervised learning:**
...

In [0]:
# 1: LSTM in Keras (ref:  https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e)

## load data
from keras.datasets import imdb

Using TensorFlow backend.


In [0]:
## vectorize the words and sentences + pre-trained vectors


In [0]:
vocabulary_size = 50000 # Top most frequent words to consider. Any less frequent word will appear as oov_char value in the sequence data.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)

print('Loaded data with {} training samples and {} test samples'.format(len(X_train), len(X_test)))
print('Max sentence length: {}'.format(len(max(X_train, key = len))))
print('Min sentence length: {}'.format(len(min(X_train, key = len))))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Loaded data with 25000 training samples and 25000 test samples
Max sentence length: 2494
Min sentence length: 11


In [0]:
## Pad sequences: make the input data have save length (padding short ones with 0)
from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen = max_words)
X_test = sequence.pad_sequences(X_test, maxlen = max_words)


In [0]:
## model
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 32
model = Sequential() # a linear stack of layers
model.add(Embedding(vocabulary_size, embedding_size, input_length = max_words)) # add the first layer, embedding each word by a 32-dim vector, instead of 5000-dim one-hot vector
model.add(LSTM(100))
model.add(Dense(1, activation = 'softmax')) #softmax for multi category classification; sigmoid for binary classification

## question: how to determine #layers of LSTM; function of Dense layer; how to get #param of LSTM & Dense

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           1600000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,653,301
Trainable params: 1,653,301
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.compile(loss = 'binary_crossentropy', # categorical_crossentropy for multi category classification; binary_crossentropy for binary classification
             optimizer = 'adam',
             metrics = ['accuracy'])

## question: binary_crossentropy? categorical_crossentropy?

In [0]:
batch_size = 64
num_epochs = 5

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

# train the model
model.fit(X_train2, y_train2, validation_data = (X_valid, y_valid),
batch_size = batch_size, epochs = num_epochs)

Train on 24936 samples, validate on 64 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f88bd79a940>

In [0]:
scores = model.evaluate(X_test, y_test, verbose = 0)
print('Test accurary', scores[1])

.

.

.


.


.

.


.

.

.

.

The accuracy is only 0.5. Let's try something else

RNN-based classification using PyTorch

In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
!pip install https://github.com/pytorch/text/archive/master.zip
from torchtext import data


Collecting https://github.com/pytorch/text/archive/master.zip
  Downloading https://github.com/pytorch/text/archive/master.zip
[K     \ 880kB 95.0MB/s
Building wheels for collected packages: torchtext
  Running setup.py bdist_wheel for torchtext ... [?25l- \ done
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-8rm4kf_z/wheels/5a/86/3d/30ae7dfdfeb1748bb11b3da173fb9634141fbb39e9e9847317
Successfully built torchtext
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0


In [0]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [0]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:06<00:00, 13.8MB/s]


In [0]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of test examples: {len(test_data)}')

Number of training examples: 25000
Number of test examples: 25000


In [0]:
print(vars(test_data.examples[0]))

{'text': ['Rachael', 'Ray', 'appeals', 'to', 'viewers', 'of', 'all', 'ages', 'and', 'backgrounds', ',', 'beginner', 'cooks', 'or', '"', 'seasoned', '"', 'veterans', '.', 'You', "'ll", 'be', 'dazzled', 'with', 'a', 'variegated', 'presentation', 'of', 'delectable', 'yet', 'time', '-', 'efficient', 'dishes', ',', 'jazzed', 'up', 'with', 'her', 'unique', 'brand', 'of', 'spunk', 'and', 'candor', '.', 'Most', 'importantly', ',', 'this', 'hip', 'chic', 'keeps', 'her', 'audience', 'drawn', 'in', 'by', 'stimulating', 'all', 'five', 'senses', '.', 'Let', 'me', 'explain', '.', 'Her', 'program', 'provides', 'enlightenment', 'to', 'your', 'visual', 'sense', ',', 'auditory', 'sense', ',', 'and', 'sense', 'of', 'feeling', 'through', 'a', 'rich', ',', 'luminous', 'ambient', 'backdrop', ',', 'light', '-', 'hearted', ',', 'casual', ',', 'yet', 'engaging', 'topics', ',', 'eye', '-', 'pleasing', ',', 'appetite', 'wrenching', 'meals', ',', 'and', 'her', 'hearty', 'smile', 'and', 'laugh', ',', 'which', 'wil

In [0]:
import random

train_data, valid_data = train_data.split(random_state = random.seed(SEED))


In [0]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 17500
Number of validation examples: 7500


In [0]:
TEXT.build_vocab(train_data, max_size = 25000, vectors = "glove.6B.100d")
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:09, 6.67MB/s]                           
100%|█████████▉| 398651/400000 [00:16<00:00, 24940.40it/s]

In [0]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}") # <pad> for blanks in short sentence; <unk> for less frequent words
print(f"Unique tokens in LABELS vocabulary: {len(LABEL.vocab)}")
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABELS vocabulary: 2
[('the', 202312), (',', 192493), ('.', 164175), ('and', 108868), ('a', 108719), ('of', 100745), ('to', 93419), ('is', 75919), ('in', 61168), ('I', 53720), ('it', 53417), ('that', 48849), ('"', 44694), ("'s", 43428), ('this', 42014), ('-', 36969), ('/><br', 35724), ('was', 34871), ('as', 30245), ('with', 29966)]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(<function _default_unk_index at 0x7f09318cff28>, {'neg': 0, 'pos': 1})


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

In [0]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
    super(RNN, self).__init__()
    # input dim = voc size = one-hot vec dim
    
    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = bidirectional, dropout = dropout)
    self.fc = nn.Linear(hidden_dim*2, output_dim)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, x):
    
    #x = [sent len, batch size]
    embedded = self.dropout(self.embedding(x))
    
    #embedded = [sen len, batch size, emb dim]
    
    output, (hidden, cell) = self.rnn(embedded)
    
    #output = [sen len, batch size, hid dim]
    #hidden, cell = [num layers * num directions, batch size, hid dim]
    
    # concat the final forward (hidden[-2,:,:] and backward (hidden[-1, :,:])) hidden layers
    # and apply dropout
    
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim = 1))
    
    # hidden = [batch size, hid dim * num directions]
       
    return self.fc(hidden.squeeze(0))

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100 # = pre-trained GloVe vectors loaded
HIDDEN_DIM = 256
OUTPUT_DIM = 1

N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)


In [0]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [0]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0378, -0.0032,  0.2337,  ..., -0.1429, -0.5320, -0.9142],
        [ 0.4106, -0.6026,  0.1699,  ..., -0.7372, -0.0973, -0.1677],
        [ 0.0501, -0.0960, -0.1318,  ...,  0.2636, -0.5630, -0.3210]])

In [0]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss() #grad & loss

model = model.to(device)
criterion = criterion.to(device)
# place them on GPU

In [0]:

def binary_accuracy(preds, y):
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  acc = correct.sum()/len(correct)
  return acc

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  
  model.train()
  
  for batch in iterator:
    
    optimizer.zero_grad()
    
    predictions = model(batch.text).squeeze(1)
    
    loss = criterion(predictions, batch.label)
    
    acc = binary_accuracy(predictions, batch.label)
    
    loss.backward()
    
    optimizer.step()
    
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    
    return epoch_loss/ len(iterator), epoch_acc/ len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  
  model.eval()
  
  with torch.no_grad():
    for batch in iterator:
      predictions = model(batch.text).squeeze(1)
      loss = criterion(predictions, batch.label)
      
      acc = binary_accuracy(predictions, batch.label)
      
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      
  return epoch_loss/ len(iterator), epoch_acc/ len(iterator)

In [0]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  
  print(f'| Epoch: {epoch+1:02} | Train Loss:{train_loss:.3f}  | Train Accuracy: {train_acc*100:.2f}% | Val.Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.25}% |')

| Epoch: 01 | Train Loss:0.003  | Train Accuracy: 0.18% | Val.Loss: 0.694 | Val. Acc: 50.73711157350216893746619% |
| Epoch: 02 | Train Loss:0.003  | Train Accuracy: 0.14% | Val.Loss: 0.693 | Val. Acc: 50.73711157350216893746619% |
| Epoch: 03 | Train Loss:0.003  | Train Accuracy: 0.20% | Val.Loss: 0.693 | Val. Acc: 50.75476694915253972339997% |


In [0]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

In [0]:
#predict function