In [1]:
import numpy as np
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader

#### load data

In [2]:
# read data from text files
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [3]:
reviews[0:1000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [4]:
labels[:20]

'positive\nnegative\npo'

> note reviews and labels are split by \n character. 

#### Data Prep
* lowercase
* remove punctuation
* encode words to integers
* encode labels to 0 and 1
* remove outliers
* pad and truncate reviews
* train validation test splits
* convert data sets into torch datasets and prep batches

In [5]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
reviews = reviews.lower() # lowercase, standardize
revs = [rev for rev in reviews if rev not in punctuation] # loops through each character in the reviews
all_text = ''.join(revs) # put them all back together

In [7]:
all_text[0:100]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life '

In [8]:
words = all_text.split() # list of words

In [9]:
words[0]

'bromwell'

##### encoding:
 * get word frequencies
 * create dictionary as word -> integer where most frequent word get the integer 1 and the least frequent get the len(unique_words)+1

In [10]:
word_counts_dict = Counter(words) # get word counts
word_counts = {k: v for k, v in sorted(word_counts_dict.items(), reverse=True,key=lambda item: item[1])} # sort by counts

> note: to speed up, removing most common words

In [15]:
all_text[0:1000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent 

In [16]:
word_counts_ ={}
m=0
for key, value in word_counts.items():
    if m>30:
        word_counts_[key] = value
    m+=1

In [17]:
#word to int
word_to_int = {}
int_ = 1
for key,value in word_counts_.items():
    word_to_int[key] = int_
    int_+=1

In [18]:
# convert words in the reviews into numbers
reviews_split = all_text.split('\n')
reviews_int = []
print(len(reviews_split))
for review in reviews_split:
    review_int =[]
    words = review.split()
    for word in words:
        try:
            review_int.append(word_to_int[word])
        except:
            review_int.append(0) # replace the removed words with a 0
    reviews_int.append(review_int)

25001


In [19]:
# print tokens in first review
print('Tokenized review: \n', reviews_int[:1])

Tokenized review: 
 [[20994, 277, 0, 0, 1019, 176, 0, 2107, 1, 0, 140, 26, 0, 18, 50, 5754, 13, 351, 79, 109, 0, 5163, 29, 123, 0, 0, 4944, 5821, 444, 40, 0, 229, 0, 20994, 277, 0, 1947, 0, 43, 2364, 0, 582, 42, 0, 5163, 0, 24072, 0, 1952, 10135, 0, 5755, 1468, 5, 20, 35, 173, 114, 36, 1168, 5163, 19838, 0, 37411, 0, 0, 190, 852, 0, 2957, 40, 0, 0, 5756, 0, 655, 0, 36, 1468, 23, 0, 185, 0, 352, 0, 31, 0, 1375, 3655, 752, 0, 3452, 149, 0, 351, 0, 1181, 13552, 1, 277, 0, 318, 310, 2882, 0, 112, 96, 0, 7659, 0, 0, 98, 5163, 1375, 2295, 0, 20994, 277, 0, 497, 0, 78, 1417, 0, 29, 512, 71, 0, 20994, 277, 0, 196, 4115, 17, 0, 2180, 0, 0, 184, 0]]


##### encode labels

In [20]:
labels = labels.split('\n')
label_counts = Counter(labels)

In [21]:
label_counts

Counter({'positive': 12500, 'negative': 12500, '': 1})

In [22]:
encoded_labels = []
for label in labels:
    if label == 'positive':
        encoded_labels.append(0)
    else:
        encoded_labels.append(1)

In [23]:
sum(encoded_labels),len(encoded_labels)

(12501, 25001)

##### remove outliers

In [24]:
# only check for zero lenght reviews at this point
review_lengths = np.array([len(review) for review in reviews_int])
zero_index = np.argwhere(review_lengths==0)
zero_index

array([[25000]], dtype=int64)

In [25]:
np.mean(review_lengths),np.median(review_lengths),np.max(review_lengths),np.min(review_lengths)

(240.79820807167712, 179.0, 2514, 0)

In [26]:
# remove zero length reviews and labels
for index_ in zero_index:
    print(index_[0])
    encoded_labels.pop(index_[0])
    reviews_int.pop(index_[0])


25000


In [27]:
len(encoded_labels),len(reviews_int)

(25000, 25000)

##### pad or truncate
* add 0s to the left if review is short
* cut from right if review is long

In [28]:
def pad_trun(reviews, seq_length):
    
    features = np.zeros((len(reviews),seq_length),dtype=int)
    for j, review in enumerate(reviews):
        features[j,-len(review):] = review[:seq_length]
    return features

In [29]:
seq_length = 100 

features = pad_trun(reviews_int, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(reviews_int), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."
# print first 10 values of the first 30 batches 
print(features[:30,:10])

[[20994   277     0     0  1019   176     0  2107     1     0]
 [   32     0     0    94     5    16  7441  1364     0     0]
 [22351    11 46387     0   675 17108  3358    16    46     4]
 [ 4474   474     0     0  3311   131  8281  1621     0  4788]
 [  489    88    82     3 16341  1785  3706    86   854 20999]
 [    0     0  3606   110     0   391     0   241    29  4324]
 [    0     0   661     0    59  2125     0 11697     0  2787]
 [  755   264     0    91     0     0   388     0     0     4]
 [    0     0     0     0   748  3656  2787     0     0     0]
 [   23     0     0    85    29   767   521    40   333     0]
 [    0   184     0     0  1655  2038  1534   836     0     0]
 [    0   694     0    78  1353   137     0   291     0     0]
 [  384    61     4   451     0  2904    63     0   508  1734]
 [    0   299   547     3     0   131   717  2700     0   294]
 [    0     0 10140  5274  1915   658   413     0   249   642]
 [    0    58     0    91     5     0  1770     0 11702

##### train valid test split
* train 80%
* valid 10%
* test 10%

In [31]:
encoded_labels = np.array(encoded_labels)
train_size = int(0.7*len(features))
train_x, rest_x = features[:train_size],features[train_size:]
train_y, rest_y = encoded_labels[:train_size],encoded_labels[train_size:]
val_x, test_x = rest_x[:int(0.5*len(rest_x))],rest_x[int(0.5*len(rest_x)):]
val_y, test_y = rest_y[:int(0.5*len(rest_x))],rest_y[int(0.5*len(rest_x)):]

In [32]:
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_y.shape), 
      "\nValidation set: \t{}".format(val_y.shape),
      "\nTest set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(17500, 100) 
Validation set: 	(3750, 100) 
Test set: 		(3750, 100)
			Feature Shapes:
Train set: 		(17500,) 
Validation set: 	(3750,) 
Test set: 		(3750,)


##### dataloader

In [33]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# shuffle sets 
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [34]:
# get a sample out from dataloaders
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print(sample_x[0])
print(sample_y[0])

tensor([ 2054,     0,     0,   274,    32,     0,     0,  2975,    84,     0,
            0,   386,     0,     0,  3161,     0,    26,     0,  7463,   290,
         2743,   865,     0,    16,     0,  1244,   195,  5746,  7785,   265,
         1999,  1937,  1516,     0,   209,  2480,   901,    14,     0,    79,
         3117,     0,    10,  1394,    10,   296,  6532,     0, 33859,    22,
         9489,    10,    53,  1262,  2762,  3759,   265,     0,   153,  1999,
          696,     0,  6100, 14193,     0,  1659,  1394,  4684,   209,  2741,
         5939,  6171,  6497,     0, 11991,  8334,  4536,   317,     0, 14639,
            0,   115,    32,     7,     0, 26636, 16889,    10,    53,  1262,
         3006,   155,  1990, 15094,  2235,  2426,     0,    94,    10,    86],
       dtype=torch.int32)
tensor(0, dtype=torch.int32)


## Model

In [35]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

No GPU available, training on CPU.


In [36]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

## Instantiate the model

In [37]:
len(word_to_int)

74041

In [38]:
vocab_size = len(word_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 50
hidden_dim = 64
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(74042, 50)
  (lstm): LSTM(50, 64, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [39]:
# loss and optimization functions
lr=0.01

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [40]:
# training params

epochs = 1

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/1... Step: 100... Loss: 0.693606... Val Loss: 0.693245
Epoch: 1/1... Step: 200... Loss: 0.655229... Val Loss: 0.652904
Epoch: 1/1... Step: 300... Loss: 0.599601... Val Loss: 0.628703


#### Test Results

In [None]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    # get predicted outputs
    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))


## Test the model on a fake review

In [None]:
positive_review = 'this is one of the good ones. Everyone should consider watching it. The story, acting and effects were top class.'
positive_review2 = 'Despite many commented that this moview is garbage or too long or not worthy, I went to see it anyways. And it blew my mind one of the best movies I have seen in recent year.'

In [None]:
# todo check if the words exist in the dictionary

In [None]:
# todo handle words that are not included in the dictionary

In [None]:
def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    
    words = review.split()
    for word in test_words:
        try:
            test_ints.append(word_to_int[word])
        except:
            test_int.append(0) # replace words not found in the dictionary with 0
    

    return test_ints

In [None]:
# test sequence padding

features = pad_features(test_ints, seq_length)

print(features)

feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

In [None]:
def predict(net, test_review, sequence_length=100):
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")