Project of Sentiment Analysis on labelled dataset.

In [3]:
import re
import nltk
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

from nltk.corpus import stopwords
nltk.download('stopwords')
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
filepath = 'SJ_Unsupervised_NLP_data.txt'
df = pd.read_csv(filepath,delimiter='\t',names=['review', 'label'])

Pre processing I - Removing punctuations, special characters, etc.

In [0]:
review = df.iloc[:, 0].values
label = df.iloc[:, 1].values

processed_reviews = []

for sentence in range(0, len(review)):
    processed_rev = re.sub(r'[^\w\s]', '', str(review[sentence]))
    processed_rev= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_rev)
    processed_rev = re.sub(r'\s+', ' ', processed_rev, flags=re.I)
    processed_rev = re.sub(r'\d+', ' ', processed_rev)
    processed_rev = processed_rev.lower()
    processed_reviews.append(processed_rev)

In [6]:
data_clean = pd.DataFrame(processed_reviews)
data_clean.columns = ['reviews']
data_clean['senti_score'] = label
data_clean.head()

Unnamed: 0,reviews,senti_score
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


Pre processing II - Removing stopwords, etc.

In [0]:
stopword = stopwords.words('english')

def remove_stopwords(text):
    text = text.split()
    text = [word for word in text if not word in stopword]
    return text

def count_words(words):
  return Counter(words)

def split_in_words(no_punct_text):
    temp = " ".join(no_punct_text)
    split_text = temp.split(' ')
    return split_text

words = split_in_words(np.asarray(data_clean['reviews']))
words = [word for word in words if not word in stopword]
count = count_words(words)
clean_data = data_clean['reviews'].apply(lambda x:remove_stopwords(x))

Making vocabulary out of words in the train data.

In [0]:
def vocabulary(counts):
    vocab = {}
    sorted_counts = sorted(counts, key= counts.get, reverse=True)
    counter = 1
    for word in sorted_counts:
        vocab.update({counter:word})
        counter += 1
    return vocab

def vocabulary_to_integer(vocab):
    vocab_to_int = {}
    counter = 1
    for word in vocab:
        vocab_to_int.update({vocab[word]:counter})
        counter += 1
    return vocab_to_int

vocab = vocabulary(count)
vocab_to_int = vocabulary_to_integer(vocab)

In [0]:
reviews_ints = []
for review in np.asarray(clean_data):
    reviews_ints.append([vocab_to_int[word] for word in review])

In [0]:
def pad_features(reviews_ints, seq_length):
    padded_list = []
    for review in reviews_ints:
        if len(review) < seq_length:
            diff = seq_length - len(review)
            review = [0]*diff + review
        padded_list.append(review[:seq_length])
    return np.asarray(padded_list)

seq_length = len(max(reviews_ints, key=len))

features = pad_features(reviews_ints, seq_length=seq_length)

Wrapping the data in torch TensorDataset.

In [0]:
X = features
y = np.asarray(data_clean['senti_score'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

batch_size_train = 35
batch_size_test = 30

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size_train)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size_test)

LSTM model definition which will be used to perform Sentiment Analysis.

In [0]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentLSTM, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(0.3)
        
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)

        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] 
        
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [43]:
train_on_gpu=torch.cuda.is_available()

vocab_size = len(vocab_to_int)+1 
output_size = 1
embedding_dim = 16
hidden_dim = 128
n_layers = 2

net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentLSTM(
  (embedding): Embedding(1744, 16)
  (lstm): LSTM(16, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [0]:
lr=0.01

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [72]:
epochs = 4 

counter = 0
print_every = 4
clip=5 

if(train_on_gpu):
    net.cuda()

net.train()

for e in range(epochs):
    h = net.init_hidden(batch_size_train)
  
    for inputs, labels in train_loader:
        counter += 1
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()
        h = tuple([each.data for each in h])
        net.zero_grad()
        output, h = net(inputs, h)

        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        
        if counter % print_every == 0:
          print("Epoch: {}/{}...".format(e+1, epochs),
                "Step: {}...".format(counter),
                "Loss: {:.6f}...".format(loss.item()))

Epoch: 1/4... Step: 4... Loss: 0.657721...
Epoch: 1/4... Step: 8... Loss: 0.676270...
Epoch: 1/4... Step: 12... Loss: 0.683351...
Epoch: 1/4... Step: 16... Loss: 0.660371...
Epoch: 1/4... Step: 20... Loss: 0.705865...
Epoch: 2/4... Step: 24... Loss: 0.655850...
Epoch: 2/4... Step: 28... Loss: 0.612159...
Epoch: 2/4... Step: 32... Loss: 0.534427...
Epoch: 2/4... Step: 36... Loss: 0.704319...
Epoch: 2/4... Step: 40... Loss: 0.533397...
Epoch: 3/4... Step: 44... Loss: 0.555116...
Epoch: 3/4... Step: 48... Loss: 0.400504...
Epoch: 3/4... Step: 52... Loss: 0.362035...
Epoch: 3/4... Step: 56... Loss: 0.458963...
Epoch: 3/4... Step: 60... Loss: 0.348555...
Epoch: 4/4... Step: 64... Loss: 0.249374...
Epoch: 4/4... Step: 68... Loss: 0.134352...
Epoch: 4/4... Step: 72... Loss: 0.497276...
Epoch: 4/4... Step: 76... Loss: 0.429628...
Epoch: 4/4... Step: 80... Loss: 0.074242...


In [73]:
test_losses = []
num_correct = 0

h = net.init_hidden(batch_size_test)

net.eval()

for inputs, labels in test_loader:
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    output, h = net(inputs, h)
    
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    pred = torch.round(output.squeeze())
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))

test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.699
Test accuracy: 0.693


Predicting on a test statement.

In [0]:
test_review_neg = 'bad phone.'

In [75]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() 
    test_text = ''.join([c for c in test_review if c not in punctuation])
    test_words = test_text.split()
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints
    
test_ints = tokenize_review(test_review_neg)
print(test_ints)

[[58, 2]]


In [76]:
features = pad_features(test_ints, seq_length)

feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 16])


In [0]:
def predict(net, test_review, sequence_length=200):
    
    net.eval()

    test_ints = tokenize_review(test_review)
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)

    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    output, h = net(feature_tensor, h)

    pred = torch.round(output.squeeze()) 
    print('Prediction value: {:.6f}'.format(output.item()))
    
    if(pred.item()==1):
        print("Positive review!")
    else:
        print("Negative review!")

In [78]:
predict(net, test_review_neg, seq_length)

Prediction value: 0.022982
Negative review!
