# Sentiment Analysis in Pytorch

In [1]:
#library imports
import sys
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import time


import pickle
from collections import Counter,defaultdict

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error,accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm
tqdm.pandas()
# from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
# PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

## Multiclass Text Classification

We are going to predict item ratings from customer reviews, based on this dataset from Kaggle:
https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Load the training reviews from a CSV file in the working directory.
# (Colab/Drive alternative kept for reference:)
# reviews = pd.read_csv("/content/drive/MyDrive/data/NLP_sentiment_analysis_data/train.csv")
reviews = pd.read_csv("train.csv")
print(reviews.shape)  # (rows, columns)
reviews.head()

(50000, 3)


Unnamed: 0.1,Unnamed: 0,reviews,ratings
0,0,"This book was very informative, covering all a...",4
1,1,I am already a baseball fan and knew a bit abo...,5
2,2,I didn't like this product it smudged all unde...,1
3,3,I simply love the product. I appreciate print ...,5
4,4,It goes on very easily and makes my eyes look ...,5


In [4]:
# reviews['Title'] = reviews['Title'].fillna('')
# reviews['Review Text'] = reviews['Review Text'].fillna('')
# reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']

In [5]:
# Keep only the relevant columns and compute each review's length in
# whitespace-separated words.
# .copy() makes the selection an independent frame, so the column assignments
# in this and later cells do not trigger pandas' SettingWithCopyWarning.
reviews = reviews[['reviews', 'ratings']].copy()
reviews['review_length'] = reviews['reviews'].apply(lambda x: len(x.split()))
reviews.head()

Unnamed: 0,reviews,ratings,review_length
0,"This book was very informative, covering all a...",4,10
1,I am already a baseball fan and knew a bit abo...,5,23
2,I didn't like this product it smudged all unde...,1,14
3,I simply love the product. I appreciate print ...,5,13
4,It goes on very easily and makes my eyes look ...,5,13


In [6]:
# Shift the 1-5 ratings to zero-based class labels 0-4.
zero_numbering = {stars: stars - 1 for stars in range(1, 6)}
reviews['ratings'] = reviews['ratings'].map(zero_numbering.__getitem__)
print(type(reviews['ratings']))
reviews.head()

<class 'pandas.core.series.Series'>


Unnamed: 0,reviews,ratings,review_length
0,"This book was very informative, covering all a...",3,10
1,I am already a baseball fan and knew a bit abo...,4,23
2,I didn't like this product it smudged all unde...,0,14
3,I simply love the product. I appreciate print ...,4,13
4,It goes on very easily and makes my eyes look ...,4,13


In [7]:
# Function for repeatability
def Random(seed_value):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU and, when available, every CUDA device). PyTorch has its own
    generator, so seeding only ``random`` and NumPy does not make torch-based
    randomness repeatable.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    # NOTE: PYTHONHASHSEED is deliberately not set here; it only takes effect
    # when it is defined before the Python interpreter starts.
    import random

    import numpy as np
    import torch

    # 1. Python built-in pseudo-random generator
    random.seed(seed_value)

    # 2. NumPy global pseudo-random generator
    np.random.seed(seed_value)

    # 3. PyTorch generators
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)


Random(10)


In [8]:
# Average review length, in whitespace-separated words.
reviews['review_length'].mean()

17.58756

In [9]:
#tokenization
# tok = spacy.load('en')
tok = spacy.load('en_core_web_sm')

# Patterns are compiled once here instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# Punctuation, digits, carriage returns, tabs and newlines.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize (text):
    """Split ``text`` into lower-cased tokens.

    Runs of non-ASCII characters are replaced by a space, the text is
    lower-cased, and punctuation, digits, carriage returns, tabs and newlines
    are each replaced by a space. The result is split with the spaCy
    tokenizer held in the module-level ``tok``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the text of each token.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [10]:
# Count the number of occurrences of each token across all reviews.
# Iterating the column directly avoids DataFrame.iterrows(), which builds a
# Series for every row; passing the Series also lets tqdm show a total.
counts = Counter()
for review_text in tqdm(reviews['reviews']):
    counts.update(tokenize(review_text))

50000it [00:08, 6085.60it/s]


In [11]:

# with open('/content/drive/MyDrive/data/NLP_sentiment_analysis_data/count.pickle', 'wb') as outputfile:
#   pickle.dump(counts,outputfile)

In [12]:
# with open('/content/drive/MyDrive/data/NLP_sentiment_analysis_data/count.pickle', 'rb') as inputfile:
#   counts=pickle.load(inputfile)
# with open('count.pickle', 'rb') as inputfile:
#   counts=pickle.load(inputfile)

In [13]:
# #deleting infrequent words
# print("num_words before:",len(counts.keys()))
# for word in list(counts):
#     if counts[word] < 2:
#         del counts[word]
# print("num_words after:",len(counts.keys()))

In [14]:
# Build the vocabulary: "" takes index 0 and "UNK" index 1; every counted
# token then receives the next free index, in the iteration order of `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {"": 0, "UNK": 1}
vocab2index.update((word, idx) for idx, word in enumerate(counts, start=2))

In [15]:
def encode_sentence(text, vocab2index, N=70):
    """Encode ``text`` as a fixed-length array of vocabulary indices.

    The text is tokenized with ``tokenize``; each token is looked up in
    ``vocab2index``, falling back to the index stored under "UNK". The result
    is truncated to ``N`` entries, or right-padded with zeros when shorter.

    Args:
        text: The string to encode.
        vocab2index: Mapping from token to integer index; must contain "UNK"
            whenever the text yields at least one token.
        N: Length of the returned array.

    Returns:
        An integer NumPy array of length ``N``.
    """
    token_ids = [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    kept = token_ids[:N]
    encoded = np.zeros(N, dtype=int)
    encoded[:len(kept)] = kept
    return encoded

In [16]:
# Encode every review as a fixed-length index array, with a tqdm progress bar.
reviews['encoded'] = reviews['reviews'].progress_apply(
    lambda review_text: np.array(encode_sentence(review_text, vocab2index))
)
print(type(reviews['encoded']))
reviews.head()

100%|██████████| 50000/50000 [00:02<00:00, 16788.78it/s]<class 'pandas.core.series.Series'>



Unnamed: 0,reviews,ratings,review_length,encoded
0,"This book was very informative, covering all a...",3,10,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 0, 0, ..."
1,I am already a baseball fan and knew a bit abo...,4,23,"[13, 14, 15, 16, 17, 18, 19, 20, 16, 21, 22, 2..."
2,I didn't like this product it smudged all unde...,0,14,"[13, 31, 32, 33, 2, 34, 35, 36, 9, 37, 38, 39,..."
3,I simply love the product. I appreciate print ...,4,13,"[13, 42, 43, 23, 34, 7, 13, 44, 45, 46, 47, 48..."
4,It goes on very easily and makes my eyes look ...,4,13,"[35, 50, 51, 5, 52, 19, 53, 38, 39, 54, 55, 19..."


In [17]:
# Check how balanced the dataset is: count the reviews per rating label.
Counter(reviews['ratings'])

Counter({3: 6871, 4: 33193, 0: 4059, 1: 2265, 2: 3612})

In [18]:
X = list(reviews['encoded'])
y = list(reviews['ratings'])

# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)

from sklearn.model_selection import train_test_split

# The rating classes are heavily imbalanced, so stratify on the labels to keep
# the class proportions the same in both splits. The explicit random_state
# makes the split reproducible regardless of the global NumPy seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=10
)


#### Pytorch Dataset

In [19]:
class ReviewsDataset(Dataset):
    """Dataset pairing encoded reviews with their labels.

    Args:
        X: Indexable collection of integer index sequences (one per review).
        Y: Indexable collection of labels, aligned with ``X``.
    """

    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for sample ``idx``.

        The indices come back as an int64 tensor. ``torch.Tensor(...)`` would
        convert them to float32 first, which cannot represent every integer
        above 2**24 and forces callers to cast back before an embedding lookup.
        """
        return torch.as_tensor(self.X[idx], dtype=torch.long), self.y[idx]

In [20]:
# Wrap the training and validation splits in ReviewsDataset objects.
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [21]:
class FocalLoss(nn.modules.loss._WeightedLoss):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * CE``.

    ``p_t`` is the predicted probability of the true class of each sample, so
    well-classified samples are down-weighted individually. With ``gamma=0``
    the loss equals ``F.cross_entropy`` (including its weighted mean).

    Args:
        weight: Optional 1-D tensor of per-class weights (the focal "alpha").
        gamma: Focusing parameter; larger values down-weight easy samples more.
        reduction: 'mean', 'sum' or 'none'.
    """

    def __init__(self, weight=None, gamma=2,reduction='mean'):
        # _WeightedLoss stores `weight` (as a buffer) and `reduction`.
        super(FocalLoss, self).__init__(weight,reduction=reduction)
        self.gamma = gamma

    def forward(self, input, target):
        """Compute the focal loss.

        Args:
            input: Logits of shape (N, C).
            target: Class indices of shape (N,).

        Returns:
            A scalar for 'mean'/'sum', or a tensor of shape (N,) for 'none'.

        Raises:
            ValueError: If ``reduction`` is not 'mean', 'sum' or 'none'.
        """
        # Per-sample, unweighted CE so that p_t is a true probability.
        ce_loss = F.cross_entropy(input, target, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        alpha = None
        if self.weight is not None:
            alpha = self.weight.to(device=input.device, dtype=input.dtype)[target]
            focal_loss = alpha * focal_loss

        if self.reduction == 'none':
            return focal_loss
        if self.reduction == 'sum':
            return focal_loss.sum()
        if self.reduction == 'mean':
            if alpha is not None:
                # Same normalisation as F.cross_entropy with class weights.
                return focal_loss.sum() / alpha.sum()
            return focal_loss.mean()
        raise ValueError("Unsupported reduction: %r" % (self.reduction,))


In [22]:
def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and focal loss, printing metrics after each epoch.

    Training batches are read from the module-level ``train_dl`` loader and
    validation runs on the module-level ``val_dl`` loader.

    Args:
        model: The network to train; only parameters with ``requires_grad``
            set are optimised.
        epochs: Number of passes over ``train_dl``.
        lr: Learning rate for Adam.
    """
    # Follow the model's own device instead of assuming CUDA, so the same
    # code also runs on a CPU-only machine.
    device = next(model.parameters()).device
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    criterion = FocalLoss(gamma=5)
    for _ in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in tqdm(train_dl):
            x = x.long().to(device)
            y = y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so the epoch average is per sample.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))
        # Brief pause between epochs (presumably so the progress bar and the
        # printed line do not interleave).
        time.sleep(0.5)
def validation_metrics (model, valid_dl):
    """Evaluate ``model`` on ``valid_dl``.

    Args:
        model: The network to evaluate; it is switched to eval mode.
        valid_dl: Iterable of ``(x, y)`` batches.

    Returns:
        A tuple ``(loss, accuracy, rmse)`` of Python floats: the per-sample
        average focal loss, the fraction of correct predictions, and the root
        mean squared error between predicted and true labels over ALL samples
        (not an average of per-batch RMSE values).
    """
    model.eval()
    device = next(model.parameters()).device
    criterion = FocalLoss(gamma=5)
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_sq_err = 0.0
    # No gradients are needed for evaluation; this saves memory and time.
    with torch.no_grad():
        for x, y in valid_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)
            pred = y_hat.argmax(dim=1)
            batch_n = y.shape[0]
            correct += (pred == y).sum().item()
            total += batch_n
            sum_loss += loss.item()*batch_n
            sum_sq_err += ((pred - y).float() ** 2).sum().item()
    return sum_loss/total, correct/total, float(np.sqrt(sum_sq_err/total))

In [23]:
batch_size = 128
vocab_size = len(words)  # number of vocabulary entries, including the "" and "UNK" slots
# Only the training loader is shuffled.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

### Transformer with pretrained Glove word embeddings

Download the pretrained weights from: https://nlp.stanford.edu/projects/glove/

In [24]:
def load_glove_vectors(glove_file="./glove.6B.50d.txt"):
    """Load the glove word vectors.

    Each non-blank line of the file is expected to hold a word followed by
    its vector components, separated by whitespace.

    Args:
        glove_file: Path to the GloVe text file.

    Returns:
        A dict mapping each word to a float32 NumPy vector.
    """
    word_vectors = {}
    # Read as UTF-8 explicitly; otherwise the platform default encoding is
    # used and non-ASCII words can fail to decode.
    with open(glove_file, encoding="utf-8") as f:
        for line in tqdm(f):
            split = line.split()
            if len(split) < 2:
                # Blank or vector-less line: nothing to store.
                continue
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]], dtype="float32")
    return word_vectors

In [25]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """Creates embedding matrix from word vectors.

    Row 0 is an all-zero padding vector and row 1 a random vector for unknown
    words. Every word in ``word_counts`` gets the next row, in iteration
    order: its vector from ``pretrained`` when available, otherwise a random
    vector drawn uniformly from [-0.25, 0.25).

    Args:
        pretrained: Mapping from word to a vector of length ``emb_size``.
        word_counts: Iterable, sized collection of vocabulary words.
        emb_size: Dimensionality of the embedding vectors.

    Returns:
        A tuple ``(W, vocab, vocab_to_idx)``: the float32 matrix of shape
        ``(len(word_counts) + 2, emb_size)``, a NumPy array of the words in
        row order, and a dict mapping each word to its row index.
    """
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    # Row 0 (padding) stays zero from the initialisation.
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words
    for i, word in enumerate(tqdm(word_counts), start=2):
        # Look words up in the `pretrained` argument, not in a global.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [26]:
# Embed_size is passed to get_emb_matrix as emb_size; it should match the
# vector width of the GloVe file (200 according to the file name -- confirm).
glove_path="./glove.6B.200d.txt"
Embed_size=200
word_vecs = load_glove_vectors(glove_path)
# NOTE: this rebinds vocab2index to the mapping returned by get_emb_matrix.
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts,Embed_size)

400000it [00:19, 20451.63it/s]
100%|██████████| 16191/16191 [00:00<00:00, 350213.89it/s]


In [27]:
class Txf_glove_vecs(torch.nn.Module) :
    """Transformer-encoder text classifier on frozen pretrained embeddings.

    Token indices are embedded, passed through a 2-layer Transformer encoder
    (index 0 is treated as padding and masked out as an attention key), then
    flattened and classified by a two-layer MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Embedding size; must be divisible by ``nhead``.
        glove_weights: Array of shape (vocab_size, embedding_dim) copied into
            the embedding table, which is then frozen.
        seq_len: Fixed length of the input sequences (default 70).
        num_classes: Number of output classes (default 5).
        nhead: Number of attention heads (default 5).
    """

    def __init__(self, vocab_size, embedding_dim, glove_weights, seq_len=70, num_classes=5, nhead=5) :
        super().__init__()
        self.seq_len = seq_len
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.as_tensor(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        # Kept as an attribute so the parameter names of the module stay the
        # same as before (the encoder below works on its own copies of it).
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.linear1 = nn.Linear(embedding_dim*seq_len, 70)
        self.relu=nn.ReLU()
        self.linear2=nn.Linear(70, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Long tensor of token indices with shape (batch, seq_len).
        """
        mask = torch.eq(x,0)
        # A row made only of padding would mask every attention key and turn
        # the softmax into NaN; leave the first position of such rows visible.
        all_pad = mask.all(dim=1)
        mask[:, 0] = mask[:, 0] & ~all_pad
        x = self.embeddings(x)
        # The encoder expects (seq, batch, dim).
        x=x.permute(1,0,2)
        x=self.transformer_encoder(x,src_key_padding_mask=mask)
        x=x.permute(1,0,2)
        # Concatenate all position outputs into one vector per sample.
        x=x.flatten(1)
        x=self.linear1(x)
        x=self.relu(x)
        x=self.linear2(x)
        return x

In [28]:
# Build the classifier on the pretrained embedding matrix and move it to the GPU.
model_glove = Txf_glove_vecs(vocab_size, Embed_size, pretrained_weights).cuda()

In [29]:
# NOTE(review): lr=0.1 is 100x train_model's default of 0.001 for its Adam
# optimizer, and the recorded run below was interrupted -- confirm this value is intended.
train_model(model_glove, epochs=7, lr=0.1)

 38%|███▊      | 133/352 [00:18<00:29,  7.30it/s]


KeyboardInterrupt: 

In [None]:
def test_model (model, test_dl):
    """Evaluate ``model`` on ``test_dl`` and print classification metrics.

    Prints accuracy, weighted recall, weighted precision, weighted F1 and the
    confusion matrix. Predictions and labels are flattened into plain lists
    of ints, so any batch size works (including a smaller last batch).

    Args:
        model: The network to evaluate; it is switched to eval mode.
        test_dl: Iterable of ``(x, y)`` batches.
    """
    model.eval()
    device = next(model.parameters()).device
    pred=[]
    y_true=[]
    with torch.no_grad():
        for x, y in tqdm(test_dl):
            y_hat = model(x.long().to(device))
            pred.extend(y_hat.argmax(dim=1).cpu().tolist())
            y_true.extend(torch.as_tensor(y).view(-1).tolist())
    test_acc=accuracy_score(y_true,pred)
    test_recall_score=recall_score(y_true,pred,average='weighted')
    test_precision_score=precision_score(y_true,pred,average='weighted')
    test_f1_score=f1_score(y_true,pred,average='weighted')
    test_confusion_matrix=confusion_matrix(y_true,pred)

    print(" Test accuracy is "+str(test_acc))
    print(" test_recall_score is "+str(test_recall_score))
    print(" test_precision_score is "+str(test_precision_score))
    print(" test_f1_score is "+str(test_f1_score))
    print(test_confusion_matrix)
    return

In [None]:
# Load the held-out test set and keep only the text and rating columns.
# .copy() makes the selection an independent frame, so adding or overwriting
# columns later does not trigger pandas' SettingWithCopyWarning.
test_reviews = pd.read_csv("gold_test.csv")
test_reviews = test_reviews[['reviews', 'ratings']].copy()

In [None]:
#test_batch=len(test_reviews)
test_batch = 1

# Shift the ratings to zero-based labels and encode every test review.
test_reviews['ratings'] = test_reviews['ratings'].apply(lambda rating: zero_numbering[rating])
test_reviews['encoded'] = test_reviews['reviews'].progress_apply(
    lambda review_text: np.array(encode_sentence(review_text, vocab2index))
)

# Wrap the encoded reviews in a dataset/loader and report the test metrics.
X_test, y_test = list(test_reviews['encoded']), list(test_reviews['ratings'])
test_ds = ReviewsDataset(X_test, y_test)
test_dl = DataLoader(test_ds, batch_size=test_batch, shuffle=False)
test_model(model_glove, test_dl)

In [None]:
# NOTE: despite the file name, this saves the entire model object rather than only a state_dict.
torch.save(model_glove, "./weights_only.pth")

In [None]:
# map_location="cpu" lets the checkpoint load on machines without a GPU.
# NOTE(review): the file holds a whole pickled model; recent PyTorch releases
# may need weights_only=False for this call -- confirm against the installed
# version, and only load files from a trusted source.
model_test = torch.load("./weights_only.pth", map_location="cpu")
# Inference only from here on: make sure dropout is disabled.
model_test.eval()
print(model_test)

In [None]:

test_example_inp=["This product is not good","this is stupid"]
test_out=[]
# Move the model to the CPU once and disable dropout for inference.
model_test = model_test.cpu().eval()
with torch.no_grad():
    for text in test_example_inp:
        test_example = torch.as_tensor(encode_sentence(text, vocab2index), dtype=torch.long).view(1, -1)
        test_prob = nn.functional.softmax(model_test(test_example), dim=-1)
        print(test_prob)
        # +1 maps the zero-based class index back to a 1-5 rating.
        pred = torch.max(test_prob, 1)[1] + 1
        test_out.append(pred.item())
print(test_out)
    

  
# test_example = (torch.Tensor(encode_sentence(test_example,vocab2index )).long()).view(len(test_example),-1)


# LIME integration

In [None]:
def prob(test_example_inp):
    """Return class probabilities for a list of texts.

    Each text is encoded with ``encode_sentence`` and the module-level
    ``vocab2index``; the module-level ``model_test`` is run on the CPU in
    eval mode. Texts are processed in batches rather than one forward pass
    per text, which matters when LIME sends many perturbed samples at once.

    Args:
        test_example_inp: Sequence of strings.

    Returns:
        A NumPy array of shape (len(test_example_inp), n_classes) with the
        softmax probabilities; an empty array when no texts are given.
    """
    texts = list(test_example_inp)
    if not texts:
        return np.array([])

    chunk_size = 256  # upper bound on texts per forward pass, to limit memory
    encoded = np.stack([encode_sentence(text, vocab2index) for text in texts])
    inputs = torch.as_tensor(encoded, dtype=torch.long)

    model = model_test.cpu()
    model.eval()  # dropout must be off for stable probabilities
    outputs = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            logits = model(inputs[start:start + chunk_size])
            outputs.append(nn.functional.softmax(logits, dim=-1))
    return torch.cat(outputs).numpy()

Result=prob(["this is good","this is bad"])
print(Result)

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

class_names = [0, 1, 2, 3, 4]
explainer = LimeTextExplainer(class_names=class_names)

# Explain one explicitly chosen label using the three most influential words.
target_label = 4
exp = explainer.explain_instance(
    text_instance='he was bad',
    classifier_fn=prob,
    num_features=3,
    labels=[target_label],
)
print ('Explanation for class %s' % class_names[target_label])
print ('\n'.join(str(item) for item in exp.as_list(label=target_label)))

# This will explain for top 2 labels
exp = explainer.explain_instance(
    text_instance='he was worst',
    classifier_fn=prob,
    num_features=3,
    top_labels=2,
)
print(exp.available_labels())
exp.show_in_notebook(text=False)