# Sentiment Analysis of Restaurant Review


## Imports

In [1]:
import math
import numpy as np
import pandas as pd
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
np.random.seed(1701)
torch.manual_seed(1701)
torch.cuda.manual_seed_all(1701)     

## Data processing

In [3]:
def chinese_word_cut(mytext):
    return " ".join(jieba.cut(mytext))

def make_label(star):
    if star > 3:
        return 1
    else:
        return 0
    
def get_custom_stopwords(stop_words_file):
    with open(stop_words_file) as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

In [4]:
class ReviewDataset(Dataset):
    def __init__(self, data):
        self.n_samples, self.n_features = data.shape
        # The first column is label, the rest are the features
        self.n_features -= 1 
        self.feature = torch.from_numpy(data[:, :-1].astype(np.float32)).cuda() # size [n_samples, n_features]
        self.label = torch.from_numpy(data[:, [-1]].astype(np.float32)).cuda() # size [n_samples, 1]        
    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        return self.feature[index], self.label[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples

### Read data and add columns

In [5]:
data = pd.read_csv('data0.csv', nrows=3000, usecols=['star','comment'])
data['sentiment'] = data.star.apply(make_label)
data['cut_comment'] = data.comment.apply(chinese_word_cut)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Alex\AppData\Local\Temp\jieba.cache
Loading model cost 0.542 seconds.
Prefix dict has been built successfully.


### Vectorization

In [14]:
stop_words_file = '哈工大停用词表.txt'
stopwords = get_custom_stopwords(stop_words_file)

vect = CountVectorizer(max_df = 1.0, 
                       min_df = 1, 
                       max_features = None,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b', 
                       stop_words=frozenset(stopwords))

vect1 = TfidfVectorizer(#max_features = 5000,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b', 
                       stop_words=frozenset(stopwords))
X = pd.DataFrame.sparse.from_spmatrix(vect.fit_transform(data['cut_comment']), columns=vect.get_feature_names())
#%time X = pd.DataFrame.sparse.from_spmatrix(vect1.fit_transform(data['cut_comment']), columns=vect1.get_feature_names())
Xy = X.assign(label_y=data.sentiment)
train, test = train_test_split(Xy.values, test_size=0.1)
train, test = ReviewDataset(train), ReviewDataset(test)
dataset = train



# Arguments

In [15]:
batch = 10
train_loader = DataLoader(dataset=dataset,
                            batch_size=batch,
                            shuffle=True,
                            num_workers=0) # This gave me errors if num-workers is not 0. I don't have time to learn why.
#dataiter = iter(train_loader)

num_epochs = 3
learning_rate = 0.1
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/float(batch))

## Model

## Logreg

In [16]:
class LogReg(nn.Module):
    def __init__(self, n_input_features):
        super(LogReg, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)

    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x)) # Label has only two categories, so sigmoid and softmax should be essentially the same.
        return y_pred

def evaluate(data, model):
    with torch.no_grad():
        y_predicted = model(data.feature)
        y_predicted_cls = y_predicted.round()
        acc = y_predicted_cls.eq(data.label).sum() / float(data.label.shape[0])
        return acc

def weight_reset(m):
    # I grab this func from website. It is called to compare optimizers.
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        m.reset_parameters()    

## Initialization

In [17]:
model_logreg = LogReg(train.n_features)
model_logreg.apply(weight_reset)
if torch.cuda.is_available():
    device = torch.device("cuda")
    model_logreg.to(device)
#optimizer=torch.optim.SGD(model_logreg.parameters(), lr=learning_rate)
#optimizer=torch.optim.Adam(model_logreg.parameters(), lr=learning_rate)
criterion = nn.BCELoss()

## Train

In [18]:
def Train(model, data = train_loader, num_epochs = 3, evaluate_step = 50, printacc = True):
    pcount = 0
    performance = np.zeros([n_iterations // evaluate_step * num_epochs,1])
    optimizer=torch.optim.SGD(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (inputs, labels) in enumerate(data):
            # Run your training process
            y_pred = model(inputs)
            loss = criterion(y_pred, labels)
            # Backward pass and update
            loss.backward()
            optimizer.step()
            # zero grad before new step
            optimizer.zero_grad()
            if printacc == True:
                if (i+1) % evaluate_step == 0:
                    acc_train = evaluate(train,model)
                    acc_test = evaluate(test,model)
                    performance[pcount]=acc_test.item() # Save performance for plots
                    pcount +=1
                    print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations}, loss = {loss.item():.4f}, acc = {acc_train.item():.4f}, acc_test = {acc_test.item():.4f}')
    return performance


In [19]:
%time performance_tf = Train(model_logreg, num_epochs = 3, evaluate_step = 50)

Epoch: 1/3, Step 50/270, loss = 0.5761, acc = 0.6715, acc_test = 0.7000
Epoch: 1/3, Step 100/270, loss = 0.5744, acc = 0.6785, acc_test = 0.6800
Epoch: 1/3, Step 150/270, loss = 0.4925, acc = 0.7030, acc_test = 0.6933
Epoch: 1/3, Step 200/270, loss = 0.5320, acc = 0.7181, acc_test = 0.7033
Epoch: 1/3, Step 250/270, loss = 0.7662, acc = 0.7670, acc_test = 0.7400
Epoch: 2/3, Step 50/270, loss = 0.6236, acc = 0.7844, acc_test = 0.7467
Epoch: 2/3, Step 100/270, loss = 0.6295, acc = 0.7659, acc_test = 0.7367
Epoch: 2/3, Step 150/270, loss = 0.5185, acc = 0.7807, acc_test = 0.7467
Epoch: 2/3, Step 200/270, loss = 0.4782, acc = 0.7904, acc_test = 0.7600
Epoch: 2/3, Step 250/270, loss = 0.6000, acc = 0.7944, acc_test = 0.7433
Epoch: 3/3, Step 50/270, loss = 0.5562, acc = 0.7900, acc_test = 0.7533
Epoch: 3/3, Step 100/270, loss = 0.5017, acc = 0.7996, acc_test = 0.7533
Epoch: 3/3, Step 150/270, loss = 0.4592, acc = 0.8167, acc_test = 0.7600
Epoch: 3/3, Step 200/270, loss = 0.5777, acc = 0.8022,

## Test

In [20]:
acc_test = evaluate(test,model_logreg)
print(f'Accuracy: {acc_test.item():.4f}')

Accuracy: 0.7567


In [21]:
observed = np.array(test.label.cpu(),dtype=object)
expected = model_logreg(test.feature).cpu().detach().numpy().round()

In [22]:
tp,fp,tn,fn=0,0,0,0
o = observed
e = expected
for i in range(len(test)):
    if o[i] == 1 and e[i] == 1:
        tp +=1
    if o[i] == 1 and e[i] == 0:
        fp +=1
    if o[i] == 0 and e[i] == 0:
        tn +=1
    if o[i] == 0 and e[i] == 1:
        fn +=1

In [23]:
accuracy = (tp + tn)/(tp+fp+tn+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2*precision*recall/(precision+recall)
accuracy,precision,recall,f1

(0.7566666666666667,
 0.8412698412698413,
 0.7871287128712872,
 0.8132992327365729)

In [24]:
tp,fp,tn,fn

(159, 30, 68, 43)

## Interpretability

In [None]:
def interpret(model, number):
    name1 = []
    param1= []
    i = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            name1.append(name)
            param1.append(param.data) 
    weights = param1[0].cpu()      
    weights1, indices = torch.sort(weights, descending= True)
    indices = indices.numpy().tolist()
    # Top 20 words
    print("Influential words in Positive Reviews:")
    print("--------------------------------------")
    for i in range(number):
        #print(indices[0][i])
        print(X.columns[indices[0][i]])

    print("====\n\n\n")
    # Top 20 negative words
    print("Influential words in Negative Reviews:")
    print("--------------------------------------")
    indices[0].reverse()   
    for i in range(number):
        #print(indices[0][i])
        print(X.columns[indices[0][i]])

In [None]:
interpret(model_logreg,7)

In [None]:
def find_comment (target, label):
    for i in range(len(data)):
        if target in data.cut_comment[i] and data.sentiment[i] == label:
            print (data.comment[i])

In [None]:
find_comment(target = "西湖", label = 1)

## Snownlp

In [29]:
from snownlp import SnowNLP
def snow_result(comemnt):
    s = SnowNLP(comemnt)
    if s.sentiments >= 0.4:
        return 1
    else:
        return 0

In [30]:
%%time
data['snlp_result'] = data.comment.apply(snow_result)

counts = 0
for i in range(len(data)):
    if data.snlp_result[i] == data.sentiment[i]:
        counts+=1

print(counts/len(data))

0.6696666666666666
Wall time: 1min


## Naive Bayes

In [31]:
%%time
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train.feature.cpu(), train.label.cpu())
print(f'Train accuracy: {nb.score(train.feature.cpu(), train.label.cpu())}')
print(f'Test accuracy: {nb.score(test.feature.cpu(), test.label.cpu())}')

  return f(**kwargs)


Train accuracy: 0.8862962962962962
Test accuracy: 0.7333333333333333
Wall time: 625 ms


# MLP

In [32]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_in):
        intermediate_vector = F.relu(self.fc1(x_in))
        prediction_vector = self.fc2(intermediate_vector)
        prediction_vector = torch.sigmoid(prediction_vector)
        
        return prediction_vector

In [33]:
model_MLP = MLP(train.n_features, 100, 1)
model_MLP.to(device)

MLP(
  (fc1): Linear(in_features=17289, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)

In [34]:
model_MLP.apply(weight_reset)

MLP(
  (fc1): Linear(in_features=17289, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)

In [35]:
%time performance_mlp = Train(model_MLP)

Epoch: 1/3, Step 50/270, loss = 0.7647, acc = 0.6063, acc_test = 0.6300
Epoch: 1/3, Step 100/270, loss = 0.6346, acc = 0.6193, acc_test = 0.6267
Epoch: 1/3, Step 150/270, loss = 0.5250, acc = 0.6289, acc_test = 0.6367
Epoch: 1/3, Step 200/270, loss = 0.5258, acc = 0.6907, acc_test = 0.6767
Epoch: 1/3, Step 250/270, loss = 0.5775, acc = 0.7519, acc_test = 0.7033
Epoch: 2/3, Step 50/270, loss = 0.6190, acc = 0.7752, acc_test = 0.7467
Epoch: 2/3, Step 100/270, loss = 0.4659, acc = 0.7944, acc_test = 0.7367
Epoch: 2/3, Step 150/270, loss = 0.5761, acc = 0.8015, acc_test = 0.7700
Epoch: 2/3, Step 200/270, loss = 0.3583, acc = 0.8104, acc_test = 0.7700
Epoch: 2/3, Step 250/270, loss = 0.8295, acc = 0.8278, acc_test = 0.7600
Epoch: 3/3, Step 50/270, loss = 0.5369, acc = 0.8459, acc_test = 0.7567
Epoch: 3/3, Step 100/270, loss = 0.6189, acc = 0.8456, acc_test = 0.7667
Epoch: 3/3, Step 150/270, loss = 0.5653, acc = 0.8659, acc_test = 0.7433
Epoch: 3/3, Step 200/270, loss = 0.4234, acc = 0.8826,

In [36]:
acc_test = evaluate(test,model_MLP)
print(f'Accuracy: {acc_test.item():.4f}')

Accuracy: 0.7533


# Optimizer

In [None]:
import matplotlib.pyplot as plt
performance1 = np.zeros(((1+n_iterations//batch)*3,9))
logreg = LogReg(train.n_features)
logreg.to(device)
optimizer = []
optimizer.append(torch.optim.SGD(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.ASGD(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.Adadelta(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.Adagrad(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.Adam(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.AdamW(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.Adamax(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.RMSprop(logreg.parameters(), lr=learning_rate))
optimizer.append(torch.optim.Rprop(logreg.parameters(), lr=learning_rate))

In [None]:
def training(ii):
    logreg.apply(weight_reset)
    acc_test = evaluate(test,logreg)
    print(f'Optimizer: {type(optimizer1)}, Accuracy: {acc_test.item():.4f}')
    num_epochs = 3
    pcount=0
    for epoch in range(num_epochs):
        for i, (inputs, labels) in enumerate(train_loader):
            # Run your training process
            y_pred = logreg(inputs)
            loss = criterion(y_pred, labels)
            # Backward pass and update
            loss.backward()
            optimizer1.step()
            # zero grad before new step
            optimizer1.zero_grad()
            if (i) % 50 == 0:
                acc_train = evaluate(train,logreg)
                acc_test = evaluate(test,logreg)
                performance1[pcount,ii]=acc_test.item() # Save performance for plots
                pcount +=1
                print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations}, loss = {loss.item():.4f}, acc = {acc_train.item():.4f}, acc_test = {acc_test.item():.4f}')
    #cc_test = evaluate(test,logreg)
    #rint(f'Optimizer: {type(optimizer1)}, Accuracy: {acc_test.item():.4f}')

In [None]:
for i in range(9):
    optimizer1 = optimizer[i]
    %time training(i)

In [None]:
performance = performance1[:45]

In [None]:
fig2 = plt.figure()
ax = fig2.add_subplot(1,1,1)
ax.plot(performance[:,0],label = 'SGD')
ax.plot(performance[:,1],label = 'ASGD')
ax.plot(performance[:,2],label = 'Adadelta')
ax.plot(performance[:,3],label = 'Adagrad')
ax.plot(performance[:,4],label = 'Adam')
ax.plot(performance[:,5],label = 'AdamW')
ax.plot(performance[:,6],label = 'Adamax')
ax.plot(performance[:,7],label = 'RMSprop')
ax.plot(performance[:,8],label = 'Rprop')
ax.legend()
#ax.set_xticks([0,2,4,6,8,10,12,14,16])
#ax.set_xticklabels(['0','100','200','300','400','500','600','700','800'])
ax.set_xlabel('Batch')
ax.set_ylabel('Test accuracy')

# Demo in presentation

In [None]:
vocab = X.columns.tolist()

In [None]:
features = np.zeros(len(vocab))
for word in jieba.cut(input("Enter a review: ")):
    if word in vocab:
        features[vocab.index(word)] += 1
featuretorch = torch.from_numpy(features.astype(np.float32)).cuda()
if logreg(featuretorch).round() == 0:
    print('negative')
else:
    print('positive')

In [None]:
for word in jieba.cut(input("Enter a sentence: ")):
    print(word)

In [None]:
print(" ".join(jieba.cut(input("Enter a sentence: "))))

In [37]:
data.shape

(3000, 5)

In [39]:
Xy.shape

(3000, 17290)