# Sentiment Analysis

Import Necessary Libraries

In [1]:
import pandas as pd
import spacy
import xml.etree.ElementTree as ET
import csv
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import load_model
from keras.models import Sequential
from keras.initializers import RandomUniform, glorot_uniform
from keras.layers import Dense, Activation, Bidirectional, LSTM, Dropout, Embedding
from keras.preprocessing.text import Tokenizer
nlp = spacy.load('en')

Using TensorFlow backend.


Parse .xml Data

In [None]:
# parse .xml data to a .csv format

path_train = r"ABSA16_Laptops_Train_English_SB2.xml"
path_test = r"EN_LAPT_SB2_TEST.xml"

def get_list(path):
    tree = ET.parse(path)
    root = tree.getroot()
    text_list = []
    opinion_list = []
    for review in root.findall("Review"):
        text_string = ""
        opinion_inner_list = []
        for sent in review.findall("./sentences/sentence"):
            text_string=text_string + " " + sent.find("text").text
        text_list.append(text_string)
        for opinion in review.findall("./Opinions/Opinion"):
            opinion_dict = {
                    opinion.get("category").replace("#", "_"): opinion.get("polarity")
            }
            opinion_inner_list.append(opinion_dict)
        opinion_list.append(opinion_inner_list)
    return text_list, opinion_list

train_text_list, train_opinion_list = get_list(path_train)
test_text_list, test_opinion_list = get_list(path_test)

# start fileout
csvfile = 'ABSA16_Laptops_TrainTest_English_SB2.csv'
fileout = open(csvfile,'wt')
csvwrite = csv.writer(fileout)
csvwrite.writerow(('Id', 'Text', 'Sentiment', 'Aspect'))

def parse_lists(text_list, op_list, txtadjust, aspcount):
    txtcount = 0
    for text in text_list:
        opinions = op_list[txtcount]
        printcount = txtcount + txtadjust  # for the test set
        txtcount += 1
        for op in opinions:
            for aspect in op:
                senti = op[aspect]
                csvwrite.writerow((printcount, text, senti, aspect))
                aspcount += 1
    return(printcount, aspcount)

(traintotaltxt, traintotalasp) = parse_lists(train_text_list, train_opinion_list, 0, 0)
(testtotaltxt, testtotalasp) = parse_lists(test_text_list, test_opinion_list, traintotaltxt, traintotalasp)

print('%d texts processed including %d aspects; saved to %s' % (testtotaltxt, testtotalasp, csvfile))
fileout.close()

xtree = et.parse("ABSA16_Laptops_Train_English_SB2.xml")
xroot = xtree.getroot()

Read data, adjust column names, clean if necessary

In [2]:
trainData = pd.read_csv('dataset.csv')
trainData.columns = ['Id', 'Text', 'Sentiment', 'Aspect']

In [4]:
# function to handle similar aspects (e.g.: LAPTOP_GENERAL, LAPTOP_USABILITY)
def betterLabels(entry):
    if entry.startswith('COMPANY'):
        return 'General'
    elif entry.startswith('LAPTOP'):
        return 'General'
    elif entry.startswith('MOUSE'):
        return 'Mouse'
    elif entry.startswith('KEYBOARD'):
        return 'Keyboard'
    elif entry.startswith('OS'):
        return 'OS'
    elif entry.startswith('DISPLAY'):
        return 'Display'
    elif entry.startswith('SOFTWARE'):
        return 'Software'
    elif entry.startswith('BATTERY'):
        return 'Battery'
    elif entry.startswith('HARD'):
        return 'Hardware'
    elif entry.startswith('GRAPHICS'):
        return 'Graphics'
    elif entry.startswith('POWER_SUPPLY'):
        return 'Power_Supply'
    elif entry.startswith('PORTS'):
        return 'Ports'
    elif entry.startswith('CPU'):
        return 'CPU'
    elif entry.startswith('MULTIMEDIA'):
        return 'Multimedia'
    elif entry.endswith('PRICE'):
        return 'Price'
    elif entry.startswith('SUPPORT'):
        return 'Support'
    elif entry.startswith('MOTHERBOARD'):
        return 'Hardware'
    elif entry.startswith('OPTICAL'):
        return 'Hardware'
    elif entry.startswith('FANS'):
        return 'Hardware'
    elif entry.endswith('GENERAL'):
        return 'General'
    elif entry.endswith('QUALITY'):
        return 'General'
    return entry

trainData['Aspect'] = trainData['Aspect'].apply(betterLabels)

In [5]:
trainData.drop_duplicates(inplace=True)
trainData2 = trainData[trainData['Aspect'] != 'General']

Aspect Extraction

In [3]:
# collect all aspect terms per sentence together

trainData.Text = trainData.Text.str.lower()
aspect_terms = []
for review in nlp.pipe(trainData.Text):
    chunks = [(chunk.root.text) for chunk in review.noun_chunks if chunk.root.pos_ == 'NOUN']
    aspect_terms.append(' '.join(chunks))
trainData['aspect_terms'] = aspect_terms
trainData.head(10)

Unnamed: 0,Id,Text,Sentiment,Aspect,aspect_terms
0,0,most everything is fine with this machine: sp...,positive,LAPTOP_GENERAL,machine speed capacity thing resolution screen...
1,0,most everything is fine with this machine: sp...,positive,LAPTOP_OPERATION_PERFORMANCE,machine speed capacity thing resolution screen...
2,0,most everything is fine with this machine: sp...,positive,HARD_DISC_DESIGN_FEATURES,machine speed capacity thing resolution screen...
3,0,most everything is fine with this machine: sp...,positive,LAPTOP_QUALITY,machine speed capacity thing resolution screen...
4,0,most everything is fine with this machine: sp...,negative,DISPLAY_QUALITY,machine speed capacity thing resolution screen...
5,1,"i love the size, keyboard, the functions. i d...",positive,LAPTOP_DESIGN_FEATURES,size keyboard functions complaint quality pric...
6,1,"i love the size, keyboard, the functions. i d...",positive,KEYBOARD_GENERAL,size keyboard functions complaint quality pric...
7,1,"i love the size, keyboard, the functions. i d...",positive,LAPTOP_OPERATION_PERFORMANCE,size keyboard functions complaint quality pric...
8,1,"i love the size, keyboard, the functions. i d...",positive,LAPTOP_GENERAL,size keyboard functions complaint quality pric...
9,1,"i love the size, keyboard, the functions. i d...",positive,LAPTOP_USABILITY,size keyboard functions complaint quality pric...


In [151]:
# sentences to numpy arrays

vocab_size = 6000 # We set a maximum size for the vocabulary
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(trainData2.Text)
aspect_tokenized = pd.DataFrame(tokenizer.texts_to_matrix(trainData2.aspect_terms))

In [152]:
# aspects to numpy arrays

label_encoder = LabelEncoder()
integer_category = label_encoder.fit_transform(trainData2.Aspect)
dummy_category = to_categorical(integer_category)

In [154]:
# simple model for aspect detection

aspectSimple = Sequential()
aspectSimple.add(Dense(512, input_shape=(6000,), activation='relu'))
aspectSimple.add(Dense(256, activation='relu'))
aspectSimple.add(Dense(128, activation='relu'))
aspectSimple.add(Dense(128, activation='relu'))
aspectSimple.add(Dense(15, activation='softmax'))
aspectSimple.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [6]:
# complex model for aspect detection, not used for inference

aspect_categories_model = Sequential()

e_init = RandomUniform(-0.01, 0.01, seed=1)
init = glorot_uniform(seed=1)

aspect_categories_model.add(Embedding(input_dim=6000,
  output_dim=32, embeddings_initializer=e_init,
  mask_zero=True))
aspect_categories_model.add(LSTM(units=100, kernel_initializer=init,
  dropout=0.2, recurrent_dropout=0.2))  # 100 memory
aspect_categories_model.add(Dense(units=16, kernel_initializer=init,
  activation='sigmoid'))
aspect_categories_model.compile(loss='binary_crossentropy', optimizer='adam',
  metrics=['acc'])

In [155]:
aspectSimple.fit(aspect_tokenized, dummy_category, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x7f99b0372a10>

In [10]:
# aspect_categories_model.fit(aspect_tokenized, dummy_category, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f9aac18aad0>

Sentiment Extraction

In [12]:
# collect all sentiment terms per sentence together

sentiment_terms = []
for review in nlp.pipe(trainData['Text']):
        if review.is_parsed:
            sentiment_terms.append(' '.join([token.lemma_ for token in review if (not token.is_stop and not token.is_punct and (token.pos_ == "ADJ" or token.pos_ == "VERB"))]))
        else:
            sentiment_terms.append('')  
trainData['sentiment_terms'] = sentiment_terms
trainData.head(10)

Unnamed: 0,Id,Text,Sentiment,Aspect,aspect_terms,sentiment_terms
0,0,most everything is fine with this machine: sp...,positive,General,machine speed capacity thing resolution screen...,fine understand high high available
2,0,most everything is fine with this machine: sp...,positive,Hardware,machine speed capacity thing resolution screen...,fine understand high high available
4,0,most everything is fine with this machine: sp...,negative,Display,machine speed capacity thing resolution screen...,fine understand high high available
5,1,"i love the size, keyboard, the functions. i d...",positive,General,size keyboard functions complaint quality pric...,love easy use good good recommend
6,1,"i love the size, keyboard, the functions. i d...",positive,Keyboard,size keyboard functions complaint quality pric...,love easy use good good recommend
12,2,i love this product because it is toshiba and...,positive,General,product camera connect downside product,love buy install easy use compatible recommend
15,2,i love this product because it is toshiba and...,negative,General,product camera connect downside product,love buy install easy use compatible recommend
16,3,i bought this laptop was the worst laptop i'v...,negative,General,laptop laptop alot money product nightmare com...,buy bad buy spend have deal bad send fix perfe...
18,3,i bought this laptop was the worst laptop i'v...,negative,Support,laptop laptop alot money product nightmare com...,buy bad buy spend have deal bad send fix perfe...
20,4,"so far, a great product. high price tag, howe...",positive,General,product tag,great high try learn use


In [14]:
sentiment_tokenized = pd.DataFrame(tokenizer.texts_to_matrix(trainData.sentiment_terms))

label_encoder_2 = LabelEncoder()
integer_sentiment = label_encoder_2.fit_transform(trainData.Sentiment)
dummy_sentiment = to_categorical(integer_sentiment)

In [40]:
# # complex model for sentiment detection, not used for inference

sentimentComplex = Sequential()
e_init = RandomUniform(-0.01, 0.01, seed=1)
init = glorot_uniform(seed=1)

sentimentComplex.add(Embedding(input_dim=6000,
  output_dim=64, embeddings_initializer=e_init,
  mask_zero=True))
sentimentComplex.add(LSTM(units=100, kernel_initializer=init,
  dropout=0.2, recurrent_dropout=0.2))  # 100 memory
sentimentComplex.add(Dense(units=4, kernel_initializer=init,
  activation='sigmoid'))
sentimentComplex.compile(loss='binary_crossentropy', optimizer='adam',
  metrics=['acc'])

In [87]:
# simple model for sentiment detection

sentiment_model = Sequential()
sentiment_model.add(Dense(256, input_shape=(6000,), activation='relu'))
sentiment_model.add(Dense(128, activation='relu'))
sentiment_model.add(Dense(128, activation='relu'))
sentiment_model.add(Dense(4, activation='softmax'))

sentiment_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [88]:
sentiment_model.fit(sentiment_tokenized, dummy_sentiment, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f99d46e74d0>

Inference

In [180]:
test_sentences = [
    'worst mouse ever',
    'hardware is fantastic',
    'display is a disappointment',
    'battery is crap',
    'display is lovely'    
]

for sentence in test_sentences:
    
    # aspect detection
    
    chunks = [(chunk.root.text) for chunk in nlp(sentence).noun_chunks if chunk.root.pos_ == 'NOUN']
    new_review_aspect_terms = ' '.join(chunks)
    new_review_aspect_tokenized = tokenizer.texts_to_matrix([new_review_aspect_terms])

    aspect = label_encoder.inverse_transform(aspectSimple.predict_classes(new_review_aspect_tokenized))
    
    # sentiment detection
    
    chunks = [(chunk.root.text) for chunk in nlp(sentence).noun_chunks if chunk.root.pos_ == 'NOUN']
    new_review_aspect_terms = ' '.join(chunks)
    new_review_aspect_tokenized = tokenizer.texts_to_matrix([new_review_aspect_terms])
    sentiment = label_encoder_2.inverse_transform(sentiment_model.predict_classes(new_review_aspect_tokenized))
    print('Sentence #' + str(test_sentences.index(sentence) + 1) + " expresses a " + sentiment[0] + ' opinion about the ' + aspect[0])

Sentence #1 expresses a negative opinion about the Mouse
Sentence #2 expresses a positive opinion about the Hardware
Sentence #3 expresses a negative opinion about the Display
Sentence #4 expresses a negative opinion about the Battery
Sentence #5 expresses a positive opinion about the Display
