## Import necessary libraries

In [33]:
import os
import re
import pandas as pd
import nltk
import itertools
import string
import xml.etree.ElementTree as et
import numpy as np


#use current directory as location for all files

os.chdir(r"./")

## Read dataset

In [21]:
#SemEval dataset
filename = "Restaurants_Train.xlsx"
data = pd.read_excel(filename)

In [22]:
data.head(5)

Unnamed: 0,sentences,sentence,id,text,aspectTerms,aspectTerm,term,polarity,from,to,aspectCategories,aspectCategory,category,polarity.1
0,,,3121.0,But the staff was so horrible to us.,,,staff,negative,8.0,13.0,,,service,negative
1,,,2777.0,"To be completely fair, the only redeeming fact...",,,food,positive,57.0,61.0,,,food,positive
2,,,,,,,,,,,,,anecdotes/miscellaneous,negative
3,,,1634.0,"The food is uniformly exceptional, with a very...",,,food,positive,4.0,8.0,,,food,positive
4,,,,,,,kitchen,positive,55.0,62.0,,,,


In [23]:
raw_data.tail(5)

Unnamed: 0,sentences,sentence,id,text,aspectTerms,aspectTerm,term,polarity,from,to,aspectCategories,aspectCategory,category,polarity.1
4908,,,,,,,meats,neutral,99.0,104.0,,,,
4909,,,,,,,vegetables,neutral,114.0,124.0,,,,
4910,,,,,,,rice,neutral,130.0,134.0,,,,
4911,,,,,,,glass noodles,neutral,139.0,152.0,,,,
4912,,,617.0,I am going to the mid town location next.,,,,,,,,,anecdotes/miscellaneous,neutral


## Preliminary analysis

In [24]:
keys = pd.DataFrame(data.keys())
keys.index += 1
keys = keys.rename(columns={0:"Keys"})
keys

Unnamed: 0,Keys
1,sentences
2,sentence
3,id
4,text
5,aspectTerms
6,aspectTerm
7,term
8,polarity
9,from
10,to


In [25]:
reviews = data["text"].dropna()
print("Number of Reviews:", len(reviews))

Number of Reviews: 3044


In [26]:
ex_asp = pd.unique(data["category"].dropna())
print("Explicit Aspects:", ", ".join(ex_asp))

Explicit Aspects: service, food, anecdotes/miscellaneous, price, ambience


In [29]:
ex_asp_freq_table = data.groupby("category").size().reset_index()
ex_asp_freq_table = ex_asp_freq_table.rename(columns={"category":"Ex.Asp", 0:"Num"})
ex_asp_freq_table.index +=1
ex_asp_freq_table

Unnamed: 0,Ex.Asp,Num
1,ambience,432
2,anecdotes/miscellaneous,1133
3,food,1233
4,price,319
5,service,597


In [30]:
asp_polarity_freq_table = data.groupby(["category", "polarity.1"]).size().reset_index()
asp_polarity_freq_table = asp_polarity_freq_table.rename(columns={"category":"Ex.Asp", "polarity.1":"Polarity", 0:"Num"})
asp_polarity_freq_table.index +=1
asp_polarity_freq_table

Unnamed: 0,Ex.Asp,Polarity,Num
1,ambience,conflict,47
2,ambience,negative,98
3,ambience,neutral,24
4,ambience,positive,263
5,anecdotes/miscellaneous,conflict,30
6,anecdotes/miscellaneous,negative,199
7,anecdotes/miscellaneous,neutral,357
8,anecdotes/miscellaneous,positive,547
9,food,conflict,67
10,food,negative,209


## Data Preprocessing

In [36]:
def _removePunc(item): #replace with space
    return re.sub(r'[^\w\s]',' ',item)

In [37]:
filename = "Restaurants_Train.xml"

table = []
row = [np.NaN] * 7

for event, node in et.iterparse(filename, events=('start', 'end')):
    
    if node.tag == "text":
        row[0] = node.text
    elif node.tag == "aspectTerms" and event == "start":
        row[1] = []
        row[2] = []
        row[3] = []
        row[4] = []
    elif node.tag == "aspectTerm" and event == "start":
        row[1].append(_removePunc(node.attrib.get("term")))
        row[2].append(node.attrib.get("polarity"))
        row[3].append(int(node.attrib.get("from")))
        row[4].append(int(node.attrib.get("to")))
    elif node.tag == "aspectCategories" and event == "start":
        row[5] = []
        row[6] = []
    elif node.tag == "aspectCategory" and event == "start":
        row[5].append(node.attrib.get("category"))
        row[6].append(node.attrib.get("polarity"))
    elif node.tag == "aspectCategories" and event == "end":
        table.append(row)
        row = [np.NaN] * 7

dfcols = ['review', 'term', 'termPolarity', 'startIndex', 'endIndex','aspect', 'aspectPolarity']
clean_data = pd.DataFrame(table, columns=dfcols)

In [38]:
clean_data.head(5)

Unnamed: 0,review,term,termPolarity,startIndex,endIndex,aspect,aspectPolarity
0,But the staff was so horrible to us.,[staff],[negative],[8],[13],[service],[negative]
1,"To be completely fair, the only redeeming fact...",[food],[positive],[57],[61],"[food, anecdotes/miscellaneous]","[positive, negative]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[4, 55, 141]","[8, 62, 145]",[food],[positive]
3,Where Gabriela personaly greets you and recomm...,,,,,[service],[positive]
4,"For those that go once and don't enjoy it, all...",,,,,[anecdotes/miscellaneous],[positive]


In [39]:
clean_data.tail(5)

Unnamed: 0,review,term,termPolarity,startIndex,endIndex,aspect,aspectPolarity
3039,But that is highly forgivable.,,,,,[anecdotes/miscellaneous],[positive]
3040,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[9, 32, 61, 103]","[19, 39, 66, 107]",[food],[positive]
3041,"When we arrived at 6:00 PM, the restaurant was...",,,,,[anecdotes/miscellaneous],[neutral]
3042,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[5, 17, 99, 114, 130, 139]","[10, 37, 104, 124, 134, 152]",[food],[neutral]
3043,I am going to the mid town location next.,,,,,[anecdotes/miscellaneous],[neutral]


#### Encode labels into a categorical vector
Create a function to find the word index after splitting the review into list

In [40]:
# index a term in the string list (with customization)
def _index(str, term):
    for i in range(len(str)):
        if term in str[i]: # More lenient as not need for exactly the same word
            return i
    raise("Word not found")

In [41]:
def _findWordIndex(s, terms):
    
    # Terminate if no idx given
    if isinstance(terms, float):
        return [-1]

    # Temp var
    new_str = _removePunc(s).split()
    word_idx = []
    
    for term in terms:
        for subterm in term.split():
            word_idx.append(_index(new_str, subterm))
            
    return word_idx

Make a new column for categorical terms

In [42]:
# Make new column for categorical terms
categoricalTerms = []
terms = clean_data["term"]
reviews = clean_data["review"]

for i in range(clean_data.shape[0]):
    categoricalTerms.append(_findWordIndex(reviews[i], terms[i]))

clean_data = clean_data.assign(categoricalTerms=pd.Series(categoricalTerms))

In [43]:
clean_data.head(5)

Unnamed: 0,review,term,termPolarity,startIndex,endIndex,aspect,aspectPolarity,categoricalTerms
0,But the staff was so horrible to us.,[staff],[negative],[8],[13],[service],[negative],[2]
1,"To be completely fair, the only redeeming fact...",[food],[positive],[57],[61],"[food, anecdotes/miscellaneous]","[positive, negative]",[10]
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[4, 55, 141]","[8, 62, 145]",[food],[positive],"[1, 9, 25]"
3,Where Gabriela personaly greets you and recomm...,,,,,[service],[positive],[-1]
4,"For those that go once and don't enjoy it, all...",,,,,[anecdotes/miscellaneous],[positive],[-1]


#### Testing the correctness of _findWordIndex function
Activate the block by turning it to "Code" if needed

# -----------
# Testing 123
# -----------
#### Loading necessary libraries

In [44]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D
from keras.layers.core import Reshape, Flatten, Activation
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


#### Preprocess the data

#### Split data into training, testing, validation sets

In [45]:
reviews = clean_data['review']
labels = clean_data['categoricalTerms']
### rmb to remove conflict
x_train, x_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.4, random_state=999)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size = 0.5, random_state=999)

#### Tokenize text of the training data with keras preprocessing text function

In [46]:
# Set maximum number of words to be embedded
NUM_WORDS = 100000

# Set maximum length of a sentence
MAX_LEN = 65

# Define/Load Tokenize text function
tokenizer = Tokenizer(num_words=NUM_WORDS, filters=string.punctuation)

# Fit the function on the text
tokenizer.fit_on_texts(x_train)

# Count number of unique tokens
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 3417 unique tokens.


In [47]:
# Convert train and val to sequence
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_valid= tokenizer.texts_to_sequences(x_val)

In [48]:
# Limit size of train/val to 65 and pad the sequence
x_train = pad_sequences(sequences_train,maxlen=MAX_LEN)
x_val = pad_sequences(sequences_valid,maxlen=x_train.shape[1])

# Use multilabelbinarizer to encode categorical terms into one-hot vector
mlb = MultiLabelBinarizer(classes=[i for i in range(65)])
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

# Printing shape
print('Shape of X train and X validation tensor:', x_train.shape,x_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

KeyError: -1

## Word Embedding
#### Using pretrained Word2Vec model from Google or Amazon
#### Create 300-dimensional vectors

In [None]:
#word_vectors = KeyedVectors.load_word2vec_format('./AmazonWE/sentic2vec.csv')
word_vectors = pd.read_csv('./AmazonWE/sentic2vec.csv', encoding = "ISO-8859-1", header=None)

In [None]:
word_vectors.head(5)

In [None]:
EMBEDDING_DIM=300
vocabulary_size=min(len(word_index)+1,(NUM_WORDS))

window_size = 5

embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM*window_size))

for word, i in word_index.items():
    if i < vocabulary_size:
        embedding_vector = word_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
#         else:
#             vec = np.zeros(EMBEDDING_DIM)
#             embedding_matrix[i] = vec

In [None]:
del(word_vectors)

In [None]:
# Define embedding function using the embedding_matrix
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True,
                            input_length=MAX_LEN)

In [None]:
del(embedding_matrix)

#### Build CNN with EarlyStopping
What is unknown so far?
- the structure of fully-connected layer
- batch size

In [None]:
sequence_length = x_train.shape[1]
filter_sizes = [2,3]
num_filters = [100, 50]

# Create model
model = Sequential()

# Add layers
model.add(embedding_layer)
model.add(Reshape((sequence_length,EMBEDDING_DIM,1)))
model.add(Conv2D(num_filters[0],(filter_sizes[0], filter_sizes[0]), activation='tanh'))
model.add(MaxPooling2D((2,2)))
model.add(Conv2D(num_filters[1],(filter_sizes[1], filter_sizes[1]), activation='tanh'))
model.add(MaxPooling2D((2,2)))
model.add(Flatten())
# model.add(Dense(100, activation='relu'))
# model.add(Dense(100, activation='relu'))
model.add(Dense(MAX_LEN, activation='softmax', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Activation('softmax'))

In [None]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# summarize the model
print(model.summary())

In [None]:
history = model.fit(x_train, y_train,
                    epochs=30,
                    batch_size=60,
                    validation_data=(x_val, y_val))