### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import spacy
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


### Analyze Dataset

In [2]:
# Directory that holds both dataset files. Edit this single constant to
# point the notebook at a different copy of the data.
DATA_DIR = '/home/ayush/Graduate_Study/Natural_Language_Processing_Course/Assignment_2/exercise2/data'

train_dataset = DATA_DIR + '/traindata.csv'
dev_dataset = DATA_DIR + '/devdata.csv'

In [3]:
# Column names assigned to the tab-separated dataset files on read.
col_names = [
    'Polarity',
    'Aspect_Category',
    'Specific_Target_Aspect_Term',
    'Character_Offset',
    'Sentence',
]

# Read both splits and drop the Character_Offset column, which the rest of
# the notebook does not use.
train_df = pd.read_csv(train_dataset, sep='\t', names=col_names).drop(columns=['Character_Offset'])
dev_df = pd.read_csv(dev_dataset, sep='\t', names=col_names).drop(columns=['Character_Offset'])

# spaCy pipeline used later to pick out sentiment-bearing words.
spacy_parser = spacy.load('en')

vocab_size = 8000
num_aspect_categories = 12  # There are 12 Aspect Categories
num_sentiments = 3  # Positive, Negative and Neutral

In [4]:
# Report the (rows, columns) shape of each split.
for shape_label, frame in (('Train Data Shape: ', train_df), ('Dev Data Shape: ', dev_df)):
    print(shape_label, frame.shape)

Train Data Shape:  (1503, 4)
Dev Data Shape:  (376, 4)


In [5]:
# Preview the first 30 training rows. A bare last-line expression uses the
# notebook's rich DataFrame display instead of the wrapped plain-text print.
train_df.head(30)

    Polarity           Aspect_Category Specific_Target_Aspect_Term  \
0   positive          AMBIENCE#GENERAL                     seating   
1   positive          AMBIENCE#GENERAL                   trattoria   
2   positive              FOOD#QUALITY                        food   
3   negative           SERVICE#GENERAL                       STAFF   
4   positive        FOOD#STYLE_OPTIONS                        menu   
5   positive              FOOD#QUALITY                        tuna   
6   negative           SERVICE#GENERAL                       staff   
7   negative           SERVICE#GENERAL                     service   
8   positive        FOOD#STYLE_OPTIONS                    BBQ ribs   
9   positive          AMBIENCE#GENERAL                       place   
10  negative              FOOD#QUALITY         appetizer of olives   
11  negative              FOOD#QUALITY                       foods   
12  positive            DRINKS#QUALITY                      drinks   
13  positive        

### Extract Sentiment for Aspect

In [6]:
# For every training sentence keep only the lemmas of adjectives and verbs
# that are neither stop words nor punctuation, joined by single spaces.
Specific_Target_Sentiment_Term = []
for review in spacy_parser.pipe(train_df['Sentence']):
    if not review.is_parsed:
        # No parse available: record an empty string for this sentence.
        Specific_Target_Sentiment_Term.append('')
        continue
    kept_lemmas = [
        token.lemma_
        for token in review
        if token.pos_ in ("ADJ", "VERB") and not (token.is_stop or token.is_punct)
    ]
    Specific_Target_Sentiment_Term.append(' '.join(kept_lemmas))

# Store the extracted terms next to the sentences they came from.
train_df['Specific_Target_Sentiment_Term'] = Specific_Target_Sentiment_Term
train_df.head(10)

Unnamed: 0,Polarity,Aspect_Category,Specific_Target_Aspect_Term,Sentence,Specific_Target_Sentiment_Term
0,positive,AMBIENCE#GENERAL,seating,short and sweet – seating is great:it's romant...,short sweet great be romantic cozy private
1,positive,AMBIENCE#GENERAL,trattoria,This quaint and romantic trattoria is at the t...,quaint romantic
2,positive,FOOD#QUALITY,food,The have over 100 different beers to offer thi...,different offer thi happy delicious recommend
3,negative,SERVICE#GENERAL,STAFF,THIS STAFF SHOULD BE FIRED.,should be
4,positive,FOOD#STYLE_OPTIONS,menu,"The menu looked great, and the waiter was very...",look great nice come average
5,positive,FOOD#QUALITY,tuna,The tuna and wasabe potatoes are excellent.,excellent
6,negative,SERVICE#GENERAL,staff,The whole set up is truly unprofessional and I...,unprofessional wish good current great
7,negative,SERVICE#GENERAL,service,"sometimes i get bad food and bad service, some...",bad bad good good bad
8,positive,FOOD#STYLE_OPTIONS,BBQ ribs,This place has the best Chinese style BBQ ribs...,good chinese
9,positive,AMBIENCE#GENERAL,place,Great place to relax and enjoy your dinner,great relax enjoy


### Building Bag of Words Representation

In [7]:
# Fit the Keras tokenizer on the raw training sentences; `num_words` is set
# to `vocab_size`, which is also the input width of the model built below.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_df['Sentence'])

### Encode the Aspect Categories

In [8]:
#label_encoder_aspect = LabelEncoder()
#aspect_integer_category = label_encoder_aspect.fit_transform(train_df.Aspect_Category)
#aspect_dummy_category = to_categorical(aspect_integer_category)

### Building the Sentiment Analysis Model

In [9]:
# Feed-forward classifier: a vocab_size-wide input vector -> 512 ReLU units
# -> softmax over the sentiment classes.
sentiment_model = Sequential([
    Dense(512, input_shape=(vocab_size,), activation='relu'),
    Dense(num_sentiments, activation='softmax'),
])
sentiment_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Turn the extracted sentiment terms into one fixed-width row per training
# sentence (texts_to_matrix with its default mode), wrapped in a DataFrame.
train_sentiment_matrix = tokenizer.texts_to_matrix(train_df['Specific_Target_Sentiment_Term'])
tokenized_sentiment = pd.DataFrame(train_sentiment_matrix)

In [11]:
# Map the polarity strings to integer ids, then one-hot encode them.
label_encoder_sentiment = LabelEncoder()
sentiment_integer_category = label_encoder_sentiment.fit_transform(train_df.Polarity)

# Pin the one-hot width to num_sentiments so the targets always match the
# model's Dense(num_sentiments) output layer; without num_classes the width
# is inferred from the largest label id present in the data.
sentiment_dummy_category = to_categorical(sentiment_integer_category, num_classes=num_sentiments)

In [12]:
# Train the sentiment classifier on the vectorised terms for 40 epochs.
sentiment_model.fit(
    tokenized_sentiment,
    sentiment_dummy_category,
    epochs=40,
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f7e555626d8>

### Testing the ABSA Model on Validation Dataset

In [13]:
# Aspect preprocessing: gold aspect categories as a plain list so they can be
# paired positionally with the predictions below.
dev_aspect_category = list(dev_df.Aspect_Category)

# Sentiment preprocessing: for each dev sentence keep the lemmas of
# adjectives and verbs that are neither stop words nor punctuation.
dev_sentiment_terms = []
for review in spacy_parser.pipe(dev_df['Sentence']):
    if review.is_parsed:
        dev_sentiment_terms.append(' '.join(
            token.lemma_
            for token in review
            if not token.is_stop and not token.is_punct and token.pos_ in ("ADJ", "VERB")
        ))
    else:
        # No parse available: record an empty string for this sentence.
        dev_sentiment_terms.append('')

# Vectorise with the tokenizer that was fitted on the training sentences.
Dev_Specific_Target_Sentiment_Term = pd.DataFrame(tokenizer.texts_to_matrix(dev_sentiment_terms))

# Models output: take the highest-scoring class per row. This is what
# Sequential.predict_classes did for a softmax output; predict_classes is
# deprecated and absent from newer Keras releases.
dev_class_scores = sentiment_model.predict(Dev_Specific_Target_Sentiment_Term)
Dev_predict_sentiment = label_encoder_sentiment.inverse_transform(np.argmax(dev_class_scores, axis=-1))

# Report each prediction together with the gold aspect category of the review.
for review_number, (sentiment, aspect) in enumerate(zip(Dev_predict_sentiment, dev_aspect_category), start=1):
    print("Review " + str(review_number) + " is expressing a  " + sentiment + " opinion about " + aspect)

Review 1 is expressing a  positive opinion about LOCATION#GENERAL
Review 2 is expressing a  negative opinion about RESTAURANT#GENERAL
Review 3 is expressing a  positive opinion about FOOD#QUALITY
Review 4 is expressing a  positive opinion about SERVICE#GENERAL
Review 5 is expressing a  positive opinion about DRINKS#QUALITY
Review 6 is expressing a  negative opinion about AMBIENCE#GENERAL
Review 7 is expressing a  negative opinion about RESTAURANT#GENERAL
Review 8 is expressing a  positive opinion about SERVICE#GENERAL
Review 9 is expressing a  positive opinion about SERVICE#GENERAL
Review 10 is expressing a  negative opinion about RESTAURANT#MISCELLANEOUS
Review 11 is expressing a  positive opinion about SERVICE#GENERAL
Review 12 is expressing a  positive opinion about FOOD#STYLE_OPTIONS
Review 13 is expressing a  positive opinion about DRINKS#QUALITY
Review 14 is expressing a  positive opinion about FOOD#PRICES
Review 15 is expressing a  positive opinion about FOOD#QUALITY
Review 16 i

  if diff:


### Evaluation of the Predicted Sentiment Based on GroundTruth Data

In [14]:
def eval_list(glabels, slabels):
    """Print the accuracy of system labels measured against gold labels.

    Labels are compared position by position. When the two sequences differ
    in length a warning is printed and only the first
    ``min(len(glabels), len(slabels))`` pairs are scored. When there is
    nothing to score, a warning is printed and the accuracy is reported as
    0.00 instead of raising ``ZeroDivisionError``.

    Args:
        glabels: Sequence of gold (ground-truth) labels.
        slabels: Sequence of labels produced by the system.

    Returns:
        None. The accuracy is printed as a percentage with two decimals.
    """
    if len(glabels) != len(slabels):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
            len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    if n == 0:
        # No label pairs at all: avoid dividing by zero below.
        print("\nWARNING: no label pairs to evaluate\n")
        acc = 0.0
    else:
        # zip stops at the shorter sequence, i.e. after exactly n pairs.
        incorrect_count = sum(1 for gold, system in zip(glabels, slabels) if system != gold)
        acc = (n - incorrect_count) / n
    print("\nACCURACY: %.2f" % (acc * 100))

In [16]:
#Predicted Polarity or Sentiment
slabels = list(Dev_predict_sentiment)

# GroundTruth Polarity or Sentiment
glabels = list(dev_df.Polarity)

print(len(slabels))
print(len(glabels))

376
376


In [17]:
# Score the predicted polarities against the ground truth.
eval_list(glabels=glabels, slabels=slabels)


ACCURACY: 74.20
