# Reading data and Preprocessing

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import torch
import copy

import warnings
warnings.filterwarnings("ignore")

In [3]:
def preprocess(filename):
    """
    desc:
    ====
    Takes in the filename, outputs a list of dictionaries. The dictionary will contain
    [review_id, raw_text, aspect_term, aspect_polarity]
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    processed_data = [] # Store the final list of dictionaries
    
    for sentence in root.findall('sentence'):
        data = {}
        raw_text = sentence[0].text
        aspects = []
        aspect_polarity = []
        sentence_id = sentence.attrib['id']
        all_terms = sentence.find('aspectTerms')
        
        if all_terms: # If there are aspect terms
            all_terms_lst = all_terms.findall('aspectTerm')
            for ele in all_terms_lst: # Iterate through all aspect terms in senetence
                term = ele.get('term')
                aspects.append(term)
                term_polarity = ele.get('polarity')
                aspect_polarity.append(term_polarity)
        data['review_id'] = sentence_id
        data['raw_text'] = raw_text
        data['aspect_term'] = aspects
        data['aspect_polarity'] = aspect_polarity
        
        processed_data.append(data)
    return processed_data 

In [4]:
train_raw = preprocess('Restaurants_Train.xml')
train = pd.DataFrame(train_raw)
test_raw = preprocess('Restaurants_Test.xml')
test = pd.DataFrame(test_raw)

In [5]:
# import nltk
# from string import punctuation


# def pos_tag(texts):
#     text_tags = []
#     processed_text = []
#     for text in texts:
#         text = text.lower()
#         text = nltk.word_tokenize(text)
#         text = [word for word in text if word not in punctuation]
#         tag_words = nltk.pos_tag(text)
#         tags = []
#         sent = []
#         for i in tag_words:
#             if i[1] not in punctuation:
#                 tags.append(i[1])
#             if i[0] not in punctuation:
#                 sent.append(i[0])
#         text_tags.append(tags)
#         processed_text.append(sent)
#     return text_tags,processed_text

In [6]:
# text_tags, processed_text = pos_tag(train['raw_text'])
# df = pd.DataFrame()
# joined_text = []
# for lst in processed_text:
#     joined_text.append(' '.join(lst))
# df['text_full'] = joined_text
# df['text'] = processed_text
# df['pos_tags'] = text_tags

# df

In [7]:
import stanza
nlp = stanza.Pipeline('en', use_gpu = True)
# doc = nlp(list(df['text_full']))
def get_dependencies(pipeline,data):
    word_lst = []
    pos_tag = []
    all_tags = []
    word_joined = []
    for j in data:
        ind_tag = []
        temp_word = []
        temp_pos = []
        en_doc = pipeline(j)
        for i,sent in enumerate(en_doc.sentences):
            for word in sent.words:
                ind_tag.append((word.head,word.deprel))
                temp_word.append(word.text)
                temp_pos.append(word.xpos)
        all_tags.append(ind_tag)
        word_lst.append(temp_word)
        pos_tag.append(temp_pos)
        word_joined.append(' '.join(temp_word))
    df = pd.DataFrame()
    df['text_full'] = word_joined
    df['text'] = word_lst
    df['pos_tags'] = pos_tag
    df['dependencies'] = all_tags
    return df
df = get_dependencies(nlp,train['raw_text'])

2022-04-02 20:45:07 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-04-02 20:45:07 INFO: Use device: gpu
2022-04-02 20:45:07 INFO: Loading: tokenize
2022-04-02 20:45:10 INFO: Loading: pos
2022-04-02 20:45:10 INFO: Loading: lemma
2022-04-02 20:45:10 INFO: Loading: depparse
2022-04-02 20:45:10 INFO: Loading: sentiment
2022-04-02 20:45:11 INFO: Loading: constituency
2022-04-02 20:45:11 INFO: Loading: ner
2022-04-02 20:45:11 INFO: Done loading processors!


In [8]:
df

Unnamed: 0,text_full,text,pos_tags,dependencies
0,But the staff was so horrible to us .,"[But, the, staff, was, so, horrible, to, us, .]","[CC, DT, NN, VBD, RB, JJ, IN, PRP, .]","[(6, cc), (3, det), (6, nsubj), (6, cop), (6, ..."
1,"To be completely fair , the only redeeming fac...","[To, be, completely, fair, ,, the, only, redee...","[TO, VB, RB, JJ, ,, DT, JJ, JJ, NN, VBD, DT, N...","[(4, mark), (4, cop), (4, advmod), (12, advcl)..."
2,"The food is uniformly exceptional , with a ver...","[The, food, is, uniformly, exceptional, ,, wit...","[DT, NN, VBZ, RB, JJ, ,, IN, DT, RB, JJ, NN, W...","[(2, det), (5, nsubj), (5, cop), (5, advmod), ..."
3,Where Gabriela personaly greets you and recomm...,"[Where, Gabriela, personaly, greets, you, and,...","[WRB, NNP, RB, VBZ, PRP, CC, VBZ, PRP, WP, TO,...","[(4, advmod), (4, nsubj), (4, advmod), (0, roo..."
4,"For those that go once and do n't enjoy it , a...","[For, those, that, go, once, and, do, n't, enj...","[IN, DT, WDT, VBP, RB, CC, VBP, RB, VB, PRP, ,...","[(2, case), (16, obl), (4, nsubj), (2, acl:rel..."
...,...,...,...,...
3036,But that is highly forgivable .,"[But, that, is, highly, forgivable, .]","[CC, DT, VBZ, RB, JJ, .]","[(5, cc), (5, nsubj), (5, cop), (5, advmod), (..."
3037,"From the appetizers we ate , the dim sum and o...","[From, the, appetizers, we, ate, ,, the, dim, ...","[IN, DT, NNS, PRP, VBD, ,, DT, JJ, NN, CC, JJ,...","[(3, case), (3, det), (18, obl), (5, nsubj), (..."
3038,"When we arrived at 6:00 PM , the restaurant wa...","[When, we, arrived, at, 6:00, PM, ,, the, rest...","[WRB, PRP, VBD, IN, CD, NN, ,, DT, NN, VBD, RB...","[(3, mark), (3, nsubj), (12, advcl), (6, case)..."
3039,Each table has a pot of boiling water sunken i...,"[Each, table, has, a, pot, of, boiling, water,...","[DT, NN, VBZ, DT, NN, IN, VBG, NN, JJ, IN, PRP...","[(2, det), (3, nsubj), (0, root), (5, det), (3..."


# Rule-Based Aspect Extraction

In [9]:
# Bing Liu sentiment lexicon
from nltk.corpus import opinion_lexicon
opinion_words = opinion_lexicon.words()
opinion_words

['2-faced', '2-faces', 'abnormal', 'abolish', ...]

In [108]:
def extract_aspects1(data, op_lex):
    """
    desc: rule based aspect extraction
    ====
    Inputs: processed data, bingliu sentiment lexicons
    Outputs: 2-d array where each nested array
    """
    aspects = []
    best_aspects = []
    ## level 1
    for ind, row in data.iterrows():
        if ind%200 == 0:
            print(ind)
        temp_aspects = []
        temp_best_aspects = []
        word_lst = row.text
        pos_tags = row.pos_tags
        dependencies = row.dependencies
        for i in range(len(word_lst)):
            # (adj noun), adjective is not opinion, concat and add to aspect
            if pos_tags[i] in ['NN'] and i>0 and pos_tags[i-1] in ['JJ'] and word_lst[i-1].lower() not in op_lex:
                concat_word = ' '.join(word_lst[i-1:i+1])
                if concat_word not in temp_aspects:
                    temp_aspects.append(concat_word)
                    
            # (noun noun) concat and add to aspect
            if pos_tags[i] in ['NN'] and i>0 and pos_tags[i-1] in ['NN']:
                concat_word = ' '.join(word_lst[i-1:i+1])
                if concat_word not in temp_aspects:
                    temp_aspects.append(concat_word)
                    
            # (adj noun), adjective is opinion, add noun to aspect
            if pos_tags[i] in ['NN','NNS'] and i>0 and pos_tags[i-1] in ['JJ'] and word_lst[i-1].lower() in op_lex:
                if word_lst[i] not in temp_aspects:
                    temp_aspects.append(word_lst[i])
                if word_lst[i] not in temp_best_aspects:
                    temp_best_aspects.append(word_lst[i])
                    
                    

            # noun, dobj releationship with verb
            if pos_tags[i] in ['NN','NNS'] and dependencies[i][1] == 'dobj' and pos_tags[dependencies[i][0]-1] in ['VB','VBN']:
                if word_lst[i] not in temp_aspects:
                    temp_aspects.append(word_lst[i])
                    
            # noun, nsubj releationship with adjective
            if pos_tags[i] in ['NN','NNS'] and dependencies[i][1] == 'nsubj' and pos_tags[dependencies[i][0]-1] in ['JJ']:
                if word_lst[i] not in temp_aspects:
                    temp_aspects.append(word_lst[i])
            
#             noun, relationship with copula verb
#             if pos_tags[i] in ['NN'] and dependencies[i][1] == 'cop' and pos_tags[dependencies[i][0]-1] in ['VB','VBD','VBP']:
#                 if word_lst[i] not in temp_aspects:
#                     temp_aspects.append(word_lst[i])
#                 if word_lst[i] not in temp_best_aspects:
#                     temp_best_aspects.append(word_lst[i])
                    
#             sentence contains subject verb and word has an advmod or amod which is an opinion word
#             for j in range(len(pos_tags)-1):
#                 temp_tag = pos_tags[j:j+2]
#                 if temp_tag[0] in ['PRP','PRP$'] and temp_tag[1] in ['VB','VBD','VBG','VBN']:
#                     if pos_tags[i] in ['NN'] and dependencies[i][1] in ['amod','advmod'] and word_lst[dependencies[i][0]-1] in op_lex:
#                         if word_lst[i] not in temp_aspects:
#                             temp_aspects.append(word_lst[i])
#                         if word_lst[i] not in temp_best_aspects:
#                             temp_best_aspects.append(word_lst[i])
            
        aspects.append(temp_aspects)
        best_aspects.append(temp_best_aspects)
    return aspects, best_aspects
    ## level 2
def extract_aspects2(data, aspects):
    aspects_level2 = copy.deepcopy(aspects)
    for ind, row in data.iterrows():
        if ind%200 == 0:
            print(ind)
        word_lst = row.text
        pos_tags = row.pos_tags
        dependencies = row.dependencies
        for word in aspects[ind]:
            if len(word.split())==1:
                word_ind = word_lst.index(word)
                if pos_tags[dependencies[word_ind][0]-1] in ['NN','NNS']:
                    aspects_level2[ind].append(word_lst[dependencies[word_ind][0]-1])
    
    return aspects_level2
                        

In [109]:
aspects, best_aspects = extract_aspects1(df, opinion_words)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000


In [110]:
aspects2 = extract_aspects2(df,aspects)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000


In [111]:
def prune(aspect2,best_aspects):
    # prune all double words
    pruned_aspects = []
    for i in range(len(aspect2)):
        temp_aspects = copy.deepcopy(aspect2[i])
        for word in aspect2[i]:
            if len(word.split()) == 1:
                for word_lst in aspect2[i]:
                    if word in word_lst.split() and len(word_lst.split())>1 and word not in best_aspects[i] and word in temp_aspects:
                        temp_aspects.remove(word)
        pruned_aspects.append(temp_aspects)
    return pruned_aspects

In [112]:
pruned_aspects = prune(aspects2,best_aspects)
actual_tags = list(train.aspect_term)

In [113]:
def prec_rec(aspects,labels):
#     indiv_prec = [] # ratio of labels we got correct
#     indiv_recall = [] # ratio of correct labels against real labels
#     for i in range(len(aspects)):
#         count = 0
#         if len(aspects[i]) == 0 and len(labels[i]) == 0:
# #             indiv_prec.append(1)
# #             indiv_recall.append(1)
#             pass
#         elif len(aspects[i]) == 0 and len(labels[i]) > 0:
#             indiv_prec.append(1)
#             indiv_recall.append(0)
#         elif len(aspects[i]) > 0 and len(labels[i]) == 0:
#             indiv_prec.append(0)
#             indiv_recall.append(1)
#         else:
#             for term in aspects[i]:
#                 if term in labels[i]:
#                     count+=1
#             indiv_prec.append(count/len(aspects))
#             indiv_recall.append(count/len(labels))
    tp = 0
    fn = 0
    fp = 0
    for i in range(len(aspects)):
        if len(aspects[i]) == 0 and len(labels[i]) == 0:
            pass
        elif len(aspects[i]) == 0 and len(labels[i]) > 0:
            fn += len(labels[i])
        elif len(aspects[i]) > 0 and len(labels[i]) == 0:
            fp += len(aspects[i])
        else:
            for term in aspects[i]:
                if term in labels[i]:
                    tp += 1
                else:
                    fp += 1
            for term in labels[i]:
                if term in aspects[i]:
                    pass
                else:
                    fn += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    return precision, recall

In [114]:
precision, recall = prec_rec(pruned_aspects,actual_tags)
print('Precision: ', precision)
print('Recall: ', recall)

Precision:  0.46554412223261615
Recall:  0.39909115209836943


In [115]:
def extract_single(aspects):
    aspect_lst = []
    for lst in aspects:
        temp_lst = []
        for word_lst in lst:
            if len(word_lst.split())==1:
                temp_lst.append(word_lst)
        aspect_lst.append(temp_lst)
    return aspect_lst
single_pruned_aspects = extract_single(pruned_aspects)
single_labels = extract_single(actual_tags)
def extract_single_double(aspects):
    aspect_lst = []
    for lst in aspects:
        temp_lst = []
        for word_lst in lst:
            if len(word_lst.split())<=2:
                temp_lst.append(word_lst)
        aspect_lst.append(temp_lst)
    return aspect_lst
single_pruned_aspects = extract_single(pruned_aspects)
single_labels = extract_single(actual_tags)
double_pruned_aspects = extract_single_double(pruned_aspects)
double_labels = extract_single_double(actual_tags)

In [116]:
precision,recall = prec_rec(single_pruned_aspects, single_labels)
print('Precision: ', precision)
print('Recall: ', recall)

Precision:  0.6348025186033199
Recall:  0.3913196894848271


In [117]:
precision,recall = prec_rec(double_pruned_aspects, double_labels)
print('Precision: ', precision)
print('Recall: ', recall)

Precision:  0.46554412223261615
Recall:  0.4320023148148148


In [99]:
compare_labels = pd.DataFrame()
compare_labels['pred'] = pruned_aspects
compare_labels['true'] = actual_tags
compare_labels[:50]

Unnamed: 0,pred,true
0,[staff],[staff]
1,"[factor, food]",[food]
2,"[food, kitchen]","[food, kitchen, menu]"
3,[],[]
4,[],[]
5,"[food, perks]","[food, perks]"
6,[],[]
7,[waiters],"[orrechiete with sausage and chicken, waiters,..."
8,"[taste, texture, taste]",[Bagels]
9,[food],[food]


In [23]:
print(df.loc[2].text_full)
print(df.loc[2].text)
print(df.loc[2].pos_tags)
print(df.loc[2].dependencies)

The food is uniformly exceptional , with a very capable kitchen which will proudly whip up whatever you feel like eating , whether it 's on the menu or not .
['The', 'food', 'is', 'uniformly', 'exceptional', ',', 'with', 'a', 'very', 'capable', 'kitchen', 'which', 'will', 'proudly', 'whip', 'up', 'whatever', 'you', 'feel', 'like', 'eating', ',', 'whether', 'it', "'s", 'on', 'the', 'menu', 'or', 'not', '.']
['DT', 'NN', 'VBZ', 'RB', 'JJ', ',', 'IN', 'DT', 'RB', 'JJ', 'NN', 'WDT', 'MD', 'RB', 'VB', 'RP', 'WDT', 'PRP', 'VBP', 'IN', 'VBG', ',', 'IN', 'PRP', 'VBZ', 'IN', 'DT', 'NN', 'CC', 'RB', '.']
[(2, 'det'), (5, 'nsubj'), (5, 'cop'), (5, 'advmod'), (0, 'root'), (5, 'punct'), (11, 'case'), (11, 'det'), (10, 'advmod'), (11, 'amod'), (5, 'obl'), (15, 'nsubj'), (15, 'aux'), (15, 'advmod'), (11, 'acl:relcl'), (15, 'compound:prt'), (15, 'obj'), (19, 'nsubj'), (17, 'acl:relcl'), (21, 'mark'), (19, 'advcl'), (28, 'punct'), (28, 'mark'), (28, 'nsubj'), (28, 'cop'), (28, 'case'), (28, 'det'), (19

In [24]:
import nltk
nltk.pos_tag(['fried','rice'])

[('fried', 'VBN'), ('rice', 'NN')]

In [25]:
flat_pred = [word for sublist in single_pruned_aspects for word in sublist]
flat_test = [word for sublist in actual_tags for word in sublist]

In [26]:
pd.value_counts(np.array(flat_pred)).head(10)

food          226
service       145
place          86
staff          40
prices         38
restaurant     33
atmosphere     29
Service        22
meal           21
experience     20
dtype: int64

In [27]:
pd.value_counts(np.array(flat_test)).head(10)

food          357
service       206
place          64
prices         60
menu           57
staff          56
dinner         55
atmosphere     49
pizza          43
table          41
dtype: int64

In [28]:
df.iloc[49]

text_full                        The fried rice is amazing here .
text                     [The, fried, rice, is, amazing, here, .]
pos_tags                            [DT, VBN, NN, VBZ, JJ, RB, .]
dependencies    [(3, det), (3, amod), (5, nsubj), (5, cop), (0...
Name: 49, dtype: object

In [34]:
count_dic = {}
for lst in actual_tags:
    for aspects in lst:
        leng = len(aspects.split())
        if leng not in count_dic.keys():
            count_dic[leng] = 1
        else:
            count_dic[leng] += 1
        if leng == 15:
            print( aspects)
count_dic

egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork


{1: 2786,
 5: 26,
 3: 178,
 2: 622,
 4: 60,
 6: 8,
 13: 2,
 7: 2,
 10: 2,
 8: 1,
 9: 4,
 19: 1,
 15: 1}

In [63]:
len_dic = {}
for lst in actual_tags:
    len_lst = len(lst)
    if len_lst not in len_dic.keys():
        len_dic[len_lst] = 1
    else:
        len_dic[len_lst] +=1
len_dic # total = 3041


0.9999999999999999

# Baseline

In [77]:
import string
def baseline_model(data):
    aspect_lst = []
    for i in data.text:
        temp_wordlst = [word for word in i if word not in string.punctuation]
        temp_aspects =[]
        num_aspects = np.random.choice(np.arange(0, 10), p=[1020/3041, 1023/3041, 572/3041, 269/3041, 104/3041, 29/3041,15/3041,5/3041,3/3041,1/3041])
        for j in range(num_aspects):
            len_aspects = np.random.choice(np.arange(1,9), p=[2786/3683,622/3683,178/3683,60/3683,26/3683,8/3683,2/3683,1/3683])
            if len_aspects>=len(temp_wordlst):
                temp_aspects.append(' '.join(temp_wordlst))
            else:
                max_ind = len(temp_wordlst)-len_aspects
                if max_ind-1 == 0:
                    start_ind = 0
                else:
                    start_ind = np.random.randint(0,max_ind-1)
                aspect_joined = " ".join(temp_wordlst[start_ind:start_ind+len_aspects])
                temp_aspects.append(aspect_joined)
                temp_wordlst = temp_wordlst[:start_ind] + temp_wordlst[start_ind+len_aspects:]
        aspect_lst.append(temp_aspects)
    return aspect_lst

In [79]:
baseline_extraction = baseline_model(df)

In [92]:
precision, recall = prec_rec(baseline_extraction,actual_tags)
print('Precision: ', precision)
print('Recall: ', recall)

Precision:  0.05486628067273228
Recall:  0.05385656292286874
