In [None]:
import xml.etree.ElementTree as ET
from enum import Enum
import pandas as pd

# Parse the XML file (the path is relative to the current working directory)
# and keep its root element; the cells below iterate over the root's children.
restaurant = ET.parse('Restaurants_Train.xml')
sentences = restaurant.getroot()

In [None]:
class Cols(Enum):
    """Names of the DataFrame columns used in this notebook.

    Columns are addressed as ``Cols.<member>.name``; the integer values are not
    used by the code visible in this notebook.
    """
    # sentence ID taken from the data, probably not much use
    ID = 1
    # unprocessed raw text
    RawText = 2
    # list of aspect terms (str)
    AspectTerms = 3
    # list of polarities, must be the same length as the AspectTerms list.
    # Possible values are positive, negative, neutral and conflict
    AspectTermsPolarity = 4
    # spans of the aspect terms in the unprocessed raw text, same length as the AspectTerms list
    AspectTermsSpan = 5
    # tokenized raw text
    RawTextTokenize = 6
    # tokenized raw text with labels in IOB format
    RawTextWithLabelIOB = 7

    # (token, POS tag) pairs produced from the raw text
    TextWithPOS = 8

    # TextWithPOS after the placeholder substitution / tag removal step
    TextWithPOSReplaced = 9
    

In [None]:
# Column-oriented accumulator: each list receives exactly ONE entry per sentence,
# so all columns stay the same length even when a sentence lacks a <text> or an
# <aspectTerms> child.
data = {
    Cols.ID.name: [],
    Cols.RawText.name: [],
    Cols.AspectTerms.name: [],
    Cols.AspectTermsPolarity.name: [],
    Cols.AspectTermsSpan.name: [],
}

for sentence in sentences:
    # Every child of every <aspectTerms> element; an empty list when there is none.
    aspects = sentence.findall('aspectTerms/*')

    data[Cols.ID.name].append(sentence.attrib['id'])
    # findtext falls back to '' when <text> is missing, keeping the row aligned
    # instead of leaving this column one entry short.
    data[Cols.RawText.name].append(sentence.findtext('text', default=''))
    data[Cols.AspectTerms.name].append([aspect.attrib['term'] for aspect in aspects])
    data[Cols.AspectTermsPolarity.name].append([aspect.attrib['polarity'] for aspect in aspects])
    # Character offsets are stored as ints (XML attributes are strings) so that
    # consumers do not each have to cast them; int() on them remains a no-op.
    data[Cols.AspectTermsSpan.name].append(
        [(int(aspect.attrib['from']), int(aspect.attrib['to'])) for aspect in aspects]
    )

df=pd.DataFrame.from_dict(data)
display(df)
# df.to_csv("pre_processed.csv")

Unnamed: 0,ID,RawText,AspectTerms,AspectTermsPolarity,AspectTermsSpan
0,3121,But the staff was so horrible to us.,[staff],[negative],"[(8, 13)]"
1,2777,"To be completely fair, the only redeeming fact...",[food],[positive],"[(57, 61)]"
2,1634,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[(4, 8), (55, 62), (141, 145)]"
3,2534,Where Gabriela personaly greets you and recomm...,[],[],[]
4,583,"For those that go once and don't enjoy it, all...",[],[],[]
...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],[],[]
3040,777,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[(9, 19), (32, 39), (61, 66), (103, 107)]"
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],[],[]
3042,671,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[(5, 10), (17, 37), (99, 104), (114, 124), (13..."


In [None]:
import nltk
# Tokenize every raw sentence with NLTK and store the token lists in a new column.
df[Cols.RawTextTokenize.name] = df[Cols.RawText.name].apply(nltk.word_tokenize)
display(df)

Unnamed: 0,ID,RawText,AspectTerms,AspectTermsPolarity,AspectTermsSpan,RawTextTokenize
0,3121,But the staff was so horrible to us.,[staff],[negative],"[(8, 13)]","[But, the, staff, was, so, horrible, to, us, .]"
1,2777,"To be completely fair, the only redeeming fact...",[food],[positive],"[(57, 61)]","[To, be, completely, fair, ,, the, only, redee..."
2,1634,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[(4, 8), (55, 62), (141, 145)]","[The, food, is, uniformly, exceptional, ,, wit..."
3,2534,Where Gabriela personaly greets you and recomm...,[],[],[],"[Where, Gabriela, personaly, greets, you, and,..."
4,583,"For those that go once and don't enjoy it, all...",[],[],[],"[For, those, that, go, once, and, do, n't, enj..."
...,...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],[],[],"[But, that, is, highly, forgivable, .]"
3040,777,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[(9, 19), (32, 39), (61, 66), (103, 107)]","[From, the, appetizers, we, ate, ,, the, dim, ..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],[],[],"[When, we, arrived, at, 6:00, PM, ,, the, rest..."
3042,671,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[(5, 10), (17, 37), (99, 104), (114, 124), (13...","[Each, table, has, a, pot, of, boiling, water,..."


In [None]:
# Derive the IOB labels the complicated way, i.e. from the character spans (probably less error-prone, but I am not sure).
# Uses the IOB2 format instead of IOB (the first token of every aspect term gets a "B" tag); I think that will be more useful.
def getIOB(tokenized_sentence, span , polarity):
    """Label every token of a sentence with an IOB2 tag derived from aspect-term spans.

    The character position of each token is *estimated* by assuming that every
    token is followed by exactly one separator character in the raw text
    (``1 + len(token)`` per token). Tokens that were split off without
    whitespace (e.g. punctuation) make this estimate drift, so labels can be
    off for later spans in such sentences.

    Args:
        tokenized_sentence: list of token strings.
        span: list of ``(from, to)`` character offsets, one per aspect term, in
            ascending order. Offsets may be ints or numeric strings.
        polarity: list of polarity labels; ``polarity[i]`` belongs to ``span[i]``.

    Returns:
        A list with one ``(token, tag, polarity)`` tuple per input token, where
        ``tag`` is ``"B"`` for the first token of an aspect term, ``"I"`` for its
        following tokens and ``"O"`` otherwise (``polarity`` is ``None`` for ``"O"``).
    """
    result = []
    token_count = len(tokenized_sentence)
    char_idx = 0
    word_idx = 0
    for i in range(len(span)):
        idx_from, idx_to = span[i]
        idx_from, idx_to = int(idx_from), int(idx_to)

        # Tokens before the aspect term are outside any span.
        # The word_idx guard stops cleanly if the offsets run past the tokens.
        while word_idx < token_count and char_idx < idx_from:
            token = tokenized_sentence[word_idx]
            result.append((token, "O", None))
            char_idx += 1 + len(token)
            word_idx += 1

        # Tokens inside the aspect term: "B" for the first one, "I" afterwards.
        tag = "B"
        while word_idx < token_count and char_idx < idx_to:
            token = tokenized_sentence[word_idx]
            result.append((token, tag, polarity[i]))
            tag = "I"
            char_idx += 1 + len(token)
            word_idx += 1

    # Everything after the last span is outside, INCLUDING the final token
    # (the previous slice stopped one short and dropped it).
    result.extend((token, "O", None) for token in tokenized_sentence[word_idx:])
    return result

# Label each row's tokens from its aspect-term spans and polarities,
# then show the frame and write it to disk.
df[Cols.RawTextWithLabelIOB.name] = df.apply(
    lambda sentence_row: getIOB(
        sentence_row[Cols.RawTextTokenize.name],
        sentence_row[Cols.AspectTermsSpan.name],
        sentence_row[Cols.AspectTermsPolarity.name],
    ),
    axis=1,
)
display(df)
df.to_csv("pre_processed.csv")

Unnamed: 0,ID,RawText,AspectTerms,AspectTermsPolarity,AspectTermsSpan,RawTextTokenize,RawTextWithLabelIOB
0,3121,But the staff was so horrible to us.,[staff],[negative],"[(8, 13)]","[But, the, staff, was, so, horrible, to, us, .]","[(But, O, None), (the, O, None), (staff, B, ne..."
1,2777,"To be completely fair, the only redeeming fact...",[food],[positive],"[(57, 61)]","[To, be, completely, fair, ,, the, only, redee...","[(To, O, None), (be, O, None), (completely, O,..."
2,1634,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[(4, 8), (55, 62), (141, 145)]","[The, food, is, uniformly, exceptional, ,, wit...","[(The, O, None), (food, B, positive), (is, O, ..."
3,2534,Where Gabriela personaly greets you and recomm...,[],[],[],"[Where, Gabriela, personaly, greets, you, and,...","[(Where, O, None), (Gabriela, O, None), (perso..."
4,583,"For those that go once and don't enjoy it, all...",[],[],[],"[For, those, that, go, once, and, do, n't, enj...","[(For, O, None), (those, O, None), (that, O, N..."
...,...,...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],[],[],"[But, that, is, highly, forgivable, .]","[(But, O, None), (that, O, None), (is, O, None..."
3040,777,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[(9, 19), (32, 39), (61, 66), (103, 107)]","[From, the, appetizers, we, ate, ,, the, dim, ...","[(From, O, None), (the, O, None), (appetizers,..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],[],[],"[When, we, arrived, at, 6:00, PM, ,, the, rest...","[(When, O, None), (we, O, None), (arrived, O, ..."
3042,671,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[(5, 10), (17, 37), (99, 104), (114, 124), (13...","[Each, table, has, a, pot, of, boiling, water,...","[(Each, O, None), (table, B, neutral), (has, O..."


In [None]:
import contractions
def fix_contractions_pos(text):
    """Expand contractions in ``text``, then tokenize and POS-tag the result.

    Args:
        text: raw sentence string.

    Returns:
        The output of ``nltk.pos_tag`` applied to the NLTK tokens of the
        expanded text.
    """
    # contractions.fix returns the expanded string; Python strings are immutable,
    # so the result has to be captured (it was previously discarded, which meant
    # the contractions were never actually expanded).
    expanded_text = contractions.fix(text)
    return nltk.pos_tag(nltk.word_tokenize(expanded_text))

# POS-tag every raw sentence and keep the tagged tokens in their own column.
df[Cols.TextWithPOS.name] = df[Cols.RawText.name].apply(fix_contractions_pos)

In [None]:
# Show the frame, which now includes the TextWithPOS column.
display(df)

Unnamed: 0,ID,RawText,AspectTerms,AspectTermsPolarity,AspectTermsSpan,RawTextTokenize,RawTextWithLabelIOB,TextWithPOS
0,3121,But the staff was so horrible to us.,[staff],[negative],"[(8, 13)]","[But, the, staff, was, so, horrible, to, us, .]","[(But, O, None), (the, O, None), (staff, B, ne...","[(But, CC), (the, DT), (staff, NN), (was, VBD)..."
1,2777,"To be completely fair, the only redeeming fact...",[food],[positive],"[(57, 61)]","[To, be, completely, fair, ,, the, only, redee...","[(To, O, None), (be, O, None), (completely, O,...","[(To, TO), (be, VB), (completely, RB), (fair, ..."
2,1634,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[(4, 8), (55, 62), (141, 145)]","[The, food, is, uniformly, exceptional, ,, wit...","[(The, O, None), (food, B, positive), (is, O, ...","[(The, DT), (food, NN), (is, VBZ), (uniformly,..."
3,2534,Where Gabriela personaly greets you and recomm...,[],[],[],"[Where, Gabriela, personaly, greets, you, and,...","[(Where, O, None), (Gabriela, O, None), (perso...","[(Where, WRB), (Gabriela, NNP), (personaly, VB..."
4,583,"For those that go once and don't enjoy it, all...",[],[],[],"[For, those, that, go, once, and, do, n't, enj...","[(For, O, None), (those, O, None), (that, O, N...","[(For, IN), (those, DT), (that, WDT), (go, VBP..."
...,...,...,...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],[],[],"[But, that, is, highly, forgivable, .]","[(But, O, None), (that, O, None), (is, O, None...","[(But, CC), (that, DT), (is, VBZ), (highly, RB..."
3040,777,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[(9, 19), (32, 39), (61, 66), (103, 107)]","[From, the, appetizers, we, ate, ,, the, dim, ...","[(From, O, None), (the, O, None), (appetizers,...","[(From, IN), (the, DT), (appetizers, NNS), (we..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],[],[],"[When, we, arrived, at, 6:00, PM, ,, the, rest...","[(When, O, None), (we, O, None), (arrived, O, ...","[(When, WRB), (we, PRP), (arrived, VBD), (at, ..."
3042,671,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[(5, 10), (17, 37), (99, 104), (114, 124), (13...","[Each, table, has, a, pot, of, boiling, water,...","[(Each, O, None), (table, B, neutral), (has, O...","[(Each, DT), (table, NN), (has, VBZ), (a, DT),..."


In [None]:
# POS tags whose tokens are swapped for a placeholder string.
pos_replace = {
    "CD": "<NUM>",
    "NNP": "<PNOUN>",
    "NNPS": "<PNOUN>",
}
# POS tags whose tokens are dropped entirely (currently none).
pos_remove = []

def replace_remove(x):
    """Substitute placeholders for, or drop, tokens according to their POS tag.

    Args:
        x: iterable of ``(word, tag)`` pairs.

    Returns:
        A new list of ``(word, tag)`` pairs in the original order. Pairs whose
        tag is in ``pos_remove`` are omitted; for tags in ``pos_replace`` the
        word is replaced by the mapped placeholder while the tag is kept.
    """
    return [
        (pos_replace.get(tag, word), tag)
        for word, tag in x
        if tag not in pos_remove
    ]

# Apply the placeholder substitution / tag removal to every tagged sentence.
df[Cols.TextWithPOSReplaced.name] = df[Cols.TextWithPOS.name].apply(replace_remove)
display(df)

Unnamed: 0,ID,RawText,AspectTerms,AspectTermsPolarity,AspectTermsSpan,RawTextTokenize,RawTextWithLabelIOB,TextWithPOS,TextWithPOSReplaced
0,3121,But the staff was so horrible to us.,[staff],[negative],"[(8, 13)]","[But, the, staff, was, so, horrible, to, us, .]","[(But, O, None), (the, O, None), (staff, B, ne...","[(But, CC), (the, DT), (staff, NN), (was, VBD)...","[(But, CC), (the, DT), (staff, NN), (was, VBD)..."
1,2777,"To be completely fair, the only redeeming fact...",[food],[positive],"[(57, 61)]","[To, be, completely, fair, ,, the, only, redee...","[(To, O, None), (be, O, None), (completely, O,...","[(To, TO), (be, VB), (completely, RB), (fair, ...","[(To, TO), (be, VB), (completely, RB), (fair, ..."
2,1634,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]","[(4, 8), (55, 62), (141, 145)]","[The, food, is, uniformly, exceptional, ,, wit...","[(The, O, None), (food, B, positive), (is, O, ...","[(The, DT), (food, NN), (is, VBZ), (uniformly,...","[(The, DT), (food, NN), (is, VBZ), (uniformly,..."
3,2534,Where Gabriela personaly greets you and recomm...,[],[],[],"[Where, Gabriela, personaly, greets, you, and,...","[(Where, O, None), (Gabriela, O, None), (perso...","[(Where, WRB), (Gabriela, NNP), (personaly, VB...","[(Where, WRB), (<PNOUN>, NNP), (personaly, VBZ..."
4,583,"For those that go once and don't enjoy it, all...",[],[],[],"[For, those, that, go, once, and, do, n't, enj...","[(For, O, None), (those, O, None), (that, O, N...","[(For, IN), (those, DT), (that, WDT), (go, VBP...","[(For, IN), (those, DT), (that, WDT), (go, VBP..."
...,...,...,...,...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],[],[],"[But, that, is, highly, forgivable, .]","[(But, O, None), (that, O, None), (is, O, None...","[(But, CC), (that, DT), (is, VBZ), (highly, RB...","[(But, CC), (that, DT), (is, VBZ), (highly, RB..."
3040,777,"From the appetizers we ate, the dim sum and ot...","[appetizers, dim sum, foods, food]","[positive, positive, positive, positive]","[(9, 19), (32, 39), (61, 66), (103, 107)]","[From, the, appetizers, we, ate, ,, the, dim, ...","[(From, O, None), (the, O, None), (appetizers,...","[(From, IN), (the, DT), (appetizers, NNS), (we...","[(From, IN), (the, DT), (appetizers, NNS), (we..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],[],[],"[When, we, arrived, at, 6:00, PM, ,, the, rest...","[(When, O, None), (we, O, None), (arrived, O, ...","[(When, WRB), (we, PRP), (arrived, VBD), (at, ...","[(When, WRB), (we, PRP), (arrived, VBD), (at, ..."
3042,671,Each table has a pot of boiling water sunken i...,"[table, pot of boiling water, meats, vegetable...","[neutral, neutral, neutral, neutral, neutral, ...","[(5, 10), (17, 37), (99, 104), (114, 124), (13...","[Each, table, has, a, pot, of, boiling, water,...","[(Each, O, None), (table, B, neutral), (has, O...","[(Each, DT), (table, NN), (has, VBZ), (a, DT),...","[(Each, DT), (table, NN), (has, VBZ), (a, DT),..."


In [None]:
# Write the current frame to "pre_processed.csv" in the working directory.
df.to_csv("pre_processed.csv")

In [None]:
# Quick sanity check of the NLTK POS tagger on a short, hand-tokenized sentence.
print(nltk.pos_tag(["I","eat","fried","rice"]))

[('I', 'PRP'), ('eat', 'VBP'), ('fried', 'JJ'), ('rice', 'NN')]
