In [4]:
from transformers import AutoTokenizer
import re, string
import json
import spacy
import torch

In [5]:
# Load the pretrained 'bert-base-cased' tokenizer. Later cells call it with
# return_offsets_mapping=True to get per-token character spans.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [71]:
# Position (in the loaded training list) of the example inspected below.
idx = 9

In [72]:
# Load the raw laptop training set. A context manager is used so the file
# handle is closed deterministically, and the encoding is pinned so the load
# does not depend on the platform's default locale.
with open('../data/train_laptop_raw.json', encoding='utf-8') as raw_file:
    data = json.load(raw_file)
sample = data[idx]  # the single example the following cells work with
sample

{'id': '38',
 'text': 'In the shop, these MacBooks are encased in a soft rubber enclosure - so you will never know about the razor edge until you buy it, get it home, break the seal and use it (very clever con).',
 'aspects': [{'term': 'rubber enclosure',
   'from': 50,
   'to': 66,
   'polarity': 'positive'},
  {'term': 'edge', 'from': 108, 'to': 112, 'polarity': 'negative'}]}

In [73]:
# Split the sample into its sentence and the fields of its first annotated
# aspect so they can be inspected side by side.
text, aspects = sample['text'], sample['aspects']
first_aspect = aspects[0]
term = first_aspect['term']
from_ = first_aspect['from']
to = first_aspect['to']
polarity = first_aspect['polarity']

text, term, from_, to, polarity

('In the shop, these MacBooks are encased in a soft rubber enclosure - so you will never know about the razor edge until you buy it, get it home, break the seal and use it (very clever con).',
 'rubber enclosure',
 50,
 66,
 'positive')

In [74]:
# Encode the sentence. return_offsets_mapping=True adds an 'offset_mapping'
# entry holding a (start, end) character span for every token; truncation is
# enabled with max_length=100.
tokenized = tokenizer(text, padding=True, max_length=100, truncation=True, return_offsets_mapping=True)
tokenized

{'input_ids': [101, 1130, 1103, 4130, 117, 1292, 6603, 2064, 9753, 1116, 1132, 4035, 14083, 1181, 1107, 170, 2991, 9579, 19904, 118, 1177, 1128, 1209, 1309, 1221, 1164, 1103, 20015, 2652, 1235, 1128, 4417, 1122, 117, 1243, 1122, 1313, 117, 2549, 1103, 9438, 1105, 1329, 1122, 113, 1304, 13336, 14255, 114, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 2), (3, 6), (7, 11), (11, 12), (13, 18), (19, 22), (22, 23), (23, 26), (26, 27), (28, 31), (32, 34), (34, 38), (38, 39), (40, 42), (43, 44), (45, 49), (50, 56), (57, 66), (67, 68), (69, 71), (72, 75), (76, 80), (81, 86), (87, 91), (92, 97), (98, 101), (102, 107), (108, 112), (113, 118), (119, 122), (123, 

In [75]:
# Decode the ids back to a string to eyeball what the encoding contains
# (the output shows the [CLS]/[SEP] markers around the sentence).
input_ids = tokenized['input_ids']
tokenizer.decode(input_ids)

'[CLS] In the shop, these MacBooks are encased in a soft rubber enclosure - so you will never know about the razor edge until you buy it, get it home, break the seal and use it ( very clever con ). [SEP]'

In [76]:
# Show the word-piece tokens, including the special tokens, for the sentence.
print(tokenizer.tokenize(text, add_special_tokens=True))

['[CLS]', 'In', 'the', 'shop', ',', 'these', 'Mac', '##B', '##ook', '##s', 'are', 'en', '##case', '##d', 'in', 'a', 'soft', 'rubber', 'enclosure', '-', 'so', 'you', 'will', 'never', 'know', 'about', 'the', 'razor', 'edge', 'until', 'you', 'buy', 'it', ',', 'get', 'it', 'home', ',', 'break', 'the', 'seal', 'and', 'use', 'it', '(', 'very', 'clever', 'con', ')', '.', '[SEP]']


In [77]:
def generate_tags(sample, tokenizer):
    """
    Generate one tag per token of ``sample['text']`` for aspect-term tagging.

    Tag scheme:
        'Q' - special token (offset ``(0, 0)``), meant to be ignored
        'O' - token that is not fully inside any aspect span
        'B' - first token of an aspect span
        'I' - later token of an aspect span that starts a new word
        'X' - later token of an aspect span that is a '##' word-piece

    Args:
        sample: dict with a 'text' string and an 'aspects' list. Each aspect is
            a dict whose 'from' / 'to' entries are character offsets into the
            text ('to' is exclusive, like the tokenizer's offsets). The list may
            be empty, unsorted, or contain duplicate / overlapping spans.
        tokenizer: tokenizer that, when called with
            ``return_offsets_mapping=True``, returns 'input_ids' and
            'offset_mapping', and that provides ``convert_ids_to_tokens``.

    Returns:
        ``(tags, tokens, tokenized)`` where ``tags`` and ``tokens`` are lists of
        equal length and ``tokenized`` is the tokenizer's encoding of the text.
    """
    text = sample['text']
    aspects = sorted(sample['aspects'], key=lambda aspect: aspect['from'])

    tokenized = tokenizer(text, padding=True, truncation=True, return_offsets_mapping=True)
    # Derive the token strings from the very encoding that produced the
    # offsets, so tokens, offsets and tags stay aligned even if truncation
    # shortens the sequence.
    tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

    tags = []
    aspect_idx = 0   # index of the first aspect that has not ended yet
    started = False  # True once the current aspect has received its 'B'

    for token, (from_, to) in zip(tokens, tokenized['offset_mapping']):
        if from_ == to == 0:
            tags.append('Q')  # represents the ignore tag for special tokens
            continue

        # Drop every aspect that ends at or before this token's start. Using a
        # loop (not a single step) also handles duplicate or nested spans.
        while aspect_idx < len(aspects) and aspects[aspect_idx]['to'] <= from_:
            aspect_idx += 1
            started = False

        inside_aspect = (
            aspect_idx < len(aspects)
            and from_ >= aspects[aspect_idx]['from']
            and to <= aspects[aspect_idx]['to']
        )

        if not inside_aspect:
            # Covers "no aspects left", "token before the aspect" (including a
            # token that ends exactly where the aspect begins) and tokens that
            # only partially overlap a span; every token gets an explicit tag.
            tag = 'O'
            started = False
        elif not started:
            tag = 'B'
            started = True
        elif token.startswith("##"):
            tag = 'X'  # represents the tag for subtokens
        else:
            tag = 'I'  # represents the tag for continuing aspect

        tags.append(tag)

    return tags, tokens, tokenized

In [78]:
# Tag the laptop-review sample and show every (token, tag) pair on one line.
tags, tokens, tokenized = generate_tags(sample, tokenizer)
print(list(zip(tokens, tags)))

[('[CLS]', 'Q'), ('In', 'O'), ('the', 'O'), ('shop', 'O'), (',', 'O'), ('these', 'O'), ('Mac', 'O'), ('##B', 'O'), ('##ook', 'O'), ('##s', 'O'), ('are', 'O'), ('en', 'O'), ('##case', 'O'), ('##d', 'O'), ('in', 'O'), ('a', 'O'), ('soft', 'O'), ('rubber', 'B'), ('enclosure', 'I'), ('-', 'O'), ('so', 'O'), ('you', 'O'), ('will', 'O'), ('never', 'O'), ('know', 'O'), ('about', 'O'), ('the', 'O'), ('razor', 'O'), ('edge', 'B'), ('until', 'O'), ('you', 'O'), ('buy', 'O'), ('it', 'O'), (',', 'O'), ('get', 'O'), ('it', 'O'), ('home', 'O'), (',', 'O'), ('break', 'O'), ('the', 'O'), ('seal', 'O'), ('and', 'O'), ('use', 'O'), ('it', 'O'), ('(', 'O'), ('very', 'O'), ('clever', 'O'), ('con', 'O'), (')', 'O'), ('.', 'O'), ('[SEP]', 'Q')]


In [54]:
# Hand-written check: three aspect spans in a row, separated only by single
# spaces, to exercise the hand-over from one aspect to the next.
new_sample = {
    "text": "This is a nice lappy headphones camera set!!!",
    "aspects": [
        {"term": "lappy", "from": 15, "to": 20, "polarity": "positive"},
        {"term": "headphones", "from": 21, "to": 31, "polarity": "positive"},
        {"term": "camera set", "from": 32, "to": 42, "polarity": "positive"}
    ]
}

In [56]:
# Tag the hand-written sample and list the result one token per line.
tags, tokens, tokenized = generate_tags(new_sample, tokenizer)
for token, tag in zip(tokens, tags):
    print(f"{token} {tag}")

[CLS] Q
This O
is O
a O
nice O
lap B
##py X
head B
##phones X
camera B
set I
! O
! O
! O
[SEP] Q


In [57]:
# Hand-written check: a single two-word aspect, used to see how a word that
# is split into several word pieces gets tagged.
new_sample = {
    "text": "My name is Rishabh Gupta!",
    "aspects": [
        {"term": "Rishabh Gupta", "from": 11, "to": 24, "polarity": "positive"},
    ]
}

In [58]:
# Tag the hand-written sample and list the result one token per line.
tags, tokens, tokenized = generate_tags(new_sample, tokenizer)
for token, tag in zip(tokens, tags):
    print(f"{token} {tag}")

[CLS] Q
My O
name O
is O
R B
##ish X
##ab X
##h X
Gupta I
! O
[SEP] Q
