In [29]:
#taken from https://github.com/firascherif/ABSA-BERT-pair/blob/master/generate/data_utils_sentihood.py

# Reference: https://github.com/liufly/delayed-memory-update-entnet

from __future__ import absolute_import

import json
import operator
import os
import re
import sys
import xml.etree.ElementTree

# import nltk
import numpy as np


def load_task(data_dir, aspect2idx):
    in_file = os.path.join(data_dir, 'sentihood-train.json')
    train = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-dev.json')
    dev = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-test.json')
    test = parse_sentihood_json(in_file)

    train = convert_input(train, aspect2idx)
    train_aspect_idx = get_aspect_idx(train, aspect2idx)
    train = tokenize(train)
    dev = convert_input(dev, aspect2idx)
    dev_aspect_idx = get_aspect_idx(dev, aspect2idx)
    dev = tokenize(dev)
    test = convert_input(test, aspect2idx)
    test_aspect_idx = get_aspect_idx(test, aspect2idx)
    test = tokenize(test)

    return (train, train_aspect_idx), (dev, dev_aspect_idx), (test, test_aspect_idx)


# def get_aspect_idx(data, aspect2idx):
#     ret = []
#     for _, _, _, aspect, _ in data:
#         ret.append(aspect2idx[aspect])
#     assert len(data) == len(ret)
#     return np.array(ret)


def parse_sentihood_json(in_file):
    with open(in_file) as f:
        data = json.load(f)
    ret = []
    for d in data:
        text = d['text']
        sent_id = d['id']
        opinions = []
        targets = set()
        for opinion in d['opinions']:
            sentiment = opinion['sentiment']
            aspect = opinion['aspect']
            target_entity = opinion['target_entity']
            targets.add(target_entity)
            opinions.append((target_entity, aspect, sentiment))
        ret.append((sent_id, text, opinions))
    return ret


def convert_input(data, all_aspects):
    ret = []
    for sent_id, text, opinions in data:
        for target_entity, aspect, sentiment in opinions:
            if aspect not in all_aspects:
                continue
            ret.append((sent_id, text, target_entity, aspect, sentiment))
        assert 'LOCATION1' in text
        targets = set(['LOCATION1'])
        if 'LOCATION2' in text:
            targets.add('LOCATION2')
        for target in targets:
            aspects = set([a for t, a, _ in opinions if t == target])
            none_aspects = [a for a in all_aspects if a not in aspects]
            for aspect in none_aspects:
                ret.append((sent_id, text, target, aspect, 'None'))
    return ret


def tokenize(data):
    ret = []
    for sent_id, text, target_entity, aspect, sentiment in data:
        new_text = nltk.word_tokenize(text)
        new_aspect = aspect.split('-')
        ret.append((sent_id, new_text, target_entity, new_aspect, sentiment))
    return ret


def all_aspects(data, aspect2idx):
    aspect_logits = []
    logit = np.zeros(12) #taking care of 0 % 12 
    for i , (sent_id, text, target, aspect, sentiment) in enumerate(data, start = 1):
        if i % 12 == 0:
            aspect_logits.append(logit)
            logit = np.zeros(12)
        if sentiment != "None":
            logit[aspect2idx[aspect]] = 1
    return aspect_logits

In [30]:
dev = parse_sentihood_json("sentihood-dev.json")

In [31]:
aspect2idx = {
    'general': 0,
    'price': 1,
    'transit-location': 2,
    'safety': 3,
    'live' : 4,
    'quiet' : 5,
    'dining' : 6,
    'nightlife' : 7,
    'touristy' : 8,
    'shopping' : 9,
    'green-culture' : 10,
    'multicultural' : 11,
}

In [32]:
dev[5]

(322,
 '( LOCATION1 is the nearest tube station , just about a 4 min walk ) Youre lucky youre going to be studying in London',
 [])

In [33]:
converted_dev = convert_input(dev, aspect2idx)

In [37]:
converted_dev[0]

(302,
 ' LOCATION1 is just a normal area that happens to have an alternative market',
 'LOCATION1',
 'shopping',
 'Positive')

In [8]:
dev_aspect_id = get_aspect_idx(converted_dev, aspect2idx)

In [1]:
import torch
import torch.nn as nn
import torchvision.utils
import torch.nn.functional as F
import torch.optim as optim
# from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets, models
from torchvision.utils import save_image
import os

from dataloader import SentihoodDataset
from model.nn_model import LSTMModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")

In [3]:
from torch.nn.utils.rnn import pad_sequence
def pad_collate(batch):
    batch_embedded_text, a, b, c, d = zip(*batch)
    lens = [x.shape[0] for x in batch_embedded_text]
    
    batch_embedded_text = pad_sequence(batch_embedded_text, batch_first=True, padding_value=0)
    
    return batch_embedded_text, torch.tensor(lens, dtype=torch.float), a, b, c, d

In [4]:

train_dataset = SentihoodDataset(
        data_dir,
        dataset_type='train',
        transform=None,
        condition_on_number = False
    )

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=0, collate_fn = pad_collate)

model = LSTMModel(768, 100, 0.0, device, 768)

In [5]:
embedded_text, lens, target_index, aspect_logit, c_aspect, sentiment_one_hot = next(iter(train_loader))

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoded = model.encode(embedded_text.to(device), lens)

RuntimeError: Input and parameter tensors are not at the same device, found input tensor at cuda:0 and parameter tensor at cpu

In [16]:
embedded_text.shape

torch.Size([10, 39, 768])

In [17]:
lens

[31, 39, 17, 23, 4, 22, 16, 22, 30, 32]

In [21]:
target_index[0][0]

3

In [6]:
aspect_logit

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [7]:
sentiment_one_hot

array([0., 0., 1.])

In [16]:
a.shape

torch.Size([1, 25, 300])

In [19]:
a.squeeze(0).shape

torch.Size([25, 300])

In [19]:
import torch.nn as nn
import torch

In [None]:
k = [[1,2,3], [4,5]]
l = [[1], [2]]