In [1]:
import os
import re
import json
import string

import gensim
import numpy as np
import pandas as pd
from argparse import Namespace
from collections import Counter

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn import init
from torch.nn.parameter import Parameter
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import codecs

%matplotlib inline
%env PYTHONHASHSEED=0

env: PYTHONHASHSEED=0


# Function Definition

## Word2Vec model
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [2]:
def parse_xml(path):
    """Parse the XML document at ``path`` and return its root element.

    NOTE: ``ET`` (``xml.etree.ElementTree``) is imported in a later cell of
    this notebook, so that cell has to run before this function is called.
    """
    tree = ET.parse(path)
    return tree.getroot()


def get_text_from_xml(xml):
    """Collect sentence texts and opinion labels from a parsed review XML tree.

    Args:
        xml: root element whose ``Review`` children each contain a
            ``sentences`` element holding ``sentence`` elements.

    Returns:
        A 4-tuple ``(texts, texts_merged, labels, labels_merged)``:
          * texts: one string per sentence;
          * texts_merged: one string per review (its sentences, each
            prefixed with a single space, concatenated);
          * labels: one ``{category: target}`` dict per sentence (empty when
            the sentence has no ``Opinions`` element);
          * labels_merged: one dict per review, the union of its sentence
            dicts (later sentences overwrite earlier ones on key clashes).
    """
    sentence_texts, review_texts = [], []
    sentence_labels, review_labels = [], []

    for review in xml.findall('Review'):
        joined_text = ''
        merged = {}

        for sentence in review.find('sentences').findall('sentence'):
            sentence_text = sentence.find('text').text
            sentence_texts.append(sentence_text)
            joined_text += ' {}'.format(sentence_text)

            opinions = sentence.find('Opinions')
            if opinions is None:
                # Unlabelled sentence: keep the lists aligned with an empty dict.
                sentence_labels.append({})
                continue

            # A category that occurs twice keeps the target of its last opinion.
            by_category = {
                opinion.get('category'): opinion.get('target')
                for opinion in opinions.findall('Opinion')
            }
            sentence_labels.append(by_category)
            merged.update(by_category)

        review_texts.append(joined_text)
        review_labels.append(merged)

    return sentence_texts, review_texts, sentence_labels, review_labels

def parseSentence(line):
    """Tokenize, stop-word filter and lemmatize one line of raw text.

    Args:
        line (str): raw sentence.

    Returns:
        list[str]: lemmatized tokens of the lower-cased line, in order, with
        English stop words removed. Tokens come from the tokenizer built by a
        default ``CountVectorizer``.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests; the original list was scanned
    # linearly for every token.
    stop_words = set(stopwords.words('english'))
    tokens = CountVectorizer().build_tokenizer()(line.lower())
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

def preprocess_train(train_path, preprocessed_fold):
    """Write the tokenized training corpus to ``<preprocessed_fold>/train.txt``.

    Each input line is passed through ``parseSentence``; lines that yield no
    tokens are dropped, the others are written as space-joined tokens.

    Args:
        train_path (str): path of the raw UTF-8 training text, one sentence per line.
        preprocessed_fold (str): existing directory receiving ``train.txt``.
    """
    out_path = os.path.join(preprocessed_fold, 'train.txt')
    # Context managers guarantee the output is flushed and closed before any
    # later step (e.g. Word2Vec training) reads the file back.
    with codecs.open(train_path, 'r', 'utf-8') as src, \
            codecs.open(out_path, 'w', 'utf-8') as dst:
        for line in src:
            tokens = parseSentence(line)
            if len(tokens) > 0:
                dst.write(' '.join(tokens) + '\n')

def preprocess_test(args, test_path, test_lab_path, preprocessed_fold):
    """Write the tokenized test sentences and their labels.

    Only sentences whose (stripped) label is listed in ``args.aspects`` and
    that still contain tokens after ``parseSentence`` are kept. Results go to
    ``test.txt`` and ``test_label.txt`` inside ``preprocessed_fold``.

    Args:
        args: settings namespace; reads ``domain``, ``aspects`` and, for the
            laptop domain, ``raw_test_xml_path``.
        test_path (str): raw test sentences, one per line.
        test_lab_path (str): gold labels, aligned line by line with ``test_path``.
        preprocessed_fold (str): existing output directory.
    """
    if args.domain == 'laptop':
        # For the laptop domain the raw text/label files are first generated
        # from the XML file.
        # NOTE(review): process_labels_semeval_laptop is not defined in the
        # visible part of this notebook - confirm it exists before using
        # domain='laptop'.
        root = parse_xml(args.raw_test_xml_path)
        texts, texts_merged, labels, labels_merged = get_text_from_xml(root)
        gold_labels = process_labels_semeval_laptop(labels)
        with open(test_path, 'w') as f:
            f.writelines([comment + '\n' for comment in texts])
        with open(test_lab_path, 'w') as f:
            f.writelines([label + '\n' for label in gold_labels])

    out_text_path = os.path.join(preprocessed_fold, 'test.txt')
    out_label_path = os.path.join(preprocessed_fold, 'test_label.txt')

    # All four handles are closed (and the outputs flushed) on exit, even if
    # tokenization raises part-way through.
    with codecs.open(test_path, 'r', 'utf-8') as f1, \
            codecs.open(test_lab_path, 'r', 'utf-8') as f2, \
            codecs.open(out_text_path, 'w', 'utf-8') as out1, \
            codecs.open(out_label_path, 'w', 'utf-8') as out2:
        for text, label in zip(f1, f2):
            label = label.strip()
            if label not in args.aspects:
                continue
            tokens = parseSentence(text)
            if len(tokens) > 0:
                out1.write(' '.join(tokens) + '\n')
                out2.write(label + '\n')


def preprocess(args, train_path, test_path, test_lab_path, preprocessed_fold):
    """Run the train and test preprocessing steps, creating the output folder if needed.

    Args:
        args: settings namespace forwarded to ``preprocess_test``.
        train_path (str): raw training text file.
        test_path (str): raw test text file.
        test_lab_path (str): raw test label file.
        preprocessed_fold (str): directory receiving the preprocessed files.
    """
    folder_missing = not os.path.exists(preprocessed_fold)
    if folder_missing:
        os.makedirs(preprocessed_fold)
        print('Folder "{}" created!'.format(preprocessed_fold))

    print("Processing train data!")
    preprocess_train(train_path, preprocessed_fold)

    print("Processing test data!")
    preprocess_test(args, test_path, test_lab_path, preprocessed_fold)
    
# W2V training
class MySentences(object):
    """Re-iterable corpus reader: yields each line of a UTF-8 file as a token list.

    Args:
        filename (str): path of a text file with one whitespace-tokenized
            sentence per line.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # The file is reopened on every pass so the object can be iterated
        # several times; the context manager closes the handle each time
        # instead of leaking it.
        with codecs.open(self.filename, 'r', 'utf-8') as handle:
            for line in handle:
                yield line.split()


def word2vec(train_path, w2v_path, size=200, window=5, min_count=10):
    """Train a Word2Vec model on the preprocessed corpus and save it.

    Args:
        train_path (str): preprocessed training file (one sentence per line).
        w2v_path (str): destination path passed to ``model.save``.
        size (int): embedding dimensionality.
        window (int): context window size.
        min_count (int): minimum word frequency passed to Word2Vec.
    """
    # NOTE(review): the `size=` keyword is the gensim 3.x spelling - confirm
    # the installed gensim version accepts it.
    corpus = MySentences(train_path)
    w2v_params = dict(size=size, window=window, min_count=min_count, workers=1)
    gensim.models.Word2Vec(corpus, **w2v_params).save(w2v_path)

    


## DOConv Layer
Code from [DOConv GitHub](https://github.com/yangyanli/DO-Conv)

In [4]:
import math
import torch
import numpy as np
from torch.nn import init
from itertools import repeat
from torch.nn import functional as F
from torch._six import container_abcs
from torch._jit_internal import Optional
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class DOConv2d(Module):
    """
       DOConv2d can be used as an alternative for torch.nn.Conv2d.
       The interface is similar to that of Conv2d, with one exception:
            1. D_mul: the depth multiplier for the over-parameterization.
       Note that the groups parameter switches between DO-Conv (groups=1),
       DO-DConv (groups=in_channels), DO-GConv (otherwise).

       Parameters held by the module:
            W: (out_channels, in_channels // groups, D_mul)
            D: (in_channels, M * N, D_mul), trainable, initialised to zero
               (only when M * N > 1)
            D_diag: same shape as D, frozen; identity-like so that at
               initialisation the composed kernel equals W
            bias: (out_channels,) or None
       where (M, N) = kernel_size.
    """
    __constants__ = ['stride', 'padding', 'dilation', 'groups',
                     'padding_mode', 'output_padding', 'in_channels',
                     'out_channels', 'kernel_size', 'D_mul']
    __annotations__ = {'bias': Optional[torch.Tensor]}

    def __init__(self, in_channels, out_channels, kernel_size, D_mul=None, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'):
        """Build the layer.

        Raises:
            ValueError: if in_channels or out_channels is not divisible by
                groups, or padding_mode is not one of
                'zeros', 'reflect', 'replicate', 'circular'.
        """
        super(DOConv2d, self).__init__()

        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)

        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')
        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
        if padding_mode not in valid_padding_modes:
            raise ValueError("padding_mode must be one of {}, but got padding_mode='{}'".format(
                valid_padding_modes, padding_mode))
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.padding_mode = padding_mode
        # F.pad expects the padding of the LAST dimension first, i.e.
        # (w_left, w_right, h_top, h_bottom) for padding=(pad_h, pad_w).
        # The previous (pad_h, pad_h, pad_w, pad_w) order padded the wrong
        # axes whenever pad_h != pad_w and padding_mode != 'zeros'.
        self._padding_repeated_twice = tuple(x for x in reversed(self.padding) for _ in range(2))

        # ---------------------- Initialization of D & W ----------------------
        M = self.kernel_size[0]
        N = self.kernel_size[1]
        self.D_mul = M * N if D_mul is None or M * N <= 1 else D_mul
        self.W = Parameter(torch.Tensor(out_channels, in_channels // groups, self.D_mul))
        init.kaiming_uniform_(self.W, a=math.sqrt(5))

        if M * N > 1:
            # Trainable part of D starts at zero ...
            self.D = Parameter(torch.Tensor(in_channels, M * N, self.D_mul))
            init_zero = np.zeros([in_channels, M * N, self.D_mul], dtype=np.float32)
            self.D.data = torch.from_numpy(init_zero)

            # ... and a frozen identity-like D_diag is added in forward().
            eye = torch.reshape(torch.eye(M * N, dtype=torch.float32), (1, M * N, M * N))
            D_diag = eye.repeat((in_channels, 1, self.D_mul // (M * N)))
            if self.D_mul % (M * N) != 0:  # D_mul is not a multiple of M * N: zero-pad
                zeros = torch.zeros([in_channels, M * N, self.D_mul % (M * N)])
                self.D_diag = Parameter(torch.cat([D_diag, zeros], dim=2), requires_grad=False)
            else:  # D_mul is a multiple of M * N (e.g. D_mul = M * N)
                self.D_diag = Parameter(D_diag, requires_grad=False)
        # ----------------------------------------------------------------------

        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.W)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)
        else:
            self.register_parameter('bias', None)

    def extra_repr(self):
        """Conv2d-style summary; non-default options are appended only when set."""
        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        if self.padding != (0,) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1,) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        if self.padding_mode != 'zeros':
            s += ', padding_mode={padding_mode}'
        return s.format(**self.__dict__)

    def __setstate__(self, state):
        super(DOConv2d, self).__setstate__(state)
        # Restored states lacking padding_mode get the default.
        if not hasattr(self, 'padding_mode'):
            self.padding_mode = 'zeros'

    def _conv_forward(self, input, weight):
        """Apply a 2-D convolution with the composed kernel ``weight``."""
        if self.padding_mode != 'zeros':
            # Non-zero padding modes are applied explicitly with F.pad, then
            # the convolution itself runs without padding.
            return F.conv2d(F.pad(input, self._padding_repeated_twice, mode=self.padding_mode),
                            weight, self.bias, self.stride,
                            _pair(0), self.dilation, self.groups)
        return F.conv2d(input, weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)

    def forward(self, input):
        """Compose the kernel from D and W, then convolve ``input`` with it."""
        M = self.kernel_size[0]
        N = self.kernel_size[1]
        DoW_shape = (self.out_channels, self.in_channels // self.groups, M, N)
        if M * N > 1:
            # ------------------------- Compute DoW -------------------------
            # D: (in_channels, M * N, D_mul)
            D = self.D + self.D_diag
            W = torch.reshape(self.W, (self.out_channels // self.groups, self.in_channels, self.D_mul))

            # einsum outputs (out_channels // groups, in_channels, M * N),
            # which is reshaped to
            # (out_channels, in_channels // groups, M, N)
            DoW = torch.reshape(torch.einsum('ims,ois->oim', D, W), DoW_shape)
            # ----------------------------------------------------------------
        else:
            # in this case D_mul == M * N
            # reshape from
            # (out_channels, in_channels // groups, D_mul)
            # to
            # (out_channels, in_channels // groups, M, N)
            DoW = torch.reshape(self.W, DoW_shape)
        return self._conv_forward(input, DoW)


def _ntuple(n):
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))

    return parse


_pair = _ntuple(2)

## Data Vectorization classes
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [5]:
class Vocabulary(object):
    """ Class to process text and extract vocabulary for mapping

        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            mask_token (str): the MASK token to add into the Vocabulary; indicates
                a position that will not be used in updating the model's parameters
            unk_token (str): the UNK token to add into the Vocabulary
            num_token (str): the NUM token to add into the Vocabulary; every
                numeric token is mapped to its index by `lookup_token`

    """
    # Compiled once for the class instead of on every `is_number` call.
    _NUM_REGEX = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

    def __init__(self, token_to_idx=None, mask_token="<MASK>", unk_token="<UNK>", num_token='<NUM>'):
        # Work on a copy so the caller's dict is not mutated when the special
        # tokens (and later regular tokens) are added.
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)

        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._unk_token = unk_token
        self._mask_token = mask_token
        self._num_token = num_token

        # Special tokens are added in the order UNK, NUM, MASK.
        self.unk_index = self.add_token(unk_token)
        self.num_index = self.add_token(num_token)
        self.mask_index = self.add_token(self._mask_token)

    def to_serializable(self):
        """Returns a dictionary that can be serialized

        The keys match the keyword arguments of `__init__`, so the result can
        be fed back to `from_serializable`.
        """
        return {'token_to_idx': self._token_to_idx,
                'unk_token': self._unk_token,
                'mask_token': self._mask_token,
                'num_token': self._num_token,
               }

    @classmethod
    def from_serializable(cls, contents):
        """Instantiates the Vocabulary from a serialized dictionary

        An `add_unk` key, which `__init__` does not accept, is ignored.
        """
        init_kwargs = {key: value for key, value in contents.items() if key != 'add_unk'}
        return cls(**init_kwargs)

    def add_token(self, token):
        """Update mapping dicts based on the token

        Args:
            token (str): the item to add into the Vocabulary

        Returns:
            index (int): the integer corresponding to the token

        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens get the next free index.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token

        return index

    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary

        Args:
            tokens (list): a list of string tokens

        Returns:
            indices (list): a list of indices corresponding to the tokens

        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Numeric tokens (see `is_number`) map to the NUM index; unknown tokens
        map to the UNK index.

        Args:
            token (str): the token to look up

        Returns:
            index (int): the index corresponding to the token

        """
        if self.is_number(token):
            return self.num_index

        return self._token_to_idx.get(token, self.unk_index)

    def is_number(self, token):
        """Returns True if the token is an optionally signed integer or decimal, else False"""
        return bool(self._NUM_REGEX.match(token))

    def lookup_index(self, index):
        """Token associated with the index

        Args:
            index (int): the index to look up

        Returns:
            token (str): the token corresponding to the index

        Raises:
            KeyError: if the index is not in the Vocabulary

        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

class Vectorizer(object):
    """ Turns space-separated text into fixed-length index vectors using a Vocabulary

        Args:
            vocab (Vocabulary): maps words to integers

    """
    def __init__(self, vocab):
        self.vocab = vocab

    def vectorize(self, context, vector_length=-1):
        """Map a sentence to an int64 vector of vocabulary indices

        Args:
            context (str): the string of words separated by a space
            vector_length (int): forced output length; a negative value means
                "as long as the sentence"

        Returns:
            numpy.ndarray (int64): shorter sentences are left-padded with the
            vocabulary's mask index, longer ones keep only their first
            `vector_length` tokens

        """
        lookup = self.vocab.lookup_token
        token_ids = [lookup(token) for token in context.split(' ')]
        n_tokens = len(token_ids)

        if vector_length < 0:
            vector_length = n_tokens

        if vector_length < n_tokens:
            # Too long: truncate on the right.
            return np.array(token_ids[:vector_length], dtype=np.int64)

        # Fill with the mask index, then place the tokens at the end.
        padded = np.full(vector_length, self.vocab.mask_index, dtype=np.int64)
        padded[vector_length - n_tokens:] = token_ids
        return padded

    @classmethod
    def from_dataframe(cls, df):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            df(pandas.DataFrame): the target dataset; each row's `context`
                (space-separated string) and `target` attributes are read

        Returns:
            an instance of the Vectorizer

        """
        vocab = Vocabulary()
        for _, row in df.iterrows():
            vocab.add_many(row.context.split(' '))
            vocab.add_token(row.target)

        return cls(vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a Vectorizer from the dict produced by `to_serializable`"""
        return cls(vocab=Vocabulary.from_serializable(contents['vocab']))

    def to_serializable(self):
        """Return a dict holding the serialized vocabulary under the 'vocab' key"""
        return {'vocab': self.vocab.to_serializable()}

### The Dataset
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [6]:
class Dataset(Dataset):
    """ Train/test dataset wrapper serving vectorized sentences

        Args:
            train_df (pandas.DataFrame): training rows; needs a `sentence` column
            test_df (pandas.DataFrame): test rows; `sentence` and `label` are read
            vectorizer (Vectorizer): object whose `vectorize` builds index vectors
            max_length (int): fixed sequence length; a negative value means
                "length of the longest training sentence"

    """
    def __init__(self, train_df, test_df, vectorizer, max_length):
        self.train_df = train_df
        self.train_size = len(train_df)

        self.test_df = test_df
        self.test_size = len(test_df)

        self._vectorizer = vectorizer

        if max_length < 0:
            # Longest training sentence, measured in space-separated tokens.
            self._max_seq_length = max(
                len(sentence.split(" ")) for sentence in train_df.sentence
            )
        else:
            self._max_seq_length = max_length

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'test': (self.test_df, self.test_size)
        }

        # The dataset starts on the training split.
        self.set_split('train')

    @property
    def max_seq_length(self):
        """Sequence length every sentence is vectorized to"""
        return self._max_seq_length

    def set_split(self, split="train"):
        """Select which split ('train' or 'test') indexing and len() refer to"""
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def get_vectorizer(self):
        """Returns the vectorizer"""
        return self._vectorizer

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point

        Returns:
            a dictionary with the vectorized sentence (x_data) and the target
            (y_target): 0. on the train split, the row's label on the test split

        """
        row = self._target_df.iloc[index]
        context_vector = self._vectorizer.vectorize(row.sentence, self._max_seq_length)

        if self._target_split == 'train':
            y_target = 0.
        else:
            y_target = row.label

        return {'x_data': context_vector, 'y_target': y_target}

    def get_num_batches(self, batch_size):
        """Number of full batches of size `batch_size` in the current split

        Args:
            batch_size (int)

        Returns:
            number of batches in the dataset

        """
        return len(self) // batch_size
    

### Utils
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [7]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Generator wrapping a PyTorch DataLoader.

    Yields one dict per batch; every value that is a tensor is moved to
    ``device``, all other values are passed through unchanged.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    def _relocate(value):
        # Only tensors can be moved between devices.
        return value.to(device) if isinstance(value, torch.Tensor) else value

    for batch in loader:
        yield {name: _relocate(value) for name, value in batch.items()}
        
def preprocess_text(text):
    """Lower-case the text, pad ``. , ! ?`` with spaces, and collapse every
    run of other non-letter characters into a single space."""
    lowered = text.lower()
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)

def get_centroids(w2v_model, aspects_count):
    """Cluster all word vectors with K-means and return the L2-normalized
        cluster centroids; used for aspects matrix initialization

    Args:
        w2v_model: trained Word2Vec model; its `wv.vocab` keys are iterated
            and each word's vector is read through `wv[word]`
        aspects_count (int): number of clusters (one per aspect)

    Returns:
        numpy.ndarray with one unit-length centroid per row
    """
    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)

    # A plain 2-D ndarray replaces the original np.matrix: np.matrix is
    # deprecated in NumPy and rejected as estimator input by recent
    # scikit-learn releases.
    word_vectors = np.array([w2v_model.wv[word] for word in w2v_model.wv.vocab])

    km.fit(word_vectors)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix

# Attentions
The `SelfAttentionDOConv` class is taken from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction) and modified to use the layers from [DOConv GitHub](https://github.com/yangyanli/DO-Conv).

In [46]:
class SelfAttentionDOConv(nn.Module):
    """
    Convolutional multidimensional attention with DOConv layers.

    Four grouped DOConv2d layers (kernel heights 1, 3, 5 and 7, kernel width
    equal to the word-vector size) score every word position; the scores are
    averaged over kernels and aspects, passed through tanh and normalised
    with a softmax over the sentence positions.

    Args:
        wv_dim: word vector size
        maxlen: sentence max length taken into account (only reported by
            `extra_repr`; not used in the computation)
        asp_count: aspect number
    """
    def __init__(self, wv_dim, maxlen, asp_count):
        super(SelfAttentionDOConv, self).__init__()
        self.wv_dim = wv_dim

        self.maxlen = maxlen
        self.tanh = nn.Tanh()
        self.attention_softmax = torch.nn.Softmax(dim=1)
        self.asp_count = asp_count

        # groups number equal to aspect number allows keeping the number of channel
        # one aspect - one attention channel
        # Padding (k // 2, 0) keeps the sentence length; the kernel spans the
        # whole embedding width, so each position yields one value per channel.
        self.conv1_3 = DOConv2d(self.asp_count, self.asp_count, kernel_size=(3, self.wv_dim), stride=(1, 1), padding=(1, 0), dilation=(1, 1), groups=self.asp_count)
        self.conv1_5 = DOConv2d(self.asp_count, self.asp_count, kernel_size=(5, self.wv_dim), stride=(1, 1), padding=(2, 0), dilation=(1, 1), groups=self.asp_count)
        self.conv1_7 = DOConv2d(self.asp_count, self.asp_count, kernel_size=(7, self.wv_dim), stride=(1, 1), padding=(3, 0), dilation=(1, 1), groups=self.asp_count)
        self.conv1_1 = DOConv2d(self.asp_count, self.asp_count, kernel_size=(1, self.wv_dim), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=self.asp_count)

    def forward(self, input_embeddings):
        """Compute attention weights.

        Args:
            input_embeddings: tensor indexed as (batch, sentence, wv_dim)

        Returns:
            tensor (batch, sentence) whose rows sum to 1
        """
        # expand dimensions of input data: one copy (channel) per aspect
        # -> (batch, asp_count, sentence, wv_dim)
        input_embeddings = input_embeddings.unsqueeze(1).repeat(1, self.asp_count, 1, 1)
        # calculate convolutions, each (batch, asp_count, sentence, 1)
        C_1 = self.conv1_1(input_embeddings)
        C_3 = self.conv1_3(input_embeddings)
        C_5 = self.conv1_5(input_embeddings)
        C_7 = self.conv1_7(input_embeddings)

        # concatenate convolutions and take mean to give one number for each word in the sentence
        results = torch.cat([C_1, C_3, C_5, C_7], -1).mean(-1)
        # average over the aspect channels -> (batch, sentence)
        results = results.mean(1)
        # Use softmax activation to get attention probabilities
        results = self.attention_softmax(self.tanh(results))
        return results

    def extra_repr(self):
        return 'wv_dim={}, maxlen={}'.format(self.wv_dim, self.maxlen)

# The model
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [47]:
class ABAE(torch.nn.Module):
    """ The model described in the paper ``An Unsupervised Neural Attention Model for Aspect Extraction''
        by He, Ruidan and  Lee, Wee Sun  and  Ng, Hwee Tou  and  Dahlmeier, Daniel, ACL2017
        https://aclweb.org/anthology/papers/P/P17/P17-1036/.

        Based on implementation by Anton Alekseev: ''https://github.com/alexeyev/abae-pytorch''.
        Changes:
        - removed ortho regularization;
        - changed activation function;
        - embedding added in model;
        - added tanh function to attention output;
        - added encoder output flag.

        Args:
            wv_dim: word vector size
            asp_count: number of aspects
            maxlen: sentence max length taken into account
            init_aspects_matrix: None or init. matrix for aspects, one aspect
                per row (it is transposed before being stored)
            pretrained_embedding: w2v vectors
            padding_index: Mask index
            encoder_only: bool - return output after encoding
            attention_mech: name of the attention mechanism; this class only
                builds an attention module for 'DOConv'

    """
    def __init__(
        self,
        wv_dim,
        asp_count,
        maxlen,
        init_aspects_matrix,
        pretrained_embedding,
        padding_index,
        encoder_only=False,
        attention_mech = 'abae',
    ):
        super(ABAE, self).__init__()
        self.wv_dim = wv_dim
        self.asp_count = asp_count
        self.maxlen = maxlen

        self.embedding = torch.nn.Embedding.from_pretrained(
            torch.FloatTensor(pretrained_embedding),
            padding_idx=padding_index,
        )

        self.attention_mech = attention_mech
        if self.attention_mech == 'DOConv':
            self.attention = SelfAttentionDOConv(wv_dim, maxlen, asp_count) #DOConv attention

        self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
        self.softmax_aspects = torch.nn.Softmax(dim=1)
        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

        if init_aspects_matrix is None:
            # In-place initialiser; the un-suffixed `xavier_uniform` is deprecated.
            torch.nn.init.xavier_uniform_(self.aspects_embeddings)
        else:
            # Cast to float32 so the parameter matches the dtype of the
            # embedding and linear layers even if the matrix is float64.
            self.aspects_embeddings.data = torch.from_numpy(init_aspects_matrix.T).float()

        self.encoder_only = encoder_only


        # Ortho-regularisation coefficient; it is only stored here and not
        # used by any method of this class.
        self.ortho = 0.1


        print('===================================')
        print('Training/Evaluating using {}'.format(self.attention_mech))


    def get_aspects_importances(self, text_embeddings):
        """Get aspect importances

        Args:
            text_embeddings: embeddings of a batch of sentences,
                (batch, sentence, wv_dim)

        Returns:
            attention weights, aspects_importances, weighted_text_emb

        Raises:
            AssertionError: if `attention_mech` is not 'DOConv' (no attention
                module was built in that case)

        """
        # Validate before touching self.attention so an unsupported mechanism
        # fails with this message instead of an AttributeError.
        assert (self.attention_mech in ['DOConv']), "Cound not understand attention_mech: {}. It should ne: abae, cmam, XSepConv, DOConv, XSepDOConv".format(self.attention_mech)

        # compute attention scores, looking at text embeddings average
        attention_weights = self.attention(text_embeddings)
        # multiplying text embeddings by attention scores -- and summing
        # (matmul: we sum every word embedding's coordinate with attention weights)
        # squeeze(1) removes only the singleton sentence axis, so a batch of
        # size 1 keeps its batch dimension.
        weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1),  # (batch, 1, sentence)
                                         text_embeddings  # (batch, sentence, wv_dim)
                                         ).squeeze(1)

        # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
        raw_importances = self.linear_transform(weighted_text_emb)

        # computing 'aspects distribution in a sentence'
        aspects_importances = self.softmax_aspects(raw_importances)

        return attention_weights, aspects_importances, weighted_text_emb

    def forward(self, text_embeddings, negative_samples_texts):
        """Encode sentences and, unless `encoder_only`, reconstruct them.

        Args:
            text_embeddings: token indices of the sentences, (batch, sentence)
            negative_samples_texts: token indices of the negative samples;
                ignored when `encoder_only` is True

        Returns:
            (aspects_importances, attention_weights) when `encoder_only`,
            otherwise (weighted_text_emb, recovered_emb,
            averaged_negative_samples, transposed aspects_embeddings)
        """
        text_embeddings = self.embedding(text_embeddings)

        # encoding: words embeddings -> sentence embedding, aspects importances
        attention_weights, aspects_importances, weighted_text_emb = self.get_aspects_importances(text_embeddings)

        if self.encoder_only:
            return aspects_importances, attention_weights
        else:
            negative_samples_texts = self.embedding(negative_samples_texts)

            # negative samples are averaged
            averaged_negative_samples = torch.mean(negative_samples_texts, dim=1)
            averaged_negative_samples = torch.mean(averaged_negative_samples, dim=1)

            # decoding: aspects embeddings matrix, aspects_importances -> recovered sentence embedding
            # (wv_dim, asp) @ (batch, asp, 1) -> (batch, wv_dim, 1); squeeze(2)
            # drops only that trailing axis, keeping a batch of size 1 intact.
            recovered_emb = torch.matmul(self.aspects_embeddings, aspects_importances.unsqueeze(2)).squeeze(2)

            return weighted_text_emb, recovered_emb, averaged_negative_samples, self.aspects_embeddings.t()

    def get_aspect_words(self, w2v_model, topn=10):
        """Getting aspects words

        For every aspect, returns the `topn` words whose w2v vectors have the
        largest dot product with the aspect embedding.
        """
        words = []
        aspects = self.aspects_embeddings.cpu().detach().numpy()
        words_scores = w2v_model.wv.vectors.dot(aspects)

        for row in range(aspects.shape[1]):
            argmax_scalar_products = np.argsort(- words_scores[:, row])[:topn]
            words.append([w2v_model.wv.index2word[i] for i in argmax_scalar_products])

        return words
    


### Training utils
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [48]:
def set_seed_everywhere(seed, cuda):
    """Seed the NumPy and PyTorch random generators (and all CUDA devices
    when ``cuda`` is truthy) with ``seed``."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)

        
def handle_dirs(dirpath):
    """Create ``dirpath`` (including parents) unless the path already exists."""
    if os.path.exists(dirpath):
        return
    os.makedirs(dirpath)

    
def make_train_state(args):
    """Build the initial training-state dictionary.

    Reads ``learning_rate``, ``save_dir`` and ``model_state_file`` from
    ``args``; the checkpoint path is ``save_dir`` joined with
    ``model_state_file``.
    """
    checkpoint_path = os.path.join(args.save_dir, args.model_state_file)
    return dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
        train_loss=[],
        model_filename=checkpoint_path,
    )


def update_train_state(args, model, train_state):
    """Handle the training state updates

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    Args:
        args: main arguments; `early_stopping_criteria` is the number of
            consecutive non-improving epochs that triggers early stopping
        model: model to train
        train_state: a dictionary representing the training state values

    Returns:
        new train_state (the same dictionary, updated in place)

    """
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        # Only the latest loss is needed; indexing [-1] also works when a
        # single loss has been recorded (unpacking [-2:] did not).
        loss_t = train_state['train_loss'][-1]

        if loss_t >= train_state['early_stopping_best_val']:
            # Loss did not improve on the best value: count the epoch.
            train_state['early_stopping_step'] += 1
        else:
            # New best loss: checkpoint the model and reset the counter.
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

# Settings
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction), some parameters are added for new attention mechanism.

In [49]:
# Central configuration for the notebook: data paths, Word2Vec and training
# hyper-parameters, and run-time options. All paths are relative to `base`.
base = './'
args = Namespace(
    # raw data path definition
    domain = 'restaurant',
    raw_train_path=base+'data/restaurant/train.txt',
    raw_test_path=base+'data/restaurant/test.txt',
    raw_test_lab_path=base+'data/restaurant/test_label.txt',
    preprocessed_fold=base+'preprocessed_data/restaurant/',
    
    # preprocessed data path definition.
    train_data=base+'preprocessed_data/restaurant/train.txt',
    test_data=base+'preprocessed_data/restaurant/test.txt',
    test_labels=base+'preprocessed_data/restaurant/test_label.txt',
    emb_path=base+'preprocessed_data/restaurant/w2v_embedding',
    aspects=['Food', 'Staff', 'Ambience'],  # aspects for restaurant 


    #word2vec params
    window=5, 
    min_count=10, 
    emb_dim=200,
    
    vocab_size=9000,
    aspect_size=14,
    
    
    # training params  
    batch_size=50,
    epochs=15,
    neg_size=20,
    maxlen=35, # -1 means no limit, set to 15-20 for convolutional attentions

    #losses
    tripletmargin=1,

    cuda=True,
    reload_from_files=False,
    learning_rate=1e-3,
    early_stopping_criteria=5,  
    catch_keyboard_interrupt=True,
    seed=1234,
    attention_mech = 'DOConv', # 'abae', 'cmam', 'XSepConv', 'DOConv', 'XSepDOConv'

    output_dir=base+"outputs", 
    save_dir=base+"model_storage",
    model_state_file="DOConv_latest_april1.pth",


)


# Fall back to the CPU when CUDA was requested but is not available, so the
# notebook still runs on machines without a GPU.
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))
set_seed_everywhere(args.seed, args.cuda)
# Make sure every directory written to later exists.
handle_dirs(args.save_dir)
handle_dirs(args.output_dir)
handle_dirs(args.preprocessed_fold)

Using CUDA: True


# Data modification and preprocessing

### Creation of embeddings
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [50]:
import nltk
import json
import xml.etree.ElementTree as ET

import numpy as np

from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK 'stopwords' and 'wordnet' packages (a no-op when they are
# already up to date). Presumably required by the `stopwords` corpus and
# `WordNetLemmatizer` imported at the top of the notebook -- TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [16]:
# Rebuild the preprocessed corpus and the Word2Vec embeddings from the raw files
# unless `args.reload_from_files` is set, in which case this cell only announces
# that the existing files will be used.
if args.reload_from_files:
    print('Loading Preprocessing files and Word2Vec model from existing files!')
else:
    print('Preprocessing raw review sentences ...')
    preprocess(
        args,
        args.raw_train_path,
        args.raw_test_path,
        args.raw_test_lab_path,
        args.preprocessed_fold,
    )
    print('Trainig Word2Vec model!')
    word2vec(
        args.train_data,
        args.emb_path,
        args.emb_dim,
        args.window,
        args.min_count,
    )
    print('Done preprocessing!')

Preprocessing raw review sentences ...
Processing train data!
Processing test data!
Trainig Word2Vec model!
Done preprocessing!


# Training blocks (Jump from here to Evaluation)
Skip the blocks below if you only want to run the evaluation part.

### Data preprocessing (skip this block for evaluation)
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [17]:
#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)
# sentences = []
# with open(args.train_data) as fp:
#     for line in fp.readlines():
#         sentences.append(line)
# cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]
# train_df = pd.DataFrame(cleaned_sentences, columns=["sentence"])

# sentences = []
# labels = []
# with open(args.test_data) as fp:
#     for line in fp.readlines():
#         sentences.append(line)
# with open(args.test_labels) as fp:
#     for line in fp.readlines():
#         labels.append(line)
# cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]
# cleaned_labels = [preprocess_text(label.split()[0]) for label in labels]
# test_df = pd.DataFrame({'sentence': cleaned_sentences, 'label': cleaned_labels})

Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [18]:
#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)
# w2v = gensim.models.Word2Vec.load(args.emb_path)
# token2index_lim = {token: index for index, token in enumerate(w2v.wv.index2word) if index < args.vocab_size}
# token2index_all = {token: index for index, token in enumerate(w2v.wv.index2word)}
# vocab = Vocabulary(token2index_lim)
# vectorizer = Vectorizer(vocab)
# dataset = Dataset(train_df, test_df, vectorizer, max_length = args.maxlen )

### Model initialization (skip this block for evaluation)
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)


In [19]:
# model = ABAE(
#     wv_dim=args.emb_dim,
#     asp_count=args.aspect_size,
#     maxlen=dataset.max_seq_length, 
#     init_aspects_matrix=get_centroids(w2v, args.aspect_size),
#     pretrained_embedding=w2v.wv.vectors,
#     padding_index=vocab.mask_index,
#     attention_mech=args.attention_mech
# )

# model = model.to(args.device)
# # Loss funcs
# loss_func = nn.TripletMarginLoss(margin=args.tripletmargin, swap=False, reduction='mean')

# optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer=optimizer,
#     mode='min', 
#     factor=0.5,
#     patience=1
# )

# train_state = make_train_state(args)

## Model Training (skip this block for evaluation)
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [20]:
# #Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

# epoch_bar = tqdm(
#     desc='training routine', 
#     total=args.epochs,
#     position=1,
# )

# dataset.set_split('train')
# train_bar = tqdm(
#     desc='train',
#     total=dataset.get_num_batches(args.batch_size), 
#     position=1, 
# )

# y = torch.zeros(args.batch_size, 1)

# for epoch_index in range(args.epochs):
    
#     train_state['epoch_index'] = epoch_index
    
#     running_loss = 0.0
#     running_loss1, running_loss2, running_loss3 = 0.0, 0.0, 0.0
#     model.train()
    
#     batch_generator = generate_batches(
#         dataset, 
#         batch_size=args.batch_size, 
#         device=args.device
#     )
    
#     neg_batch_generator = generate_batches(
#         dataset, 
#         batch_size=args.batch_size, 
#         shuffle=False,
#         device=args.device,
#     )

#     for batch_index, batch_dict in enumerate(batch_generator):

#         optimizer.zero_grad()
        
#         x = batch_dict['x_data']
#         y = batch_dict['y_target'].float()
#         x_neg = next(neg_batch_generator)['x_data']

#         negative_samples = torch.stack(
#             tuple([x_neg[torch.randperm(x_neg.shape[0])[:args.neg_size]] 
#                    for _ in range(args.batch_size)])
#         ).to(args.device)

#         anchor, positive, negative, asp_emb = model(x, negative_samples)

#         loss = loss_func(anchor, positive, negative)       
#         loss_t = loss.item()
#         running_loss += (loss_t - running_loss) / (batch_index + 1)    
#         loss.backward()
#         optimizer.step()
        
#         train_bar.set_postfix(loss=running_loss,  epoch=epoch_index) #Ortogonal=running_loss2,
#         train_bar.update()

#     train_state['train_loss'].append(running_loss)
#     train_state = update_train_state(args=args, model=model, train_state=train_state)
#     scheduler.step(train_state['train_loss'][-1])
    
#     # uncomment the lines below to display the loss and aspects words after each training loop
#     print("epoch {}, batches {}, loss {:.5f}, and LR {}:".format(epoch_index, batch_index, running_loss, optimizer.param_groups[0]['lr']))
#     # uncomment the lines below to display aspects words after each training loop
#     for i, aspect in enumerate(model.get_aspect_words(w2v)):
#         print(i, " ".join([a for a in aspect]))
#     print()
    

#     if train_state['stop_early']:
#         break

#     train_bar.n = 0

#     epoch_bar.set_postfix(best_val=train_state['early_stopping_best_val'])
#     epoch_bar.update()

# # save aspect words
# save_aspect_words(model, args, topn=100)

# plt.figure(figsize=(10, 7))
# sns.lineplot(
#     x=[epoch + 1 for epoch in range(len(train_state['train_loss']))],
#     y=train_state['train_loss'],
#     color='coral', 
#     label='loss',
# )

# plt.xticks([epoch for epoch in range(len(train_state['train_loss']) + 1)])
# plt.show()

# Evaluation
Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [36]:
#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction), with the code rearranged
 
# --- training split: one preprocessed sentence per line of the file ---
with open(args.train_data) as fp:
    sentences = fp.readlines()
cleaned_sentences = list(map(preprocess_text, sentences))
train_df = pd.DataFrame(cleaned_sentences, columns=["sentence"])

# --- test split: sentences and their labels live in two parallel files ---
with open(args.test_data) as fp:
    sentences = fp.readlines()
with open(args.test_labels) as fp:
    labels = fp.readlines()
cleaned_sentences = list(map(preprocess_text, sentences))
# Only the first whitespace-separated token of each label line is kept.
cleaned_labels = [preprocess_text(raw_label.split()[0]) for raw_label in labels]

test_df = pd.DataFrame({'sentence': cleaned_sentences, 'label': cleaned_labels})

# --- vocabulary: token -> row index in the trained Word2Vec embedding matrix ---
w2v = gensim.models.Word2Vec.load(args.emb_path)
index2word = w2v.wv.index2word
token2index_all = {token: index for index, token in enumerate(index2word)}
# Restricted view containing only tokens whose index is below `args.vocab_size`.
token2index_lim = {
    token: index
    for index, token in enumerate(index2word)
    if index < args.vocab_size
}
vocab = Vocabulary(token2index_lim)
vectorizer = Vectorizer(vocab)
dataset = Dataset(train_df, test_df, vectorizer, max_length=args.maxlen)


Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [38]:
#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)
device_name = "cuda" if args.cuda else "cpu"
args.device = torch.device(device_name)
print("Using CUDA: {}".format(args.cuda))

# Re-create the network from the notebook settings; its weights are then
# replaced by the checkpoint loaded below.
abae_kwargs = dict(
    wv_dim=args.emb_dim,
    asp_count=args.aspect_size,
    maxlen=dataset.max_seq_length,
    init_aspects_matrix=get_centroids(w2v, args.aspect_size),
    pretrained_embedding=w2v.wv.vectors,
    padding_index=vocab.mask_index,
    attention_mech=args.attention_mech,
)
model = ABAE(**abae_kwargs)

# `make_train_state` supplies the checkpoint path under 'model_filename'.
train_state = make_train_state(args)
checkpoint = torch.load(train_state['model_filename'], map_location=args.device)
model.load_state_dict(checkpoint)
model = model.to(args.device)
loss_func = torch.nn.MSELoss(reduction="sum")
dataset.set_split('test')

# Show the words attached to every learned aspect, one aspect per line.
print('=============================================')
for aspect_id, aspect_words in enumerate(model.get_aspect_words(w2v)):
    print(aspect_id, " ".join(aspect_words))

Using CUDA: True
Training/Evaluating using DOConv
0 atmosphere cozy setting romantic lighting intimate space decor comfortable ambience
1 review experience expectation anniversary filet birthday dined bass rating reviewer
2 sauce dish tomato mushroom chicken grilled vegetable roasted shrimp flavor
3 wine fixe prix bottle selection excellent great entree beer appetizer
4 city nyc best brooklyn ny manhattan lived park york favorite
5 pork lamb lobster grilled tuna shrimp rib recommend beef tomato
6 dry fry bland tasteless salty burnt overpriced onion greasy crust
7 food cuisine price quality fare sushi value priced authentic portion
8 wall ceiling wood chair window glass white red booth lit
9 staff hostess waiter server waitstaff manager waitress bartender service rude
10 go wanted want decided someone try anyone going friend eat
11 table minute u asked seated seat reservation min told waited
12 saturday friday night weekend went sunday early afternoon reservation late
13 potato chocolat

Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [39]:
# Collect the words of every learned aspect as one space-separated string per row.
aspect_rows = []
for i, aspect in enumerate(model.get_aspect_words(w2v)):
    aspect_text = " ".join(aspect)
    print(i, aspect_text)
    aspect_rows.append(aspect_text)

# Build the frame in one go: `DataFrame.append` re-copied the whole frame on
# every iteration and no longer exists in pandas >= 2.0. This also guarantees
# the 'Aspects' column exists even when the model yields no aspects.
df1 = pd.DataFrame({'Aspects': aspect_rows})

0 atmosphere cozy setting romantic lighting intimate space decor comfortable ambience
1 review experience expectation anniversary filet birthday dined bass rating reviewer
2 sauce dish tomato mushroom chicken grilled vegetable roasted shrimp flavor
3 wine fixe prix bottle selection excellent great entree beer appetizer
4 city nyc best brooklyn ny manhattan lived park york favorite
5 pork lamb lobster grilled tuna shrimp rib recommend beef tomato
6 dry fry bland tasteless salty burnt overpriced onion greasy crust
7 food cuisine price quality fare sushi value priced authentic portion
8 wall ceiling wood chair window glass white red booth lit
9 staff hostess waiter server waitstaff manager waitress bartender service rude
10 go wanted want decided someone try anyone going friend eat
11 table minute u asked seated seat reservation min told waited
12 saturday friday night weekend went sunday early afternoon reservation late
13 potato chocolate banana mango lemon cream butter coconut creme cr

In [40]:
# Tokenise each aspect string: trim surrounding spaces, lower-case, then split
# on whitespace so that every row of 'Colum' holds a list of words.
df1['Colum'] = df1['Aspects'].apply(lambda text: text.strip(" ").lower().split())

In [42]:
# Entities that every aspect word is compared against.
entity = ["food", "staff", "ambience"]

In [43]:
# Load vectors directly from the file
model1 = KeyedVectors.load_word2vec_format('pretrained_model/GoogleNews-vectors-negative300.bin', binary=True)
# Kept as a list because other cells may read it; membership tests below go
# through the `model1.vocab` mapping instead, avoiding a linear scan per lookup.
model_vocab = list(model1.vocab.keys())

max_words = 10  # at most this many leading words of each aspect are scored

result1 = []
result3 = []      # flat list of every (similarity, entity, word) triple, all aspects
dict_scores = {}  # aspect row position -> list of (similarity, entity, word) triples

for ii, aspect_words in enumerate(df1['Colum']):
    cluster_scores = []
    # Slicing (rather than indexing 0..9) tolerates aspects with fewer words.
    for word in aspect_words[:max_words]:
        # Words unknown to the pretrained model are skipped for all entities
        # at once, so the triples always come in complete per-word groups.
        if word not in model1.vocab:
            continue
        for pos in entity:
            result = model1.similarity(pos, word)
            triple = (result, pos, word)
            result3.append(triple)
            cluster_scores.append(triple)
    dict_scores[ii] = cluster_scores

# The original cell left this scratch list empty after the loop; keep the name defined.
result4 = []

In [44]:
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when nothing was accumulated."""
    return total / count if count else float('nan')


# Mean similarity of each aspect's scored words to every entity,
# keyed by the position of the aspect in `dict_scores`.
food_dictionary = {}
staf_dictionary = {}
amb_dictionary = {}

for j, d in enumerate(dict_scores.values()):
    totals = {'food': 0, 'staff': 0, 'ambience': 0}
    counts = {'food': 0, 'staff': 0, 'ambience': 0}

    # Each entry is a (similarity, entity, word) triple. Group by the entity
    # stored in the triple rather than by its position in the list, so the
    # result does not depend on the entries arriving in strict groups of three.
    for score, name, _word in d:
        if name in totals:  # entries for any other entity are not reported here
            totals[name] += score
            counts[name] += 1

    # An aspect with no scored words yields NaN instead of a ZeroDivisionError.
    food_dictionary[j] = _mean_or_nan(totals['food'], counts['food'])
    staf_dictionary[j] = _mean_or_nan(totals['staff'], counts['staff'])
    amb_dictionary[j] = _mean_or_nan(totals['ambience'], counts['ambience'])

print("*******Food***********")
print(food_dictionary)
print("*******Staff**********")
print(staf_dictionary)
print("*******Amb***********")
print(amb_dictionary)


*******Food***********
{0: 0.1510377686470747, 1: 0.08288275175727904, 2: 0.2982382267713547, 3: 0.1790340581908822, 4: 0.07333283429034054, 5: 0.29235954619944093, 6: 0.20208619087934493, 7: 0.29501340016722677, 8: 0.059674300532788035, 9: 0.1733293980360031, 10: 0.1280304319690913, 11: 0.062248680368065835, 12: 0.05398568734526634, 13: 0.20696413442492484}
*******Staff**********
{0: 0.1036470353603363, 1: 0.09852286404930055, 2: 0.02783063028473407, 3: 0.03218751852982678, 4: 0.029503638856112957, 5: 0.05186868025921285, 6: 0.026320598064921798, 7: 0.07108102194033564, 8: 0.06529045244678855, 9: 0.28852002173662183, 10: 0.09105376675724983, 11: 0.07523254407569765, 12: 0.05683073757681996, 13: 0.013206278797588311}
*******Amb***********
{0: 0.4129987359046936, 1: 0.14091617222875358, 2: 0.17418329119682313, 3: 0.16672823280096055, 4: 0.11718497835099698, 5: 0.07533262809738517, 6: 0.16679588481783866, 7: 0.22692797444760798, 8: 0.10966069282731042, 9: 0.18574624881148338, 10: 0.08086

#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)

In [45]:
#Code from [ABAE GitHub](https://github.com/KirillKrasikov/Attention-And-Capsule-Based-Aspect-Extraction)
# ignores aspect clusters
def ignoreFunction(aspect_probs, cluster_map):
    """Pick, for every row, the best-scoring cluster that is not ignored.

    Args:
        aspect_probs: sequence of per-sample score vectors (one value per
            cluster); anything ``np.asarray`` turns into a 2-D array.
        cluster_map: mapping from cluster index to a label; clusters whose
            label is the string ``'ignore'`` are never selected.

    Returns:
        1-D integer ``np.ndarray`` with one cluster index per input row.
        An empty input yields an empty integer array.

    Raises:
        IndexError: if every cluster of a row is mapped to ``'ignore'``.
    """
    def get_label(row):
        # Walk the clusters from highest to lowest score and stop at the
        # first one that is allowed.
        for idx in np.argsort(np.array(row))[::-1]:
            if cluster_map[idx] != 'ignore':
                return idx
        raise IndexError(
            "every aspect cluster is mapped to 'ignore'; no label can be assigned"
        )

    probs = np.asarray(aspect_probs)
    if probs.size == 0:
        # np.apply_along_axis cannot handle an input without rows.
        return np.empty(0, dtype='int')

    label_ids = np.apply_along_axis(get_label, 1, probs)
    return label_ids.astype('int')


# Run the trained model over the active dataset split and collect, per sample,
# the model's first output (stored in `aspect_probs`) and the target, plus the
# second output of every batch (stored in `word_weights`).
model.eval()
model.encoder_only = True
aspect_probs = []
predictions = []
targets = []

# Start from an empty (0, max_seq_length) array so the final concatenation has
# the same shape and dtype as growing the array batch by batch, even when the
# generator yields nothing.
word_weight_chunks = [np.empty((0, dataset.max_seq_length))]

batch_generator = generate_batches(
    dataset,
    batch_size=args.batch_size,
    device=args.device,
    drop_last=False
)

with torch.no_grad():
    for batch_dict in batch_generator:

        x = batch_dict['x_data']
        y_target = batch_dict['y_target']

        y_pred, word_weights_batch = model(x, None)
        word_weight_chunks.append(word_weights_batch.cpu().numpy())

        for pred, target in zip(y_pred, y_target):
            aspect_probs.append(pred.cpu().numpy())
            targets.append(target)

# Concatenate once: doing it inside the loop re-copies every previously
# collected row on each batch.
word_weights = np.concatenate(word_weight_chunks, axis=0)

_ = ['ambience', 'food', 'miscellaneous', 'price', 'staff', 'anecdotes', 'ignore']

# Label assigned to each learned aspect cluster, listed by cluster index.
# Clusters tagged 'ignore' are never predicted (see `ignoreFunction`).
# NOTE(review): presumably chosen by hand from the aspect word lists -- confirm.
cluster_map = dict(enumerate([
    'ambience',  # 0
    'ignore',    # 1
    'food',      # 2
    'ignore',    # 3
    'ignore',    # 4
    'food',      # 5
    'food',      # 6
    'food',      # 7
    'ignore',    # 8
    'staff',     # 9
    'ignore',    # 10
    'ignore',    # 11
    'ignore',    # 12
    'food',      # 13
]))

predictions = ignoreFunction(aspect_probs, cluster_map)
y_true = targets
y_pred = [cluster_map[cluster_id] for cluster_id in predictions]

# The report is limited to the labels that were actually predicted.
report = classification_report(y_true, y_pred, digits=4, labels=np.unique(y_pred))
print(report)

              precision    recall  f1-score   support

    ambience     0.7737    0.7490    0.7611       251
        food     0.8659    0.9538    0.9077       887
       staff     0.8926    0.6847    0.7749       352

    accuracy                         0.8557      1490
   macro avg     0.8441    0.7958    0.8146      1490
weighted avg     0.8567    0.8557    0.8517      1490

