In [237]:
import numpy as np
import os
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split

In [238]:
# load the json file into a list
with open('./acsa-restaurant-large/acsa_train.json','rb') as f:
    data1 = json.load(f)

with open('./acsa-restaurant-large/acsa_test.json','rb') as f:
    data2 = json.load(f)

sentence_data = [x['sentence'] for x in data1] + [x['sentence'] for x in data2]
aspect_data = [x['aspect'] for x in data1] + [x['aspect'] for x in data2]
sentiment_data = [x['sentiment'] for x in data1] + [x['sentiment'] for x in data2]

sentence_data, aspect_data, sentiment_data = shuffle(sentence_data, aspect_data, sentiment_data)

# print('Number of sentences: ', len(sentence_data))
# print('Number of aspects: ', len(aspect_data))
# print('Number of sentiments: ', len(sentiment_data))


In [239]:
data_words = {}
for example in sentence_data:
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
        if word not in data_words:
            data_words[word] = 0
        else:
            data_words[word] += 1

In [240]:
aspect_categories = {}
for example in aspect_data:
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
        if word not in aspect_categories:
            aspect_categories[word] = 0
        else:
            aspect_categories[word] += 1

In [241]:
print(len(data_words))
print(len(aspect_categories))

6959
8


In [242]:
# Load glove vectors
glove_folder = os.path.join(os.getcwd(), 'glove_file')

# get path of glove.6B.300d.txt file in test folder
glove_file = os.path.join(glove_folder, 'glove.6B.300d.txt')

def load_glove_vectors(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
            
    with open(glove_file, 'r', encoding="utf8") as f:
        embs = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]

            if curr_word in data_words:
                try:
                    embedding = np.array([float(value) for value in line[1:]])
                    embs[curr_word] = embedding
                except:
                    print('error loading embedding')
    return words, word_to_vec_map, embs

glove_words, glove_word_to_vec_map, data_word_to_vec_map = load_glove_vectors(glove_file)
# print(len(words))
# print(len(word_to_vec_map))
# print(word_to_vec_map['the'])
# print(word_to_vec_map['the'].shape)


In [243]:
def get_aspect_catogories_glove_embedding(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        embs = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]

            if curr_word in aspect_categories:
                try:
                    embedding = np.array([float(value) for value in line[1:]])
                    embs[curr_word] = embedding
                except:
                    print('error loading embedding')
    return embs

aspect_catogories_to_vec_map = get_aspect_catogories_glove_embedding(glove_file)

In [244]:
unknown_word_vector = np.mean(list(glove_word_to_vec_map.values()), axis=0)

In [245]:
print(len(data_words))
print(len(data_word_to_vec_map))
missing_words = len(data_words) - len(data_word_to_vec_map)
print(missing_words)

6959
5449
1510


In [246]:
print(len(aspect_categories))
print(len(aspect_catogories_to_vec_map))
missing_aspect_categories_words = len(aspect_catogories_to_vec_map) - len(aspect_categories)
print(missing_aspect_categories_words)

8
8
0


In [247]:
embedding_matrix = []
idx2word = []
word2idx = {}
embedding_matrix.append(np.zeros(300)) # this will be our zero padding for the network
idx2word.append('')
word2idx[''] = 0
for i, (word, emb) in enumerate(data_word_to_vec_map.items()):
    embedding_matrix.append(emb)
    idx2word.append(word)
    word2idx[word] = i + 1
    # word2idx[word] = i
embedding_matrix = np.asarray(embedding_matrix)

In [248]:
ac_embedding_matrix = []
ac_idx2word = []
ac_word2idx = {}
# ac_embedding_matrix.append(np.zeros(300)) # this will be our zero padding for the network
# ac_idx2word.append('')
# ac_word2idx[''] = 0
for i, (word, emb) in enumerate(aspect_catogories_to_vec_map.items()):
    ac_embedding_matrix.append(emb)
    ac_idx2word.append(word)
    # ac_word2idx[word] = i + 1
    ac_word2idx[word] = i
ac_embedding_matrix = np.asarray(ac_embedding_matrix)

In [249]:
x_train = []
for example in sentence_data:
    temp = []
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
        if word in word2idx:
            temp.append(word2idx[word])
    # if len(temp) == 0:
    #     print(example)
    x_train.append(temp)

In [250]:
ac_train = []
for example in aspect_data:
    temp = []
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
        if word in ac_word2idx:
            temp.append(ac_word2idx[word])
    # if len(temp) == 0:
    #     print(example)
    ac_train.append(temp)

In [251]:
x_train = np.asarray(x_train, dtype=object)

In [252]:
ac_train = np.asarray(ac_train, dtype=object)

In [253]:
print(x_train.shape)

(7091,)


In [254]:
print(ac_train.shape)

(7091, 1)


In [255]:
max_length = 0
min_length = 1000
for example in x_train:
    if len(example) > max_length:
        max_length = len(example)
    if len(example) < min_length:
        min_length = len(example)

In [256]:
print(max_length)
print(min_length)

64
0


In [257]:
total_length = 0
for i in range(len(x_train)):
    total_length += len(x_train[i])
avg_length = total_length / len(x_train)
print(avg_length)

13.753631363700466


In [258]:
for i in range(len(x_train)):
    x_train[i] = np.pad(x_train[i], (max_length - len(x_train[i]), 0), 'constant')

In [259]:
x_train_data = []
for x in x_train:
    x_train_data.append([k for k in x])

x_train_data = np.array(x_train_data)


In [260]:
ac_train_data = []
for x in ac_train:
    ac_train_data.append([k for k in x])

ac_train_data = np.array(ac_train_data)


In [261]:
print(embedding_matrix.shape)
print(len(data_word_to_vec_map))
print(len(data_words))
print(np.array(sentence_data).shape)
print(np.array(aspect_data).shape)
print(ac_embedding_matrix.shape)

(5450, 300)
5449
6959
(7091,)
(7091,)
(8, 300)


In [262]:
print(x_train_data.shape)
print(ac_train_data.shape)

(7091, 64)
(7091, 1)


In [263]:
print(embedding_matrix.shape)

(5450, 300)


In [264]:
class CNN_Gate_Aspect_Text(nn.Module):
    def __init__(self, embedding_matrix, class_num, kernel_num, kernel_sizes, aspect_matrix):
        super(CNN_Gate_Aspect_Text, self).__init__()
        # self.args = args
        
        V = embedding_matrix.shape[0]
        D = embedding_matrix.shape[1]
        C = class_num
        A = aspect_matrix.shape[0]

        Co = kernel_num
        Ks = kernel_sizes

        self.embed = nn.Embedding(V, D)
        self.embed.load_state_dict({'weight': torch.tensor(embedding_matrix)})
        # self.embed.weight = nn.Parameter(embedding_matrix, requires_grad=True)
        self.embed.weight.requires_grad = True

        self.aspect_embed = nn.Embedding(A, aspect_matrix.shape[1])
        self.aspect_embed.load_state_dict({'weight':  torch.tensor(aspect_matrix)})
        # self.aspect_embed.weight = nn.Parameter(aspect_matrix, requires_grad=True)
        self.aspect_embed.weight.requires_grad = True

        self.convs1 = nn.ModuleList([nn.Conv1d(D, Co, K) for K in Ks])
        self.convs2 = nn.ModuleList([nn.Conv1d(D, Co, K) for K in Ks])

        self.fc1 = nn.Linear(len(Ks)*Co, C)
        self.fc_aspect = nn.Linear(aspect_matrix.shape[1], Co)

    def forward(self, feature, aspect):
        feature = self.embed(feature)  # (N, L, D)
        aspect_v = self.aspect_embed(aspect)  # (N, L', D)
        aspect_v = aspect_v.sum(1) / aspect_v.size(1)

        x = [F.tanh(conv(feature.transpose(1, 2))) for conv in self.convs1]  # [(N,Co,L), ...]*len(Ks)
        y = [F.relu(conv(feature.transpose(1, 2)) + self.fc_aspect(aspect_v).unsqueeze(2)) for conv in self.convs2]
        x = [i*j for i, j in zip(x, y)]

        # pooling method
        x0 = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N,Co), ...]*len(Ks)
        x0 = [i.view(i.size(0), -1) for i in x0]

        x0 = torch.cat(x0, 1)
        logit = self.fc1(x0)  # (N,C)
        return logit, x, y


In [265]:
sentiments = {}

# get unique sentiments in sentiment data
for sentiment in sentiment_data:
    if sentiment not in sentiments:
        sentiments[sentiment] = 1
    else:
        sentiments[sentiment] += 1

print(sentiments)

sentiment_input = []
for sentiment in sentiment_data:
    if sentiment == 'positive':
        sentiment_input.append(2)
    elif sentiment == 'negative':
        sentiment_input.append(0)
    else:
        sentiment_input.append(1)

sentiment_input = np.array(sentiment_input)
print(sentiment_input.shape)

{'negative': 1878, 'positive': 4215, 'neutral': 998}
(7091,)


In [266]:
# dataset = CustomImageDataset(x_train_data, labels)

train_length = int(len(sentence_data) * 0.8) # 80% training data, 20% test data
test_length = len(sentence_data) - train_length

# print(x_train_data.shape)

# concatenate the x_train_data, ac_train_data and sentiment_input
x_train_data = np.concatenate((x_train_data, ac_train_data), axis=1)
# print(x_train_data.shape)
x_train_data = np.concatenate((x_train_data, sentiment_input.reshape(-1, 1)), axis=1)

# print(x_train_data.shape)



# print(len(x_train_dataloader) * batch_size)
# print(len(y_test_dataloader) * batch_size)

In [267]:
# split x_train_data into training and test data using train_test_split
x_train, x_test = train_test_split(x_train_data, test_size=0.2, random_state=42)

print(x_train.shape)

(5672, 66)


In [271]:
batch_size = 32
# x_train = torch.tensor(x_train.astype('float64')).to(torch.int64)
train_batches = DataLoader(x_train, batch_size=batch_size, shuffle=True)

# for data in train_batches:
#     # print(data)
#     # break
# # convert data to int tensor
#     # data = data.to(torch.int64)
#     print(data.shape)
#     print(data[:, :-2].shape)
#     break

torch.Size([32, 66])
torch.Size([32, 64])


In [273]:
def train():

    loss_function = nn.CrossEntropyLoss()
    model = CNN_Gate_Aspect_Text(embedding_matrix, 3, 100, [3, 4, 5], ac_embedding_matrix)
    # model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(10):
        for i, data in enumerate(train_batches):
            sentence = data[:, :-2]
            aspect = data[:, -2]
            sentiment = data[:, -1]

            # x = x.to(device)
            # y = y.to(device)
            # optimizer.zero_grad()
            model.zero_grad()
            # model.zero_grad()
            logit, x, y = model(sentence, aspect)
            loss = loss_function(logit, y)
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print('Epoch: ', epoch, 'Batch: ', i, 'Loss: ', loss.item())
    return model

model = train()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x32 and 300x100)