In [1]:
# import torch
import numpy as np
import os
import pandas as pd
import json

In [2]:
# load the json file into a list
with open('./acsa-restaurant-large/acsa_train.json','rb') as f:
    data = json.load(f)

sentence_data = [x['sentence'] for x in data]
aspect_data = [x['aspect'] for x in data]
sentiment_data = [x['sentiment'] for x in data]

# print('Number of sentences: ', len(sentence_data))
# print('Number of aspects: ', len(aspect_data))
# print('Number of sentiments: ', len(sentiment_data))


In [3]:
data_words = {}
for example in sentence_data:
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
            if word not in data_words:
                data_words[word] = 0
            else:
                data_words[word] += 1

In [5]:
# Load glove vectors
glove_folder = os.path.join(os.getcwd(), 'glove_file')

# get path of glove.6B.300d.txt file in test folder
glove_file = os.path.join(glove_folder, 'glove.6B.300d.txt')

def load_glove_vectors(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
            
    with open(glove_file, 'r', encoding="utf8") as f:
        embs = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]

            if curr_word in data_words:
                try:
                    embedding = np.array([float(value) for value in line[1:]])
                    embs[curr_word] = embedding
                except:
                    print('error loading embedding')
    return words, word_to_vec_map, embs

glove_words, glove_word_to_vec_map, data_word_to_vec_map = load_glove_vectors(glove_file)
# print(len(words))
# print(len(word_to_vec_map))
# print(word_to_vec_map['the'])
# print(word_to_vec_map['the'].shape)


In [7]:
unknown_word_vector = np.mean(list(glove_word_to_vec_map.values()), axis=0)

In [6]:
# print(len(data_words))
# print(len(data_word_to_vec_map))
missing_words = len(data_words) - len(data_word_to_vec_map)
print(missing_words)

330


In [8]:
embedding_matrix = []
idx2word = []
word2idx = {}
embedding_matrix.append(np.zeros(300)) # this will be our zero padding for the network
idx2word.append('')
word2idx[''] = 0
for i, (word, emb) in enumerate(data_word_to_vec_map.items()):
    embedding_matrix.append(emb)
    idx2word.append(word)
    word2idx[word] = i + 1
    # word2idx[word] = i
embedding_matrix = np.asarray(embedding_matrix)

In [24]:
x_train = []
for example in sentence_data:
    temp = []
    for word in example.split():
        if word[-1] in ['.',',','!','?']:
            word = word[:-1]
        if word in word2idx:
            temp.append(word2idx[word])
    # if len(temp) == 0:
    #     print(example)
    x_train.append(temp)

ciao bella
just awsome.
buon appetito!
enjoy!!!
bon appetite!!
kudos haru
they're terrific!!
maybe tomorrow ;-)
food-awesome.
such a disappointment...
a real dissapointment.
love al di la
the bestt!
its alright
impressed...
unhygienic


In [26]:
x_train = np.asarray(x_train, dtype=object)

In [27]:
# print(x_train.shape)

(4665,)

In [28]:
max_length = 0
min_length = 1000
for example in x_train:
    if len(example) > max_length:
        max_length = len(example)
    if len(example) < min_length:
        min_length = len(example)

In [29]:
# print(max_length)
# print(min_length)

45
0


In [31]:
total_length = 0
for i in range(len(x_train)):
    total_length += len(x_train[i])
avg_length = total_length / len(x_train)

In [32]:
for i in range(len(x_train)):
    x_train[i] = np.pad(x_train[i], (max_length - len(x_train[i]), 0), 'constant')

In [33]:
x_train_data = []
for x in x_train:
    x_train_data.append([k for k in x])

x_train_data = np.array(x_train_data)


In [36]:
# print(x_train_data.shape)