In [20]:
!pip install numpy

Collecting numpy
  Using cached numpy-1.21.4-cp39-cp39-macosx_10_9_x86_64.whl (17.0 MB)
Installing collected packages: numpy
Successfully installed numpy-1.21.4


In [23]:
import ast
import json
import os
import pickle

import nltk
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
#from nltk.tokenize import word_tokenize, sent_tokenize
# from gensim import corpora
# from gensim.models.ldamodel import LdaModel
#nltk.download('punkt')

### Amazon review dataset specification

> data[0] <br>
    {<br>
    'asin': '0528881469',<br>
     'helpful': [0, 0],<br>
     'overall': 5.0,<br>
     'reviewText': 'We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the "trucker" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that\'s just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!',<br>
     'reviewTime': '06 2, 2013',<br>
     'reviewerID': 'AO94DHGC771SJ',<br>
     'reviewerName': 'amazdnu',<br>
     'summary': 'Gotta have GPS!',<br>
     'unixReviewTime': 1370131200<br>
    }

### Construct dataset with LDA model

In [77]:
dataset_name = 'electronics'
file_name = '{}_simple_json.json'.format(dataset_name)
# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open('./raw_data/{}'.format(file_name), 'r') as f:
    data = json.load(f)

num_aspects = 30        # number of LDA topics to train
num_top_words = 5       # NOTE(review): only referenced by commented-out code in the visible notebook
random_state_lda = 42   # seed intended for the LDA model
save_dir = './data/{}/with_lda/'.format(dataset_name)

# Unique reviewer ids and product ids (ASINs) found in the raw reviews.
users = list(set([ r['reviewerID'] for r in data ]))
items = list(set([ r['asin'] for r in data ]))

def tokenize(reviews, sent_tokenizer=None, word_tokenizer=None):
    '''
        :Tokenize every review into sentences, and every sentence into tokens
        :param reviews - iterable of review dicts; each must have a 'reviewText' string
        :param sent_tokenizer - optional callable(text) -> iterable of sentences
                                (defaults to nltk.tokenize.sent_tokenize)
        :param word_tokenizer - optional callable(sentence) -> iterable of tokens
                                (defaults to nltk.tokenize.word_tokenize)
        :return 2d list: one ['<sos>', token, ..., '<eos>'] list per sentence,
                across ALL reviews, in review order
        :raises KeyError if a review has no 'reviewText' key
    '''
    if sent_tokenizer is None or word_tokenizer is None:
        # Imported lazily: the module-level nltk.tokenize import is commented
        # out in the import cell, so these names are otherwise undefined on a
        # fresh kernel.
        from nltk.tokenize import sent_tokenize, word_tokenize
        sent_tokenizer = sent_tokenizer or sent_tokenize
        word_tokenizer = word_tokenizer or word_tokenize

    tokenized_ss = []
    # Iterate the argument (not the global `data`), and keep the sentence loop
    # nested inside the review loop so that every review contributes sentences,
    # not just the last one.
    for review in reviews:
        # Reviews are lower-cased before sentence splitting.
        for sentence in sent_tokenizer(review['reviewText'].lower()):
            tokenized_ss.append(['<sos>'] + list(word_tokenizer(sentence)) + ['<eos>'])

    return tokenized_ss

def extract_topics(data, num_words=100, passes=15):
    '''
        :Extract topics with LDA
        :param data - amazon review dataset (iterable of review dicts, forwarded to tokenize)
        :param num_words - number of top-weighted words reported per topic
        :param passes - number of LDA training passes over the corpus
        :return topic_info (e.g., [(TOPIC-IDX-0, [('early', 0.0029411765), ('apps', 0.0029411765), ... ]), (TOPIC-IDX-1, ...), ...]
                with one entry for each of the `num_aspects` trained topics

        Reads the notebook-level settings `num_aspects` (number of topics) and
        `random_state_lda` (seed, so repeated runs give the same topics).
    '''
    # Imported lazily: the module-level gensim imports are commented out in the
    # import cell, so these names are otherwise undefined on a fresh kernel.
    from gensim import corpora
    from gensim.models.ldamodel import LdaModel

    tokenized_ss = tokenize(data)

    # turn it into frequency matrix
    idx2w_dict = corpora.Dictionary(tokenized_ss) # e.g., { ..., 980: 'pleased', 981: 'purpose', ...}
    s2w_matrix = [idx2w_dict.doc2bow(s_tokens) for s_tokens in tokenized_ss]

    ldamodel = LdaModel(
        s2w_matrix,
        num_topics=num_aspects,
        id2word=idx2w_dict,
        passes=passes,
        random_state=random_state_lda,
    )

    # The former per-sentence get_document_topics() loop was removed: its
    # results were never used, so it only added one inference call per sentence.

    # For each trained topic, get the top-`num_words` words. Asking for fewer
    # topics than were trained (previously a hard-coded 12) returns only a
    # subset of them.
    topic_info = ldamodel.show_topics(num_topics=num_aspects, num_words=num_words, formatted=False)

    return topic_info

def build_topic2idx_idx2topic_from_topic(topics):
    '''
        :Build the word -> topic-id lookup and its reverse from LDA topic info
        :param topics - [(TOPIC-IDX, [(WORD, WEIGHT), ...]), ...]
        :return (word2tIdx, tIdx2word)
            word2tIdx - {WORD: TOPIC-IDX}; a word listed under several topics
                        keeps the id of the last topic that lists it
            tIdx2word - {TOPIC-IDX: WORD}; a single word per topic id (the last
                        word in word2tIdx that maps to that id)
        NOTE(review): both maps keep one value per key, so tIdx2word is not a
        full inverse of the topic -> words relation; confirm this is what the
        consumers of aspect2idx.pkl / idx2aspect.pkl expect.
    '''
    # Topic id -> its words (weights dropped).
    words_by_topic = {}
    for entry in topics:
        words_by_topic[entry[0]] = [pair[0] for pair in entry[1]]

    # Later topics overwrite earlier ones for a shared word.
    word2tIdx = {}
    for topic_id in words_by_topic:
        for token in words_by_topic[topic_id]:
            word2tIdx[token] = topic_id

    # Reverse lookup; later words overwrite earlier ones for the same topic id.
    tIdx2word = {}
    for token, topic_id in word2tIdx.items():
        tIdx2word[topic_id] = token

    return word2tIdx, tIdx2word

# Run LDA over the raw reviews and derive the word <-> topic-id lookups.
topics = extract_topics(data)
word2tIdx, tIdx2word = build_topic2idx_idx2topic_from_topic(topics)

# users to their idx
# Ids are sorted before enumeration: iterating a set of strings yields a
# different order on each kernel start, which would make the saved indices
# irreproducible between runs.
u_ids = sorted(set([ r['reviewerID'] for r in data ]))
user2idx = { u_id:idx for idx, u_id in enumerate(u_ids) }
user2idx_rev = { v:k for k, v in user2idx.items() }

# items (ASINs) to their idx
i_ids = sorted(set([ r['asin'] for r in data ]))
item2idx = { i_id:idx for idx, i_id in enumerate(i_ids) }
item2idx_rev = { idx:i_id for idx, i_id in enumerate(i_ids) }

# Make sure the output directory exists before writing into it.
os.makedirs(save_dir, exist_ok=True)
print('save_dir: {}'.format(save_dir))

# Each pickle is written inside a context manager so the handle is flushed and
# closed deterministically.
for pkl_name, pkl_obj in (
    ('aspect2idx.pkl', word2tIdx),
    ('idx2aspect.pkl', tIdx2word),
    ('user2idx.pkl', user2idx),
    ('idx2user.pkl', user2idx_rev),
    ('item2idx.pkl', item2idx),
    ('idx2item.pkl', item2idx_rev),
):
    with open(save_dir + pkl_name, 'wb') as f:
        pickle.dump(pkl_obj, f)

save_dir: ./data/electronics/with_lda/


### Pre-processing data from extracted aspects

In [82]:
dataset_name = 'electronics'
file_name = '{}_with_aspects.pkl'.format(dataset_name)
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project. The context manager closes the handle after loading.
with open('./raw_data/{}'.format(file_name), 'rb') as f:
    data = pickle.load(f)
save_dir = './data/{}/'.format(dataset_name)

# Expected record layout:
# [{'rating': 4,
#   'item': 'B000HJ75MK',
#   'user': 'A37021SSH9LGY6',
#   'sentence': [('mouse',
#     'light',
#     'lightweight mouse to take on a trip overseas',
#     1)],
#   'text': 'Mighty little mouse\nI ... '
#   }, ...
# ]
# sentence: [(ASPECT, SENTIMENT-WORD, RELATED-SENTENCE, SENTIMENT-SCORE), ...]

df_data = pd.DataFrame(data)
# One row per (review, sentence tuple); reviews without any sentence get a
# missing value in 'sentence'.
df_data_flatten = df_data.explode('sentence')

# Extract aspects: first element of every sentence tuple. As before, reviews
# with no extracted sentence contribute a single `None` aspect.
sentence_col = df_data_flatten['sentence']
aspect_set = { s[0] for s in sentence_col.dropna() }
if sentence_col.isna().any():
    aspect_set.add(None)
# Sorted (None first) so that aspect indices are the same on every run; set
# iteration order for strings changes between kernel starts.
aspects = sorted(aspect_set, key=lambda a: (a is not None, str(a)))

aspect2idx = { a:idx for idx, a in enumerate(aspects) }
idx2aspect = { idx:a for idx, a in enumerate(aspects) }

# unique() keeps first-appearance order, so these indices are already stable.
u_ids = list(df_data['user'].unique())
user2idx = { u:idx for idx, u in enumerate(u_ids)}
idx2user = { idx:u for idx, u in enumerate(u_ids)}

i_ids = list(df_data['item'].unique())
item2idx = { i:idx for idx, i in enumerate(i_ids)}
idx2item = { idx:i for idx, i in enumerate(i_ids)}

# Make sure the output directory exists before writing into it.
os.makedirs(save_dir, exist_ok=True)
print('save_dir: {}'.format(save_dir))

# Each pickle is written inside a context manager so the handle is flushed and
# closed deterministically.
for pkl_name, pkl_obj in (
    ('aspect2idx.pkl', aspect2idx),
    ('idx2aspect.pkl', idx2aspect),
    ('user2idx.pkl', user2idx),
    ('idx2user.pkl', idx2user),
    ('item2idx.pkl', item2idx),
    ('idx2item.pkl', idx2item),
):
    with open(save_dir + pkl_name, 'wb') as f:
        pickle.dump(pkl_obj, f)


save_dir: ./data/electronics/


In [75]:
# Rebuild the user <-> index lookups from the users in df_data, numbered in
# order of first appearance.
u_ids = list(df_data['user'].unique())
user2idx = dict(zip(u_ids, range(len(u_ids))))
idx2user = dict(enumerate(u_ids))



82753

In [80]:
# Show every review row whose item id contains the ASIN 'B002TA7VO2'.
item_mask = df_data['item'].str.contains('B002TA7VO2')
df_data[item_mask]

Unnamed: 0,text,rating,user,sentence,item
35700,Great SD card-FAST!\nPurchased one of these la...,5,ALM9R3A4ZGFHC,"[(cost, next, Wonder what a 32G card will cost...",B002TA7VO2
35701,For my wifes Canon camera\nShe loves it and us...,3,A3VL6OOZVAA9OR,,B002TA7VO2
35702,Works fine on Windows 7 and in my MP3 player\n...,5,A1DQIKMPS16H54,"[(card, perfect, For this kind of purpose the ...",B002TA7VO2
35703,Works very well.\nThis item works very well in...,5,ASD69N2P9WRSU,,B002TA7VO2
35704,Does the expected.\nDoes the expected. Writes ...,3,A1KMB1WVG8QM8H,,B002TA7VO2
35705,"Not the fastest, but surely a good price for t...",5,AVBHYYARHGNKF,"[(price, good, but surely a good price for the...",B002TA7VO2
35706,It's SDHC memory\nIts SDHC memory. Bought it t...,5,A21G8XF090VXN4,"[(camera, digital, Bought it to go in a digita...",B002TA7VO2


### Pre-processing user.pkl and item.pkl

In [72]:
### constructing user2idx, item2idx
### (user/item to ids of tokens from their reviews)
# for r in data:
#     r_text = r['reviewText']
#     r_user = r['reviewerID']
#     r_item = r['asin']
#     tokenized_r = word_tokenize(r_text.lower())
    
#     if r_user not in user2idx.keys():
#         user2idx[r_user] = []
        
#     if r_item not in item2idx.keys():
#         item2idx[r_item] = []
    
#     for token in tokenized_r:
#         w_idx = w2idx_dict[token]
#         user2idx[r_user].append(w_idx)
#         item2idx[r_item].append(w_idx)


# # only contains unique set of words by removing duplicates
# for u_id in list(user2idx.keys()):
#     unique_w_ids = list(set(user2idx[u_id]))
#     if len(unique_w_ids) < num_top_words:
#         del user2idx[u_id]
#         print('user {} deleted'.format(u_id))
#     else:
#         user2idx[u_id] = unique_w_ids[:num_top_words]
    
# for i_id in list(item2idx.keys()):
#     unique_w_ids = list(set(item2idx[i_id]))
#     if len(unique_w_ids) < num_top_words:
#         del item2idx[i_id]
#         print('item {} deleted'.format(i_id))
#     else:
#         item2idx[i_id] = unique_w_ids[:num_top_words]
        
# pickle.dump(user2idx, open(save_dir + 'user.pkl', 'wb'))
# pickle.dump(item2idx, open(save_dir + 'item.pkl', 'wb'))

In [47]:
# Notes-only cell: the string below describes the pickle artifacts
# (topic.pkl, topic_rev.pkl, user.pkl, item.pkl); it runs no logic.
# NOTE(review): the recorded output for this cell is a SyntaxError reported at
# line 10, but the cell has fewer lines -- the output looks stale; re-run or clear it.
'''
    a u/i review -- a sequence of words which are assigned to one of topics each
    // e.g., 
    "topic.pkl": topic2idx dictionary, including <sos>, <eos>, <unk>, <pad>, and topic labels. 
    // {TOPIC-IDX: (TOPIC-WORD-IDX1, TOPIC-WORD-IDX2, ...)} e.g., {0: (1,1002,1123, ...), }
    "topic_rev.pkl": idx2topic dictionary, the reverse of topic2idx
    "user.pkl" and "item.pkl": user2idx and item2idx dictionary
'''

SyntaxError: invalid syntax (<ipython-input-47-661e3c02d3e6>, line 10)

### Split train/val/test dataset

In [91]:
from sklearn.model_selection import train_test_split
random_state = 42
file_to_split = 'electronics_with_aspects'
split_dir = './data/electronics/'


def _write_json_lines(records, path):
    """Write `records` to `path` as JSON Lines: one json.dumps(record) per line."""
    # The context manager closes the file; no explicit f.close() is needed.
    with open(path, 'w') as f:
        for r_dict in records:
            f.write(json.dumps(r_dict))
            f.write('\n')


# Load original review dataset (100k sampled)
# Dataset is not really jsonized one -- json elements are line-separated
# with open('./raw_data/{}.json'.format(file_to_split), 'r') as f:
#     all_data = [ eval(l) for l in f.readlines() ]

# Load files for extracted aspects
with open('./raw_data/{}.json'.format(file_to_split), 'r') as f:
    all_data = json.load(f)

train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

# First carve off the held-out (val + test) share, then split that share
# between validation and test according to their relative ratios.
x_train, x_test = train_test_split(all_data, test_size=1 - train_ratio, random_state=random_state)
x_val, x_test = train_test_split(x_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=random_state)

# Every record must land in exactly one split.
assert len(x_train) + len(x_val) + len(x_test) == len(all_data)

# Save the train/test/validation splits back to file, one JSON record per line
os.makedirs(split_dir, exist_ok=True)
_write_json_lines(x_train, split_dir + 'train.json')
_write_json_lines(x_test, split_dir + 'test.json')
_write_json_lines(x_val, split_dir + 'validation.json')

In [87]:
# Reload the aspects JSON; the context manager closes the handle after parsing.
with open('./raw_data/{}.json'.format(file_to_split), 'r') as f:
    all_data = json.load(f)

In [89]:
# Number of records in all_data.
len(all_data)

118312