## Label Hotels or Restaurants

In [12]:
import json
import numpy as np
import gensim

from gensim.models import Word2Vec

#### hotels
# path = '../data/london/' # path = '../data/amsterdam/'
# query_fn = path + 'hotel_queries.txt'
# query_groundtruth_fn = path + 'hotel_query_groundtruth.txt'
# histogram_fn = path + 'entities_with_histograms.json'
# w2v_fn = path + 'word2vec.model'
# idf_fn = path + 'idf.json'

# entities = json.load(open(histogram_fn))
# queries = open(query_fn).read().splitlines()
# query_attr = open(query_groundtruth_fn).read().splitlines()
# model = Word2Vec.load(w2v_fn)
# idf = json.load(open(idf_fn))
# print(len(queries), len(query_attr))
####

#### restaurants
# Input locations for the Toronto restaurant data set.
path = '../data/toronto/'
query_fn = path + 'restaurant_queries.txt'
query_groundtruth_fn = path + 'restaurant_query_groundtruth.txt'
histogram_fn_lp = path + 'lp_entities_with_histograms.json'
histogram_fn_jp = path + 'jp_entities_with_histograms.json'
w2v_fn = path + 'word2vec.model'
idf_fn = path + 'idf.json'

# Every file is read inside a context manager so its handle is closed
# deterministically instead of being left to the garbage collector.
with open(histogram_fn_lp) as fin:
    entities = json.load(fin)
with open(histogram_fn_jp) as fin:
    entities_jp = json.load(fin)
# Merge the two entity sets; when a business id occurs in both files the
# entry from the "lp" file is kept.
for bid in entities_jp:
    if bid not in entities:
        entities[bid] = entities_jp[bid]

# One query per line; the ground-truth file is read the same way and is later
# paired with the queries positionally via zip().
with open(query_fn) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_fn) as fin:
    query_attr = fin.read().splitlines()
model = Word2Vec.load(w2v_fn)
with open(idf_fn) as fin:
    idf = json.load(fin)
####

def compress_queries(queries, query_attr):
    """Lower-case the queries and drop case-insensitive duplicates.

    The two inputs are walked in lockstep. The first occurrence of each
    lower-cased query is kept together with the attribute at the same
    position; later repeats (and their attributes) are discarded.

    Returns:
        A ``(queries, attributes)`` pair of equally long lists, in first-seen
        order, where every query is lower-cased.
    """
    seen = set()
    kept_queries = []
    kept_attrs = []
    for raw_query, attr in zip(queries, query_attr):
        lowered = raw_query.lower()
        if lowered in seen:
            continue
        seen.add(lowered)
        kept_queries.append(lowered)
        kept_attrs.append(attr)
    return kept_queries, kept_attrs

# Collapse case-insensitive duplicate queries (keeping each one's first
# attribute) and report how many queries / attributes remain.
queries, query_attr = compress_queries(queries=queries, query_attr=query_attr)
print(len(queries), len(query_attr))

165 165


In [13]:
def summary_to_str(summary):
    """Render a list of summary markers as one tab-separated text fragment.

    Args:
        summary: iterable of dicts, each with the keys ``'phrase'`` (str),
            ``'size'`` (count) and ``'sum_senti'`` (summed sentiment).

    Returns:
        ``"<phrase> : <size> <phrase> : <size> ...\\t<total>\\t<pos>\\t<neg>"``
        where the markers are ordered by average sentiment (highest first),
        ``total`` is the sum of all sizes, ``pos`` the summed size of markers
        whose average sentiment is >= 0 and ``neg`` the summed size of those
        whose average sentiment is < 0.
    """
    marker_senti = []
    for marker in summary:
        phrase = marker['phrase'].lower().replace('-', ' ')
        size = marker['size']
        # An empty marker has no average sentiment; treat it as neutral (0.0)
        # instead of failing with ZeroDivisionError.
        senti = marker['sum_senti'] / size if size else 0.0
        marker_senti.append((phrase, size, senti))

    # Highest average sentiment first; the sort is stable, so ties keep
    # their input order.
    marker_senti.sort(key=lambda ms: -ms[2])
    summary_line = ' '.join('%s : %d' % (phrase, size) for phrase, size, _ in marker_senti)

    # Aggregate counts appended as three extra tab-separated columns.
    sum_size = sum(size for _, size, _ in marker_senti)
    pos_cnt = sum(size for _, size, senti in marker_senti if senti >= 0)
    neg_cnt = sum(size for _, size, senti in marker_senti if senti < 0)
    return summary_line + '\t%d\t%d\t%d' % (sum_size, pos_cnt, neg_cnt)

# Memo of already-embedded phrases, keyed by the normalized phrase text.
phrase2vec_cache = {}
def phrase2vec(phrase):
    """Embed a phrase as the unit-length, idf-weighted sum of its word vectors.

    The phrase is lower-cased and hyphens are turned into spaces before it is
    tokenized with ``gensim.utils.simple_preprocess``. Tokens missing from the
    module-level word2vec ``model`` are skipped; every remaining word vector
    is scaled by ``idf[token]`` and summed. The result is L2-normalized unless
    it is the zero vector (no token was in the vocabulary).

    Results are memoized in ``phrase2vec_cache``; the cached array itself is
    returned, so callers must not modify it in place.

    Raises:
        KeyError: if a token is in the model vocabulary but absent from ``idf``.
    """
    phrase = phrase.lower().replace('-', ' ')
    cached = phrase2vec_cache.get(phrase)
    if cached is not None:
        return cached

    # Size the accumulator from the loaded model rather than assuming
    # 300-dimensional embeddings, so any word2vec model can be used.
    res = np.zeros(model.wv.vector_size)
    # sum pooling
    for w in gensim.utils.simple_preprocess(phrase):
        if w in model.wv:
            res += model.wv[w] * idf[w]

    # normalize (an all-zero vector is left untouched)
    norm = np.linalg.norm(res)
    if norm > 0:
        res /= norm
    phrase2vec_cache[phrase] = res
    return res


def get_matches(histogram, query):
    """Count the histogram phrases that resemble ``query`` at two thresholds.

    Each phrase obtained by iterating ``histogram`` is embedded with
    ``phrase2vec`` and compared with the embedded query through a dot product.

    Returns:
        ``"\\t<n_exact>\\t<n_approx>"`` where ``n_exact`` counts phrases with
        similarity >= 0.9 and ``n_approx`` counts phrases with similarity
        >= 0.6 (so every "exact" phrase is also counted as approximate).
    """
    query_vec = phrase2vec(query)
    similarities = [np.dot(query_vec, phrase2vec(candidate)) for candidate in histogram]
    exact_count = sum(1 for score in similarities if score >= 0.9)
    approx_count = sum(1 for score in similarities if score >= 0.6)
    return '\t%d\t%d' % (exact_count, approx_count)
    
# Write one TSV row per (query, entity) pair for which the entity has both a
# summary and a histogram for the query's ground-truth attribute, counting the
# rows as they are written.
num_lines = 0
with open('to_be_labeled.tsv', 'w') as fout:
    fout.write('bid\tattr\tquery\tsummary\ttotal\tpos_cnt\tneg_cnt\texact_match\tsim_match\n')
    for query, attr in zip(queries, query_attr):
        for bid, entity in entities.items():
            # Skip entities that lack a summary or a histogram for this attribute.
            if 'summaries' not in entity or attr not in entity['summaries']:
                continue
            if 'histogram' not in entity or attr not in entity['histogram']:
                continue
            summary = entity['summaries'][attr]
            # Summary columns followed by the two match-count columns.
            summary_line = summary_to_str(summary) + get_matches(entity['histogram'][attr], query)
            fout.write('%s\t%s\t%s\t%s\n' % (bid, attr, query, summary_line))
            num_lines += 1

print(num_lines)

34935


In [14]:
import csv

def read_results_and_dump(input_fn, output_fn):
    """Convert a labeled CSV file into a JSON list of label records.

    Args:
        input_fn: path of a CSV file with a header row that contains at least
            the columns ``bid``, ``attr``, ``query`` and ``label``.
        output_fn: path of the JSON file to write (overwritten if present).

    The output is a JSON array with one ``[bid, attr, query, label]`` entry
    per CSV row, in file order, where ``label`` is ``"yes"`` when the CSV
    value is exactly ``"1"`` and ``"no"`` for anything else.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # newline='' is what the csv module expects, so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(input_fn, newline='') as fin:
        labels = [
            [row['bid'], row['attr'], row['query'], 'yes' if row['label'] == '1' else 'no']
            for row in csv.DictReader(fin)
        ]

    # Close the output explicitly so the data is flushed before returning.
    with open(output_fn, 'w') as fout:
        json.dump(labels, fout)

# Turn the labeled CSV into labels.json inside the current data directory.
label_fn = '../data/toronto_labeled.csv'
read_results_and_dump(input_fn=label_fn, output_fn=path + 'labels.json')

## Evaluate interpreter

In [6]:
import sys
import importlib.util
# Load ../opine.py by file path (it is not importable as a package from here)
# and expose it as `opinedb`. The module object is created from the spec
# first and its top-level code is only run by exec_module().
spec = importlib.util.spec_from_file_location("opine", "../opine.py")
opinedb = importlib.util.module_from_spec(spec)
spec.loader.exec_module(opinedb)


def accuracy(list1, list2):
    """Return the fraction of positions at which ``list1`` and ``list2`` agree.

    Args:
        list1: predicted values; its length is the denominator.
        list2: reference values; must be at least as long as ``list1``
            (any extra trailing items are ignored).

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``list1`` is empty.

    Raises:
        IndexError: if ``list2`` is shorter than ``list1``.
    """
    total = len(list1)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(list2) < total:
        raise IndexError('list2 has %d items but list1 has %d' % (len(list2), total))
    correct = sum(a == b for a, b in zip(list1, list2))
    return correct / total

# run w2v
def run_w2v(verbose=False):
    """Interpret every query with ``fallback_threshold=-1.0`` and print the accuracy.

    Uses the module-level ``simple_opine``, ``queries`` and
    ``query_groundtruth``. The interpreter cache is cleared first. The
    predicted attribute of each query is compared with the ground truth at the
    same position and the resulting accuracy is printed as ``w2v acc``.

    NOTE(review): presumably a threshold of -1.0 means the fallback in
    ``interpret`` never triggers, i.e. only the word2vec path is used;
    confirm against opine.py.

    Args:
        verbose: when True, also print one
            ``query<TAB>attr<TAB>phrase<TAB>similarity`` line per query. The
            query/phrase similarity is only computed in that case, since
            nothing else uses it.
    """
    simple_opine.clear_cache()
    attributes = []
    for query in queries:
        attr, phrase = simple_opine.interpret(query, fallback_threshold=-1.0)
        if verbose:
            vec1 = simple_opine.phrase2vec(query)
            vec2 = simple_opine.phrase2vec(phrase)
            sim = simple_opine.cosine(vec1, vec2)
            print('%s\t%s\t%s\t%f' % (query, attr, phrase, sim))
        attributes.append(attr)

    print('w2v acc : ', accuracy(attributes, query_groundtruth))

# run cooc
def run_cooc(verbose=False):
    """Interpret every query with ``simple_opine.cooc`` and print the accuracy.

    Uses the module-level ``simple_opine``, ``queries`` and
    ``query_groundtruth``. The interpreter cache is cleared first. The
    predicted attribute of each query (which may be ``None``) is compared with
    the ground truth at the same position and the resulting accuracy is
    printed as ``cooc acc``.

    Args:
        verbose: when True, also print one debug line per query:
            ``query<TAB>attr<TAB>phrase<TAB>similarity`` when an attribute was
            found, otherwise ``query<TAB>None``. The query/phrase similarity
            is only computed in that case, since nothing else uses it.
    """
    simple_opine.clear_cache()
    attributes = []
    for query in queries:
        attr, phrase = simple_opine.cooc.interpret(query)
        if verbose:
            if attr is not None:
                vec1 = simple_opine.phrase2vec(query)
                vec2 = simple_opine.phrase2vec(phrase)
                sim = simple_opine.cosine(vec1, vec2)
                print('%s\t%s\t%s\t%f' % (query, attr, phrase, sim))
            else:
                print('%s\t%s' % (query, attr))
        attributes.append(attr)

    print('cooc acc : ', accuracy(attributes, query_groundtruth))

# run combined
def run_combined(verbose=False):
    """Interpret every query with ``fallback_threshold=0.8`` and print the results.

    Uses the module-level ``simple_opine``, ``queries`` and
    ``query_groundtruth``. The interpreter cache is cleared first. After all
    queries are interpreted, each predicted attribute is printed on its own
    line, followed by the accuracy against the ground truth as
    ``combined acc``.

    Args:
        verbose: when True, also print one
            ``query<TAB>attr<TAB>phrase<TAB>similarity`` line per query while
            interpreting. The query/phrase similarity is only computed in that
            case, since nothing else uses it.
    """
    simple_opine.clear_cache()
    attributes = []
    for query in queries:
        attr, phrase = simple_opine.interpret(query, fallback_threshold=0.8)
        if verbose:
            vec1 = simple_opine.phrase2vec(query)
            vec2 = simple_opine.phrase2vec(phrase)
            sim = simple_opine.cosine(vec1, vec2)
            print('%s\t%s\t%s\t%f' % (query, attr, phrase, sim))
        attributes.append(attr)

    for attr in attributes:
        print(attr)

    print('combined acc : ', accuracy(attributes, query_groundtruth))

def run_hotel_examples():
    """Print the interpreted attribute and marker for three sample hotel queries.

    Each query is interpreted with ``simple_opine.cooc.interpret``; the
    resulting attribute/phrase pair is resolved with
    ``simple_opine.get_marker`` and printed as ``<attr> : <marker>``.
    """
    example_queries = (
        "for our anniversary",
        "multiple eating options",
        "kid friendly hotel",
    )
    for text in example_queries:
        attr, phrase = simple_opine.cooc.interpret(text)
        print(attr, ':', simple_opine.get_marker(attr, phrase))


# Input files used to build the interpreter and to evaluate it.
path = '../data/toronto/'
histogram_fn = path + 'jp_entities_with_histograms.json'
extraction_fn = path + 'jp_restaurant_reviews_with_extractions.json'
sentiment_fn = path + 'sentiment.json'
word2vec_fn = path + 'word2vec.model'
idf_fn = path + 'idf.json'
query_label_fn = path + 'labels.json'
selected_bids = '../data/raw_jp_restaurants.json'
query_path = path + 'restaurant_queries.txt'
query_groundtruth_path = path + 'restaurant_query_groundtruth.txt'


# One query / ground-truth attribute per line; both files are read inside a
# context manager so their handles are closed right away. The two lists are
# compared position by position by the run_* functions.
with open(query_path) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_path) as fin:
    query_groundtruth = fin.read().splitlines()
simple_opine = opinedb.SimpleOpine(histogram_fn, extraction_fn, sentiment_fn, word2vec_fn, idf_fn, query_label_fn, selected_bids)

# run interpreters
run_w2v()
run_cooc()
run_combined()

phrase model score = 0.840404
marker model score = 0.838384
w2v acc :  0.745945945945946
cooc acc :  0.654054054054054
crowd
food
food
general
group
menu
vibe
crowd
vibe
staff
vibe
food
vibe
food
general
group
crowd
menu
menu
food
menu
staff
food
menu
vibe
crowd
food
general
vibe
price
general
location
vibe
food
staff
food
vibe
vibe
vibe
price
staff
drink
vibe
vibe
menu
group
menu
food
vibe
vibe
food
vibe
menu
vibe
group
general
vibe
menu
general
menu
menu
menu
food
drink
menu
vibe
drink
staff
vibe
crowd
food
menu
menu
food
staff
crowd
food
general
price
crowd
crowd
menu
vibe
crowd
general
food
vibe
food
food
menu
vibe
price
food
drink
food
staff
general
vibe
food
crowd
menu
food
crowd
table
vibe
general
price
delivery
food
general
food
general
food
food
staff
crowd
price
vibe
vibe
food
crowd
food
food
food
crowd
food
food
price
food
food
vibe
food
food
vibe
delivery
food
general
food
price
vibe
food
crowd
group
vibe
menu
crowd
staff
group
staff
crowd
vibe
vibe
food
menu
vibe
general
c

In [11]:
# Reload the queries and their ground-truth attributes (one per line) before
# re-running the three interpreters; context managers close the files.
with open(query_path) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_path) as fin:
    query_groundtruth = fin.read().splitlines()

# NOTE: disabled variant of run_cooc() that also prints every predicted
# attribute; kept for reference only.
# def run_cooc():
#     simple_opine.clear_cache()
#     attributes = []
#     for query in queries:
#         attr, phrase = simple_opine.cooc.interpret(query)
#         if attr != None:
#             vec1 = simple_opine.phrase2vec(query)
#             vec2 = simple_opine.phrase2vec(phrase)
#             sim = simple_opine.cosine(vec1, vec2)
#             # print('%s\t%s\t%s\t%f' % (query, attr, phrase, sim))
#         else:
#             pass
#             # print('%s\t%s' % (query, attr))
#         attributes.append(attr)

#     for attr in attributes:
#         print(attr)
#     print('cooc acc : ', accuracy(attributes, query_groundtruth))

run_w2v()
run_cooc()
run_combined()

w2v acc :  0.8432432432432433
cooc acc :  0.6864864864864865
crowd
vibe
food
general
group
menu
vibe
crowd
vibe
staff
vibe
food
crowd
food
general
group
crowd
menu
menu
food
menu
staff
food
menu
vibe
crowd
food
general
location
price
general
location
vibe
food
staff
food
vibe
vibe
vibe
price
staff
drink
crowd
vibe
menu
group
menu
food
vibe
vibe
food
vibe
menu
crowd
group
general
vibe
menu
food
menu
menu
menu
food
drink
menu
vibe
drink
staff
vibe
crowd
food
menu
menu
food
staff
crowd
food
general
price
crowd
crowd
menu
vibe
crowd
general
food
vibe
food
food
menu
vibe
price
price
drink
food
staff
general
vibe
food
crowd
menu
food
crowd
location
vibe
general
price
delivery
food
general
food
general
menu
food
staff
crowd
price
vibe
vibe
food
crowd
food
vibe
food
crowd
food
price
price
food
food
vibe
food
food
vibe
delivery
food
general
food
price
vibe
food
crowd
group
vibe
menu
crowd
staff
group
staff
crowd
vibe
vibe
food
menu
vibe
general
crowd
staff
vibe
food
drink
staff
menu
staff
food


In [62]:
# Scratch cell: interpret a single query with the cooc interpreter and show
# the attribute, its marker, the matched phrase and the query/phrase
# similarity. Set `query` to the text to inspect before running.
# attr, phrase = simple_opine.cooc.interpret("tasty food")
simple_opine.clear_cache()
query = ''
attr, phrase = simple_opine.cooc.interpret(query, debug=False)
# attr, phrase = simple_opine.interpret(query)
# NOTE(review): elsewhere in this file cooc.interpret() results are checked
# for attr being None before phrase2vec() is called; no such check is made
# here — confirm this is safe for the query being inspected.
vec1 = simple_opine.phrase2vec(query)
vec2 = simple_opine.phrase2vec(phrase)
sim = simple_opine.cosine(vec1, vec2)
# attr, phrase = simple_opine.interpret("kid friendly")

marker = simple_opine.get_marker(attr, phrase)
print(attr, ':', marker, phrase, sim)

vibe : quiet place romantic dinners 0.8485417098190925


In [82]:
def ext_in_review(ext, text):
    """Tell whether every token of an extraction occurs in ``text``.

    The tokens are the space-separated pieces of ``ext['entity']`` followed by
    those of ``ext['predicate']``. Each token is checked with a plain
    substring test against ``text`` (not a whole-word match).

    Returns:
        True if all tokens are found, False as soon as one is missing.
    """
    tokens = ext['entity'].split(' ') + ext['predicate'].split(' ')
    return all(token in text for token in tokens)

# Re-home extractions whose tokens do not all occur in their own review text.
# For each such extraction the neighbouring reviews at distance 1..19 are
# probed (at every distance the later neighbour first, then the earlier one);
# the extraction is appended to the first neighbour whose text contains all of
# its tokens and is dropped from the data if no neighbour matches.
# Extractions that do match their own review stay where they are.
# NOTE(review): `reviews` is not defined in the visible cells — presumably the
# list loaded from a *_reviews_with_extractions.json file; confirm.
for (i, review) in enumerate(reviews):
    text = review['text']
    # Extractions that remain attached to this review.
    new_extractions = []
    for ext in review['extractions']:
        if not ext_in_review(ext, text):
            found = False
            for dist in range(1, 20):
                for sign in [1, -1]:
                    # Stay inside the list bounds, then test the neighbour's text.
                    if i + dist * sign >= 0 and i + dist * sign < len(reviews) and \
                       ext_in_review(ext, reviews[i + dist * sign]['text']):
                        # Mutates the neighbour in place; a later neighbour will
                        # re-check this extraction when its own turn comes.
                        reviews[i + dist*sign]['extractions'].append(ext)
                        found = True
                        break
                if found:
                    break
        else:
            new_extractions.append(ext)
    # Replace the list only after the scan so iteration above is not disturbed.
    review['extractions'] = new_extractions

In [93]:
# Persist the re-homed reviews; the context manager guarantees the file is
# flushed and closed once the dump completes.
with open(path + 'restaurant_reviews_with_extractions.json', 'w') as fout:
    json.dump(reviews, fout)

In [5]:
import json

def get_subset(entity_fn, histogram_fn, extraction_fn, output_histogram_fn, output_extraction_fn):
    """Restrict the histogram and review files to a selected set of businesses.

    Args:
        entity_fn: JSON file holding a list of objects, each with a
            ``'business_id'`` key; these ids define the subset.
        histogram_fn: JSON file holding an object keyed by business id.
        extraction_fn: JSON file holding a list of review objects, each with a
            ``'business_id'`` key.
        output_histogram_fn: destination for the histogram entries of the
            selected ids (written in the order the ids appear in ``entity_fn``).
        output_extraction_fn: destination for the reviews whose business id is
            selected (original order preserved).

    Raises:
        KeyError: if a selected business id has no entry in ``histogram_fn``.

    All files are opened through context managers so they are closed (and the
    outputs flushed) before the function returns.
    """
    with open(entity_fn) as fin:
        bids = [entity['business_id'] for entity in json.load(fin)]

    with open(histogram_fn) as fin:
        entities = json.load(fin)
    entities = {bid: entities[bid] for bid in bids}
    with open(output_histogram_fn, 'w') as fout:
        json.dump(entities, fout)

    # Set membership keeps the review filter linear in the number of reviews.
    selected = set(bids)
    with open(extraction_fn) as fin:
        reviews = [review for review in json.load(fin) if review['business_id'] in selected]
    with open(output_extraction_fn, 'w') as fout:
        json.dump(reviews, fout)

# Build the "lp" subset: keep only the businesses listed in
# raw_lp_restaurants.json from the full histogram and review files.
path = '../data/toronto/'
histogram_fn = path + 'entities_with_histograms.json'
extraction_fn = path + 'restaurant_reviews_with_extractions.json'
output_histogram_fn = path + 'lp_entities_with_histograms.json'
output_extraction_fn = path + 'lp_restaurant_reviews_with_extractions.json'

get_subset(
    entity_fn='../data/raw_lp_restaurants.json',
    histogram_fn=histogram_fn,
    extraction_fn=extraction_fn,
    output_histogram_fn=output_histogram_fn,
    output_extraction_fn=output_extraction_fn,
)