## Label Hotels

In [168]:
import json
import numpy as np
import gensim

from gensim.models import Word2Vec

# Dataset directory. Point `path` at '../data/amsterdam/' to work on the
# Amsterdam hotels instead of the London ones.
path = '../data/london/' # path = '../data/amsterdam/'
query_fn = path + 'hotel_queries.txt'
query_groundtruth_fn = path + 'hotel_query_groundtruth.txt'
histogram_fn = path + 'entities_with_histograms.json'
w2v_fn = path + 'word2vec.model'
idf_fn = path + 'idf.json'

# Read each input inside a `with` block so the file handle is closed as soon
# as the data is in memory rather than whenever the garbage collector runs.
with open(histogram_fn) as fin:
    entities = json.load(fin)
with open(query_fn) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_fn) as fin:
    query_attr = fin.read().splitlines()
model = Word2Vec.load(w2v_fn)
with open(idf_fn) as fin:
    idf = json.load(fin)

# The two lists are paired element by element later on, so the counts
# printed here are expected to be equal.
print(len(queries), len(query_attr))

def compress_queries(queries, query_attr):
    """Drop case-insensitive duplicate queries, keeping the first occurrence.

    Args:
        queries: iterable of query strings.
        query_attr: iterable of attributes, paired positionally with `queries`.

    Returns:
        A tuple `(new_queries, new_attrs)`. `new_queries` holds the lower-cased
        unique queries in first-seen order; `new_attrs` holds the attribute
        that accompanied the first occurrence of each query.
    """
    seen = set()
    kept_queries = []
    kept_attrs = []
    for raw_query, attr in zip(queries, query_attr):
        lowered = raw_query.lower()
        if lowered in seen:
            # a later duplicate: its attribute is discarded as well
            continue
        seen.add(lowered)
        kept_queries.append(lowered)
        kept_attrs.append(attr)
    return kept_queries, kept_attrs

# Collapse case-insensitive duplicate queries. This rebinds `queries` and
# `query_attr`, so re-running the cell works on the already-deduplicated
# (and lower-cased) lists rather than on the raw file contents.
queries, query_attr = compress_queries(queries, query_attr)
print(len(queries), len(query_attr))

190 190
154 154


In [169]:
def summary_to_str(summary):
    """Render a list of marker dicts as a single tab-separated text line.

    Args:
        summary: iterable of dicts, each with the keys 'phrase', 'size' and
            'sum_senti'. The average sentiment of a marker is
            `sum_senti / size`.

    Returns:
        A string made of the markers formatted as "<phrase> : <size>" (phrases
        lower-cased, hyphens replaced by spaces), ordered from highest to
        lowest average sentiment and joined by single spaces, followed by
        three tab-prefixed integers: the total size, the size of markers with
        average sentiment >= 0, and the size of markers with average
        sentiment < 0.
    """
    rows = [
        (m['phrase'].lower().replace('-', ' '), m['size'], m['sum_senti'] / m['size'])
        for m in summary
    ]
    # highest average sentiment first
    rows = sorted(rows, key=lambda row: -row[2])

    rendered = ' '.join('%s : %d' % (phrase, size) for phrase, size, _ in rows)

    # aggregate sizes overall and by sentiment polarity
    total = pos_cnt = neg_cnt = 0
    for _, size, senti in rows:
        total += size
        if senti >= 0:
            pos_cnt += size
        elif senti < 0:
            neg_cnt += size
    return rendered + '\t%d\t%d\t%d' % (total, pos_cnt, neg_cnt)

# Memo of already-embedded phrases, keyed by the normalized phrase text.
phrase2vec_cache = {}
def phrase2vec(phrase):
    """Embed a phrase as a unit-length, IDF-weighted sum of its word vectors.

    The phrase is lower-cased and hyphens become spaces before tokenization
    with `gensim.utils.simple_preprocess`. Tokens missing from `model.wv` are
    skipped; each remaining vector is scaled by `idf[token]` and summed.
    The sum is divided by its L2 norm unless that norm is zero.

    Results are memoized in `phrase2vec_cache`, and the cached array object
    itself is returned, so callers must not modify it in place.

    NOTE(review): the accumulator is hard-coded to 300 dimensions, which
    assumes the loaded word2vec model uses 300-d vectors — confirm.
    """
    key = phrase.lower().replace('-', ' ')
    try:
        return phrase2vec_cache[key]
    except KeyError:
        pass

    pooled = np.zeros(300)
    for token in gensim.utils.simple_preprocess(key):
        if token not in model.wv:
            continue
        # sum pooling, weighted by the token's IDF
        pooled += model.wv[token] * idf[token]

    # scale to unit length; an all-zero vector is left untouched
    length = np.linalg.norm(pooled)
    if length > 0:
        pooled /= length

    phrase2vec_cache[key] = pooled
    return pooled


def get_matches(histogram, query):
    """Count histogram phrases whose embedding is close to the query's.

    Args:
        histogram: iterable of phrase strings (iterating a dict yields its keys).
        query: the query text.

    Returns:
        A string "\\t<exact>\\t<approx>" where <exact> is the number of phrases
        whose dot product with the query vector is >= 0.9 and <approx> the
        number with dot product >= 0.6 (so <approx> includes the exact ones).
    """
    query_vec = phrase2vec(query)
    sims = [np.dot(query_vec, phrase2vec(phrase)) for phrase in histogram]
    exact = sum(1 for s in sims if s >= 0.9)
    approx = sum(1 for s in sims if s >= 0.6)
    return '\t%d\t%d' % (exact, approx)
    
# Build the labeling sheet: one TSV row for every (query, entity) pair in which
# the entity has both a summary and a histogram for the query's attribute.
num_lines = 0
with open('to_be_labeled.tsv', 'w') as fout:
    fout.write('bid\tattr\tquery\tsummary\ttotal\tpos_cnt\tneg_cnt\texact_match\tsim_match\n')
    for query, attr in zip(queries, query_attr):
        for bid, entity in entities.items():
            # skip entities lacking a summary for this attribute
            if 'summaries' not in entity or attr not in entity['summaries']:
                continue
            # skip entities lacking a histogram for this attribute
            if 'histogram' not in entity or attr not in entity['histogram']:
                continue
            summary = entity['summaries'][attr]
            summary_line = summary_to_str(summary) + get_matches(entity['histogram'][attr], query)
            fout.write('%s\t%s\t%s\t' % (bid, attr, query) + summary_line + '\n')
            num_lines += 1

print(num_lines)

29100


In [170]:
import csv

def read_results_and_dump(input_fn, output_fn):
    """Convert a labeled CSV file into a JSON list of labels.

    Args:
        input_fn: path of a CSV file with a header row containing at least the
            columns 'bid', 'attr', 'query' and 'label'.
        output_fn: path of the JSON file to write (overwritten if present).

    The output is a JSON array with one `[bid, attr, query, label]` entry per
    CSV row, where label is 'yes' when the CSV value is exactly '1' and 'no'
    for anything else (including an empty cell).

    Raises:
        KeyError: if a required column is missing from the CSV header.
    """
    # newline='' is what the csv module asks for, so that newlines embedded in
    # quoted fields are passed through to the reader unchanged.
    with open(input_fn, newline='') as fin:
        labels = [
            [row['bid'], row['attr'], row['query'],
             'yes' if row['label'] == '1' else 'no']
            for row in csv.DictReader(fin)
        ]

    # Write through a context manager so the file is flushed and closed before
    # returning instead of relying on garbage collection of an anonymous handle.
    with open(output_fn, 'w') as fout:
        json.dump(labels, fout)

# Convert the hand-labeled CSV into `labels.json` inside the directory that
# `path` currently points to.
label_fn = '../data/london_labeled.csv'
read_results_and_dump(label_fn, path + 'labels.json')

## Evaluate interpreter

In [213]:
import sys
import importlib.util
# Load ../opine.py directly from its file path and bind the resulting module
# object to `opinedb`.
spec = importlib.util.spec_from_file_location("opine", "../opine.py")
opinedb = importlib.util.module_from_spec(spec)
# Run the module's top-level code so its definitions exist on `opinedb`.
spec.loader.exec_module(opinedb)


def accuracy(list1, list2):
    """Return the fraction of positions at which `list1` and `list2` agree.

    Args:
        list1: predicted values; its length defines how many positions are
            compared.
        list2: reference values, indexed at the same positions as `list1`.

    Returns:
        A float in [0, 1]. An empty `list1` yields 0.0 instead of raising
        ZeroDivisionError.

    Raises:
        IndexError: if `list2` is shorter than `list1`.
    """
    N = len(list1)
    if N == 0:
        # nothing to score; avoid dividing by zero
        return 0.0
    matches = sum(list1[i] == list2[i] for i in range(N))
    return matches / N

# run w2v
def run_w2v():
    """Interpret every query with `fallback_threshold=-1.0` and print accuracy.

    Reads the globals `simple_opine`, `queries` and `query_groundtruth`.
    NOTE(review): a threshold of -1.0 presumably forces the word2vec path of
    `simple_opine.interpret` — confirm against opine.py.
    """
    simple_opine.clear_cache()
    predicted = []
    for q in queries:
        attr, phrase = simple_opine.interpret(q, fallback_threshold=-1.0)
        # The similarity is only consumed by the debug print below.
        sim = simple_opine.cosine(simple_opine.phrase2vec(q),
                                  simple_opine.phrase2vec(phrase))
        # print('%s\t%s\t%s\t%f' % (q, attr, phrase, sim))
        predicted.append(attr)

    print('w2v acc : ', accuracy(predicted, query_groundtruth))

# run cooc
def run_cooc():
    """Interpret every query with `simple_opine.cooc` and print the accuracy.

    Reads the globals `simple_opine`, `queries` and `query_groundtruth`.
    A query for which the interpreter returns no attribute contributes `None`
    to the predictions.
    """
    simple_opine.clear_cache()
    predicted = []
    for q in queries:
        attr, phrase = simple_opine.cooc.interpret(q)
        if attr is not None:
            # The similarity is only consumed by the debug print below.
            sim = simple_opine.cosine(simple_opine.phrase2vec(q),
                                      simple_opine.phrase2vec(phrase))
            # print('%s\t%s\t%s\t%f' % (q, attr, phrase, sim))
        predicted.append(attr)

    print('cooc acc : ', accuracy(predicted, query_groundtruth))

# run combined
def run_combined():
    """Interpret every query with `fallback_threshold=0.8` and report results.

    Reads the globals `simple_opine`, `queries` and `query_groundtruth`.
    Prints each predicted attribute on its own line, then the accuracy.
    """
    simple_opine.clear_cache()
    predicted = []
    for q in queries:
        attr, phrase = simple_opine.interpret(q, fallback_threshold=0.8)
        # The similarity is only consumed by the debug print below.
        sim = simple_opine.cosine(simple_opine.phrase2vec(q),
                                  simple_opine.phrase2vec(phrase))
        # print('%s\t%s\t%s\t%f' % (q, attr, phrase, sim))
        predicted.append(attr)

    # one predicted attribute per line, in query order
    for label in predicted:
        print(label)

    print('combined acc : ', accuracy(predicted, query_groundtruth))

def run_hotel_examples():
    """Print the attribute and marker chosen for three sample hotel queries.

    Each query is interpreted with `simple_opine.cooc.interpret`, and the
    resulting (attribute, phrase) pair is passed to `simple_opine.get_marker`.
    """
    example_queries = (
        "for our anniversary",
        "multiple eating options",
        "kid friendly hotel",
    )
    for text in example_queries:
        attr, phrase = simple_opine.cooc.interpret(text)
        marker = simple_opine.get_marker(attr, phrase)
        print(attr, ':', marker)


# Restaurant (Toronto) configuration. This rebinds `path`, `histogram_fn`,
# `idf_fn` and `queries`, which the hotel-labeling cells also assign, so those
# cells see these values if they are re-run afterwards.
path = '../data/toronto/'
histogram_fn = path + 'small_entities_with_histograms.json'
extraction_fn = path + 'small_restaurant_reviews_with_extractions.json'
sentiment_fn = path + 'sentiment.json'
word2vec_fn = path + 'word2vec.model'
idf_fn = path + 'idf.json'
query_label_fn = path + 'labels.json'
selected_bids = '../data/raw_jp_restaurants.json'
query_path = path + 'restaurant_queries.txt'
query_groundtruth_path = path + 'restaurant_query_groundtruth.txt'


# Read the query files inside `with` blocks so the handles are closed promptly.
with open(query_path) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_path) as fin:
    query_groundtruth = fin.read().splitlines()
simple_opine = opinedb.SimpleOpine(histogram_fn, extraction_fn, sentiment_fn, word2vec_fn, idf_fn, query_label_fn, selected_bids)

# run interpreters
run_w2v()
run_cooc()
run_combined()

phrase model score = 0.840404
marker model score = 0.775758
w2v acc :  0.07567567567567568
cooc acc :  0.07567567567567568
general
general
food
drink
general
food
vibe
crowd
vibe
staff
vibe
food
vibe
food
general
general
crowd
menu
food
price
menu
staff
food
menu
vibe
crowd
food
food
price
price
food
vibe
vibe
menu
staff
food
vibe
vibe
delivery
vibe
menu
drink
portion
vibe
menu
group
menu
food
vibe
vibe
food
general
menu
vibe
general
food
food
menu
general
menu
menu
menu
food
drink
general
vibe
drink
staff
vibe
crowd
food
menu
menu
food
food
staff
food
food
price
menu
vibe
table
general
crowd
delivery
vibe
food
food
food
menu
vibe
price
food
drink
vibe
staff
food
general
staff
crowd
menu
vibe
crowd
food
vibe
food
price
delivery
menu
general
food
general
staff
food
staff
crowd
price
table
vibe
food
staff
menu
vibe
food
crowd
food
food
price
food
food
vibe
food
food
vibe
delivery
food
vibe
food
food
staff
food
crowd
general
vibe
general
crowd
staff
group
staff
crowd
table
vibe
group
menu

In [217]:
# Re-read the query and ground-truth files, then score all three interpreters
# again. The files are read inside `with` blocks so the handles are closed.
with open(query_path) as fin:
    queries = fin.read().splitlines()
with open(query_groundtruth_path) as fin:
    query_groundtruth = fin.read().splitlines()
run_w2v()
run_cooc()
run_combined()

w2v acc :  0.6810810810810811
cooc acc :  0.5027027027027027
general
general
food
drink
general
food
vibe
crowd
vibe
staff
vibe
food
vibe
food
general
general
crowd
menu
food
price
menu
staff
food
menu
vibe
crowd
food
food
price
price
food
vibe
vibe
menu
staff
food
vibe
vibe
delivery
vibe
menu
drink
portion
vibe
menu
group
menu
food
vibe
vibe
food
general
menu
vibe
general
food
food
menu
general
menu
menu
menu
food
drink
general
vibe
drink
staff
vibe
crowd
food
menu
menu
food
food
staff
food
food
price
menu
vibe
table
general
crowd
delivery
vibe
food
food
food
menu
vibe
price
food
drink
vibe
staff
food
general
staff
crowd
menu
vibe
crowd
food
vibe
food
price
delivery
menu
general
food
general
staff
food
staff
crowd
price
table
vibe
food
staff
menu
vibe
food
crowd
food
food
price
food
food
vibe
food
food
vibe
delivery
food
vibe
food
food
staff
food
crowd
general
vibe
general
crowd
staff
group
staff
crowd
table
vibe
group
menu
food
general
crowd
staff
vibe
food
food
staff
menu
staff
vibe

In [251]:
# Probe a single query: interpret it, then print the chosen attribute, its
# marker, the matched phrase and the query/phrase cosine similarity.
# Alternative probe (disabled):
# attr, phrase = simple_opine.cooc.interpret("tasty food")
simple_opine.clear_cache()
query = 'Vegan'
attr, phrase = simple_opine.interpret(query, fallback_threshold=-1.0)
vec1 = simple_opine.phrase2vec(query)
vec2 = simple_opine.phrase2vec(phrase)
sim = simple_opine.cosine(vec1, vec2)
# Alternative probe (disabled):
# attr, phrase = simple_opine.interpret("kid friendly")

marker = simple_opine.get_marker(attr, phrase)
print(attr, ':', marker, phrase, sim)

menu : different menu different menu options 0.29660155259061627


In [252]:
import json
# Persist the in-memory entities to 'small_entities_with_histograms.json'
# under `path`. The file is written through a context manager so it is flushed
# and closed before any later cell reads it.
with open(path + 'small_entities_with_histograms.json', 'w') as fout:
    json.dump(simple_opine.entities, fout)

In [254]:
# Persist the in-memory reviews to
# 'small_restaurant_reviews_with_extractions.json' under `path`. The file is
# written through a context manager so it is flushed and closed on completion.
with open(path + 'small_restaurant_reviews_with_extractions.json', 'w') as fout:
    json.dump(simple_opine.reviews, fout)

In [None]:
# Inspect the value stored in `simple_opine.idf` for the word 'find'.
simple_opine.idf['find']

In [187]:
# Sanity check: compare the IDF-weighted, sum-pooled vectors of
# "good location" and "great location".
# Word vectors are read through `model.wv[...]`; indexing the Word2Vec model
# object directly is deprecated in gensim 3.x and removed in gensim 4.
vec1 = simple_opine.model.wv['good'] * simple_opine.idf['good'] + simple_opine.model.wv['location'] * simple_opine.idf['location']
vec2 = simple_opine.model.wv['great'] * simple_opine.idf['great'] + simple_opine.model.wv['location'] * simple_opine.idf['location']

# Scale both to unit length so the dot product below is the cosine similarity.
vec1 /= np.linalg.norm(vec1)
vec2 /= np.linalg.norm(vec2)

print(np.dot(vec1, vec2))

0.9752834


  """Entry point for launching an IPython kernel.
  


In [246]:
# Show the words most similar to 'pizza' according to the word2vec vectors,
# as a quick qualitative check of the embedding.
simple_opine.model.wv.most_similar('pizza')

  if np.issubdtype(vec.dtype, np.int):


[('pizzas', 0.6918311715126038),
 ('pepperoni', 0.664528489112854),
 ('margherita', 0.6505460143089294),
 ('truff', 0.5642903447151184),
 ('napoli', 0.5414730310440063),
 ('pizzeria', 0.5330373644828796),
 ('crust', 0.5302197933197021),
 ('flatbread', 0.5285623669624329),
 ('funghi', 0.525039792060852),
 ('quattro', 0.5236132144927979)]