In [46]:
import gensim
from smart_open import smart_open
import numpy as np
import pandas as pd
import jieba

ANS = ['a', 'b', 'c', 'd', 'e']
WINDOW = 10
VEC_SIZE = 100

jieba.set_dictionary('../data/DICT_CK+jieba_lower')
jieba.add_word('龐燮傍謝', freq=10, tag='xx')

def normalize_vec(vec):
    mag = ((vec * vec).sum()) ** 0.5
    return vec / mag

def estimate_ans(est_sen_list, options, model, syn1neg):
    n_sen = float(len(est_sen_list))
    option_vec_idx = []
    for w in options:
        if w in model: option_vec_idx.append(model.vocab[w].index)
        else: 
            option_vec_idx.append(-1)
#             return -1
    score = [0., 0., 0., 0., 0.]
    for wlist in est_sen_list:
        arr = np.zeros(VEC_SIZE)
        for w in wlist:
            if w in model and w != u'*': arr += model[w]
        for i in range(5):
            if option_vec_idx[i] >= 0: 
#                 score[i] += np.dot(normalize_vec(arr), normalize_vec(model[model.index2word[option_vec_idx[i]]]))
                score[i] += np.dot(normalize_vec(arr), normalize_vec(syn1neg[option_vec_idx[i]]))
#                 score[i] += np.dot(arr, syn1neg[option_vec_idx[i]])
    for i in range(5):
        score[i] /= n_sen
    
    return ANS[score.index(max(score))]


def build_estimate_samples(wlist, qidx):
    temp = wlist[:]
    est_sen = []
    sen_len = len(wlist)
    for i in qidx:
        head = max(i - WINDOW, 0)
        tail = min(i + WINDOW, sen_len)
        est_sen.append(wlist[head : i] + wlist[i + 1 : tail])
    return est_sen


def jieba_preprocess(content):
    content = content.strip().lower().replace('︽⊙＿⊙︽', '龐燮傍謝')
    wlist = list(jieba.cut(content))
    qidx = []
    for i, w in enumerate(wlist):
        if w == '龐燮傍謝':
            wlist[i] = '*'
            qidx.append(i)
    return (wlist, qidx)

def format_pre_sample(row):
    no = row.no
    wlist, qidx = jieba_preprocess(row.content)
    options = [x.strip().lower() for x in list(row[['a','b','c','d','e']].values)]
    ans = row.ans
    return no, wlist, qidx, options, ans

def load_model(path, prefix):
    # input sample:
    # path = 'w2v-experiment/model/'
    # prefix = 'sk'
    model = gensim.models.Word2Vec.load_word2vec_format(path + prefix + '-syn0.bin', binary = True)
    vocab_size, vector_size = model.syn0.shape
    syn1neg = np.zeros((vocab_size, vector_size), dtype=np.float32)
    binary_len = np.dtype(np.float32).itemsize * vector_size
    with smart_open(path + prefix + '-syn1neg.bin') as fin:
        for i in range(vocab_size):
            weights = np.fromstring(fin.read(binary_len), dtype=np.float32)
            syn1neg[i] = weights
    syn1neg[0, :] = syn1neg[1 : vocab_size/2, :].mean(axis=0)
    model.syn0[0, :] = model.syn0[1 : vocab_size/2, :].mean(axis=0)
    return (model, syn1neg)





Building prefix dict from /Users/tripper/Desktop/project/pixnet-nlp/data/DICT_CK+jieba_lower ...
Loading model from cache /var/folders/4_/b_9vfbwx41g4t48jrzr0r4vm0000gn/T/jieba.uaab2b61a375629a86d8bcbcdd9e861b1.cache
Loading model cost 1.268 seconds.
Prefix dict has been built succesfully.


In [8]:
path = 'model/'
cbow_model, cbow_syn1neg = load_model(path, 'cbow')

In [44]:
pre_samples = pd.read_csv('../question_sample/pre/raw_samples.csv')

In [50]:
right = 0
wrong = 0
total = 0
wtf = 0
for index, row in pre_samples.iterrows():
    no, wlist, qidx, options, ans = format_pre_sample(row)
    est_sen_list = build_estimate_samples(wlist, qidx)
    if len(est_sen_list) > 0:
        predict = estimate_ans(est_sen_list, options, cbow_model, cbow_syn1neg)
        if(predict == ans): right += 1
        else: wrong += 1
        total += 1
    else: wtf += 1
    
print(right/total)
print(wrong/total)
print(wtf)

0.7747252747252747
0.22527472527472528
4


In [47]:
post_samples = pd.read_csv('../question_sample/official/hackathon_1000.tsv', sep='\t', names=['no','content', 'ans', 'a', 'b', 'c', 'd', 'e', 'ans_ref', 'url', 'degree'])
post_samples.head()

Unnamed: 0,no,content,ans,a,b,c,d,e,ans_ref,url,degree
0,0,臉上比較油的時候我都會習慣讓︽⊙＿⊙︽多停留個10秒左右，會比馬上就洗掉清潔得更乾淨一點唷,c,金花,蝶舞,洗面乳,蛋糕,石屋,洗面乳,http://uniquevera.pixnet.net/blog/post/32578815,easy
1,1,遊客服務中心除了可以拿地圖、納涼、放水和裝水之外，也有山藥、地瓜、地瓜蛋捲等︽⊙＿⊙︽特產伴...,a,三芝,馬力,土星,帽帽,黃博,三芝,http://bajenny.pixnet.net/blog/post/31640255,easy
2,2,"這瓶︽⊙＿⊙︽我大概聖誕節開始用, 目前用2個禮拜, 剛好都是我皮膚最疲勞狀況最不好的",b,視角,精華液,八甲,調性,香火,精華液,http://ksnancy.pixnet.net/blog/post/405460378,easy
3,3,在白米木屐館的隔壁，大家有去︽⊙＿⊙︽的話，一定要順道去貪小便宜吃好料，才會不枉此行啊～,c,工作者,代理,蘇澳,炒年糕,冰柱,蘇澳,http://k640640.pixnet.net/blog/post/27233228,easy
4,4,※墾丁大街/後壁湖/紅柴坑/墾丁民宿/墾丁飯店/︽⊙＿⊙︽小吃/墾丁景點美食相關遊記,d,小家庭,繪圖,薄紗,恆春,限定版,恆春,http://bajenny.pixnet.net/blog/post/41742487,easy


In [49]:
right = 0
wrong = 0
total = 0
wtf = 0
for index, row in post_samples.iterrows():
    no, wlist, qidx, options, ans = format_pre_sample(row)
    est_sen_list = build_estimate_samples(wlist, qidx)
    if len(est_sen_list) > 0:
        predict = estimate_ans(est_sen_list, options, cbow_model, cbow_syn1neg)
        if(predict == ans): right += 1
        else: wrong += 1
        total += 1
    else: wtf += 1
    
print(right/total)
print(wrong/total)
print(wtf)

0.7841269841269841
0.21587301587301588
0
