In [12]:
# -*- coding: utf-8 -*-
%matplotlib inline
import re, jieba
import gensim
import difflib
import numpy as np
import random
import opencc
from smart_open import smart_open
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
# Answer letters; a letter's position in this list is used as the position of
# the matching candidate in a question's option list.
ANS = ['a', 'b', 'c', 'd', 'e']
# Un-prefixed column names of the per-option frame built by generate_feature
# (the model prefix, e.g. 'cbow_' or 'sk_', is prepended there).
FEATURES = ['no', 'w_idx', 'word', 'cos_ref','cos_syn1', 'cos_syn0', 'dist_syn0', 'target']
# Context window used by build_estimate_samples around each blank.
WINDOW = 10
# Length of the zero vector that context word vectors are accumulated into;
# it should match the dimensionality of the loaded word vectors.
VEC_SIZE = 100
def simple_preprocess(content):
    """Tokenize a question and locate its blanks.

    The blank marker '︽⊙＿⊙︽' is swapped for a stand-in word before the
    stripped text is segmented with jieba. Every token equal to the stand-in
    is then rewritten to '*'.

    Returns:
        (tokens, blank_positions): the token list and the indices of the
        '*' tokens inside it.
    """
    stand_in = '龐燮傍謝'
    text = content.strip().replace('︽⊙＿⊙︽', stand_in)
    tokens = list(jieba.cut(text))
    # A blank is only found when the tokenizer returns the stand-in as one token.
    blank_positions = [pos for pos, tok in enumerate(tokens) if tok == stand_in]
    for pos in blank_positions:
        tokens[pos] = '*'
    return (tokens, blank_positions)

def normalize_vec(vec):
    """Scale a numpy vector to unit Euclidean length.

    Args:
        vec: numpy array.

    Returns:
        vec divided by its Euclidean norm. A vector whose norm is zero is
        returned as an all-zero float array of the same shape instead of
        NaNs, so that dot products built on it stay finite.
    """
    mag = ((vec * vec).sum()) ** 0.5
    if mag == 0:
        # Dividing by a zero norm would yield NaN for every component.
        return np.zeros_like(vec, dtype=float)
    return vec / mag

def build_estimate_samples(wlist, qidx, window=None):
    """Collect the context words around every blank of a tokenized question.

    Args:
        wlist: list of tokens.
        qidx: indices of the blank tokens inside wlist.
        window: context size; defaults to the module-level WINDOW.

    Returns:
        One list per entry of qidx holding the tokens before and after that
        blank (the blank itself is left out). wlist is not modified.
    """
    if window is None:
        window = WINDOW
    est_sen = []
    for i in qidx:
        head = max(i - window, 0)
        # NOTE(review): the right edge is exclusive, so up to `window` tokens
        # are taken before the blank but only `window - 1` after it. Kept
        # as-is because the recorded results were produced with this shape;
        # confirm whether a symmetric window was intended.
        tail = i + window
        est_sen.append(wlist[head : i] + wlist[i + 1 : tail])
    return est_sen

def estimate_ans(est_sen_list, options, model, syn1neg):
    """Pick the option that best fits the context of a question's blanks.

    For every context the input vectors of its known words are summed, and
    each option is scored by the cosine between that sum and the option's row
    of syn1neg. Scores are averaged over the contexts.

    Args:
        est_sen_list: list of context token lists (one per blank).
        options: candidate words.
        model: word-vector model supporting `w in model`, `model[w]` and
            `model.vocab[w].index`.
        syn1neg: 2-D array indexed by vocabulary index.

    Returns:
        The letter from ANS at the position of the best-scoring option, or
        -1 when the question cannot be scored (an option is missing from the
        vocabulary, or there are no options or no contexts).
    """
    # An option without a vector cannot be compared, so the question is skipped.
    option_vec_idx = []
    for w in options:
        if w not in model:
            return -1
        option_vec_idx.append(model.vocab[w].index)

    # Nothing to average over (e.g. no blank was found in the question).
    if not option_vec_idx or not est_sen_list:
        return -1

    # Take the dimensionality from the weights instead of a fixed constant.
    vec_size = np.shape(syn1neg)[1]
    score = [0.] * len(option_vec_idx)
    for wlist in est_sen_list:
        arr = np.zeros(vec_size)
        for w in wlist:
            if w != u'*' and w in model:
                arr += model[w]
        if not arr.any():
            # No known context word: the cosine is undefined, so this
            # context contributes nothing rather than NaN.
            continue
        context = normalize_vec(arr)
        for i, idx in enumerate(option_vec_idx):
            # Alternatives tried earlier: cosine against the option's input
            # vector, and the raw dot product with syn1neg[idx].
            score[i] += np.dot(context, normalize_vec(syn1neg[idx]))

    n_sen = float(len(est_sen_list))
    score = [s / n_sen for s in score]
    return ANS[score.index(max(score))]


def generate_feature(no, w_list, opt_list, ans, syn0_model, syn1, prefix):
    """Build the per-option feature frame for one context.

    Example input:
        w_list = ['高雄','转','144','次','自强号','1700','高雄','开','1923','到','1940','到','台北']
        opt_list = ['两用', '阿明', '员林', '碎屑', '精力']
        ans = 'c'
        prefix = 'cbow'

    Args:
        no: question number, copied into every row.
        w_list: context tokens.
        opt_list: candidate words.
        ans: answer letter; it is mapped to a word through ANS.
        syn0_model: word-vector model (`in`, `[]` and `.vocab[w].index`).
        syn1: 2-D weight array indexed by vocabulary index.
        prefix: string put in front of every column name.

    Returns:
        DataFrame with one row per option found in syn0_model and the columns
        of FEATURES, each prefixed with `prefix + '_'`. The last three feature
        columns before the target are z-scored within the frame.
    """
    # Translate the answer letter into the word it designates ('c' --> '员林').
    answer_word = opt_list[ANS.index(ans)]

    # Context representation: sum of the input vectors of the known tokens.
    context_vec = np.zeros(VEC_SIZE)
    for token in w_list:
        if token != u'*' and token in syn0_model:
            context_vec += syn0_model[token]

    rows = []
    for option in opt_list:
        if option not in syn0_model:
            # Options without a vector produce no row.
            continue
        idx = syn0_model.vocab[option].index
        in_vec = syn0_model[option]
        out_vec = syn1[idx]
        rows.append([
            no,
            idx,
            option,
            # raw dot product of the option's own two vectors
            np.dot(in_vec, out_vec),
            # cosine of the context with the option's syn1 row / input vector
            np.dot(normalize_vec(context_vec), normalize_vec(out_vec)),
            np.dot(normalize_vec(context_vec), normalize_vec(in_vec)),
            # squared Euclidean distance between context and input vector
            sum((context_vec - in_vec) ** 2),
            1 if option == answer_word else 0,
        ])

    column_names = [prefix + '_' + f for f in FEATURES]
    df = pd.DataFrame(rows, columns=column_names)
    # Standardise cos_syn1, cos_syn0 and dist_syn0 across the rows of this frame.
    scaled = column_names[4:-1]
    df[scaled] = (df[scaled] - df[scaled].mean()) / df[scaled].std()
    return df


def load_model(path, prefix):
    """Load the word vectors and the matching syn1neg weights of one model.

    Example input:
        path = 'w2v-experiment/model/'
        prefix = 'sk'

    Reads `<path><prefix>-syn0.bin` through gensim and then
    `<path><prefix>-syn1neg.bin` as consecutive float32 rows, one row per
    vocabulary entry, starting at the beginning of the file.

    Returns:
        (model, syn1neg) where syn1neg is a float32 array with the same shape
        as model.syn0.

    Raises:
        ValueError: the syn1neg file ends before all rows were read.
    """
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors;
    # the call is kept as the notebook was written — confirm against the
    # installed gensim version.
    model = gensim.models.Word2Vec.load_word2vec_format(path + prefix + '-syn0.bin', binary = True)
    vocab_size, vector_size = model.syn0.shape
    syn1neg = np.zeros((vocab_size, vector_size), dtype=np.float32)
    binary_len = np.dtype(np.float32).itemsize * vector_size
    with smart_open(path + prefix + '-syn1neg.bin') as fin:
        for i in range(vocab_size):
            chunk = fin.read(binary_len)
            if len(chunk) != binary_len:
                raise ValueError('%s-syn1neg.bin is truncated: row %d of %d is incomplete'
                                 % (prefix, i, vocab_size))
            # frombuffer replaces the deprecated binary mode of np.fromstring;
            # the assignment copies the values into syn1neg.
            syn1neg[i] = np.frombuffer(chunk, dtype=np.float32)
    return (model, syn1neg)

def load_sample(nth_line, path='question_official/hackathon_1000_cn.tsv'):
    """Read one question from the TSV file and build its context windows.

    Args:
        nth_line: number of lines to skip before the line that is read
            (0 reads the first line).
        path: tab-separated question file; column 1 holds the question text,
            column 2 the answer letter and columns 3-7 the options.

    Returns:
        (sen_list, opt_list, ans): the context token lists produced by
        build_estimate_samples, the option words and the answer letter.

    Raises:
        IndexError: the file has no line at position nth_line.
    """
    with open(path, 'rb') as f:
        for _ in range(nth_line):
            if next(f, None) is None:
                break
        line = next(f, None)
    if line is None:
        raise IndexError('%s has no line %d' % (path, nth_line))

    parse = line.decode('utf-8').split('\t')
    content = parse[1].strip()
    ans = parse[2]
    opt_list = parse[3 : 8]
    wlist, qidx = simple_preprocess(content)
    sen_list = build_estimate_samples(wlist, qidx)
    return (sen_list, opt_list, ans)

def sum_word_vec(w_list, syn0_model):
    """Add up the vectors of the tokens in w_list that syn0_model knows.

    The blank token '*' and tokens missing from the model are ignored. The
    result is an array of length VEC_SIZE (all zeros when nothing matched).
    """
    total = np.zeros(VEC_SIZE)
    for token in w_list:
        if token == u'*' or token not in syn0_model:
            continue
        total += syn0_model[token]
    return total

In [2]:
# One-time setup. The cells below use cbow_model, cbow_syn1neg, sk_model and
# sk_syn1neg, and this cell is the only place they are assigned, so it has to
# be live for the notebook to run on a fresh kernel.

# simple_preprocess only finds a blank when the tokenizer returns the stand-in
# word as a single token, so the word is registered with jieba first.
jieba.add_word('龐燮傍謝', freq=10, tag='xx')
# NOTE(review): it is unclear whether these user dictionaries were loaded for
# the recorded results; they are left disabled — confirm before enabling.
# jieba.load_userdict('data/zhwiki-cn-clean')
# jieba.load_userdict('data/dict-txt-big')
path = 'w2v-experiment/model/'
# Re-running the cell keeps models that are already in the kernel.
if 'cbow_model' not in globals():
    cbow_model, cbow_syn1neg = load_model(path, 'cbow')
if 'sk_model' not in globals():
    sk_model, sk_syn1neg = load_model(path, 'sk')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4_/b_9vfbwx41g4t48jrzr0r4vm0000gn/T/jieba.cache
Loading model cost 1.077 seconds.
Prefix dict has been built succesfully.


In [13]:
# Build the feature table: for every blank of every question, the CBOW and
# skip-gram features of each option are placed side by side.
feature_frames = []
with open('question_official/hackathon_1000_cn.tsv', 'rb') as f:
    for line in f:
        s = line.decode('utf-8')
        parse = s.split('\t')
        no = int(parse[0])
        content = parse[1].strip()
        ans = parse[2]
        opt_list = parse[3 : 8]
        ans_ref, url, level = parse[8], parse[9], parse[10]
        wlist, qidx = simple_preprocess(content)
        sen_list = build_estimate_samples(wlist, qidx)

        for w_list in sen_list:
            # Keep only the CBOW features; identifiers and target come from
            # the skip-gram frame.
            cbow_df = (
                generate_feature(no, w_list, opt_list, ans, cbow_model, cbow_syn1neg, 'cbow')
                .drop(['cbow_target', 'cbow_no', 'cbow_word'], axis=1)
                .rename(columns = {'cbow_w_idx':'w_idx'})
            )
            sk_df = (
                generate_feature(no, w_list, opt_list, ans, sk_model, sk_syn1neg, 'sk')
                .drop(['sk_w_idx'], axis=1)
                .rename(columns = {'sk_target':'target'})
            )
            # NOTE(review): the two frames are joined by row position, which
            # pairs the same option only if both models know the same options
            # (generate_feature drops options missing from its model).
            feature_frames.append(pd.concat([cbow_df, sk_df], axis = 1))

# Concatenate once instead of growing the frame inside the loop, which copies
# all previous rows on every iteration.
if feature_frames:
    overall_df = pd.concat(feature_frames, axis = 0, ignore_index=True)
else:
    overall_df = pd.DataFrame()

print(overall_df.shape)


(5726, 12)


In [23]:
# Load a single question (load_sample skips 994 lines and reads the next one)
# and show its first context window together with its candidate options.
sen_list, opt_list, ans = load_sample(994)
print(sen_list[0])
print(opt_list)



['虽然', '青年', '活动中心', '的', '建筑', '光影', '很漂亮', '，', '不过', '我们']
['暮色', '广宣', '天祥', '原址', '步道']


In [29]:
# Compare the CBOW vector of one candidate word with the summed vector of the
# sample's first context window.
w = '步道'
w_idx = cbow_model.vocab[w].index  # not used further in this cell
vec = sum_word_vec(sen_list[0], cbow_model)
print(cbow_model[w][:20])  # first 20 components of the candidate's vector
print(vec[:20])  # first 20 components of the context sum
print(sum(abs(vec - cbow_model[w])))  # sum of absolute component differences

[-0.75660098  0.44147068  0.0935495   0.05515734  0.58966267 -0.1954944
  0.23016895  0.0775224   0.67222166  0.60646921  0.42620423  0.05359171
 -0.35594633 -0.00262699 -0.39729723  0.31234422 -0.2772193   0.34083471
 -0.57637209  0.17717554]
[-0.7013362   0.3646695   0.17631839 -1.32972857  0.93278129  1.15400534
  0.82759415  0.8628505   1.23306865  1.44381509  0.46014024  0.06557306
 -1.84071061  0.12600059 -0.01049444  0.09357804 -0.6063582   1.00286718
 -0.50865764 -0.60646717]
74.351392936


In [14]:
# overall_df.tail(30)
# Write the feature table to CSV without the row index.
overall_df.to_csv('question_official/overall_df.csv', index = False)

In [None]:
# Evaluate the skip-gram model on the question file. Questions for which
# estimate_ans returns -1 are counted as skipped and excluded from the
# accuracy, as before.
right, wrong, skipped = 0, 0, 0
wrong_sen = []
with open('question_official/hackathon_1000_cn.tsv', 'rb') as f:
    for line in f:
        # Alternative kept for reference (Traditional -> Simplified conversion):
        # s = opencc.convert(line.decode('utf-8'), config = '/usr/share/opencc/tw2s.json')
        s = line.decode('utf-8')
        parse = s.split('\t')
        no = int(parse[0])
        content = parse[1].strip()
        ans = parse[2]
        options = parse[3 : 8]
        ans_ref = parse[8]
        url = parse[9]
        level = parse[10]
        wlist, qidx = simple_preprocess(content)
        est_sen = build_estimate_samples(wlist, qidx)
        pred = estimate_ans(est_sen, options, sk_model, sk_syn1neg)
        if pred == -1:
            skipped += 1
            continue
        if pred == ans:
            right += 1
        else:
            wrong += 1
            # Keep the question, the wrongly chosen word and the reference answer.
            wrong_sen.append((content, options[ANS.index(pred)], ans_ref))

answered = right + wrong
if answered:
    print(float(right)/float(answered))
else:
    # Avoid a ZeroDivisionError when every question was skipped or the file is empty.
    print('no question could be answered')


#  (['脸上', '比较', '油', '的', '时候', '我', '都', '会', '习惯', '让', '*', '多', '停留', '个', '10', 
#    '秒左右', '，', '会', '比', '马上', '就', '洗掉', '清洁', '得', '更', '干净', '一点', '唷'], [10])


### v100w10n50p10i1 : 0.764065335753176
### v100w10n50p10i10 : 0.7985480943738656
### v100w10n50p10i30 : 0.8076225045372051
### v100w20n50p10i10 : 0.79491833030853
### v80w5n50p10i30 : 0.7531760435571688
### v200w10n50p10i10 : 0.8148820326678766
### v200w10n50p10i50 : 0.7912885662431942
### v200w10n50p10i30 : 0.7586206896551724
### v100w10n50p10i10hs1 : 0.7967332123411979
### v100w10n50p1i10 : 0.79491833030853

### cbow: 0.8137404580152672
### skgram: 0.7633587786259542

In [None]:
# NOTE(review): this cell repeats the feature-building cell earlier in the
# notebook; consider keeping only one of them.
# Build the feature table: for every blank of every question, the CBOW and
# skip-gram features of each option are placed side by side.
feature_frames = []
with open('question_official/hackathon_1000_cn.tsv', 'rb') as f:
    for line in f:
        s = line.decode('utf-8')
        parse = s.split('\t')
        no = int(parse[0])
        content = parse[1].strip()
        ans = parse[2]
        opt_list = parse[3 : 8]
        ans_ref, url, level = parse[8], parse[9], parse[10]
        wlist, qidx = simple_preprocess(content)
        sen_list = build_estimate_samples(wlist, qidx)

        for w_list in sen_list:
            # Keep only the CBOW features; identifiers and target come from
            # the skip-gram frame.
            cbow_df = (
                generate_feature(no, w_list, opt_list, ans, cbow_model, cbow_syn1neg, 'cbow')
                .drop(['cbow_target', 'cbow_no', 'cbow_word'], axis=1)
                .rename(columns = {'cbow_w_idx':'w_idx'})
            )
            sk_df = (
                generate_feature(no, w_list, opt_list, ans, sk_model, sk_syn1neg, 'sk')
                .drop(['sk_w_idx'], axis=1)
                .rename(columns = {'sk_target':'target'})
            )
            # NOTE(review): the two frames are joined by row position, which
            # pairs the same option only if both models know the same options
            # (generate_feature drops options missing from its model).
            feature_frames.append(pd.concat([cbow_df, sk_df], axis = 1))

# Concatenate once instead of growing the frame inside the loop, which copies
# all previous rows on every iteration.
if feature_frames:
    overall_df = pd.concat(feature_frames, axis = 0, ignore_index=True)
else:
    overall_df = pd.DataFrame()

print(overall_df.shape)

# debug = pd.DataFrame()
# w_list = ['高雄','转','144','次','自强号','1700','高雄','开','1923','到','1940','到','台北']
# opt_list = ['两用', '阿明', '员林', '碎屑', '精力']
# ans = 'c'
# cbow_df = generate_feature(w_list, opt_list, ans, cbow_model, cbow_syn1neg, 'cbow')
# cbow_df.drop(['cbow_target'], axis=1, inplace=True)
# cbow_df.rename(columns = {'cbow_w_idx':'w_idx'}, inplace = True)
# sk_df = generate_feature(w_list, opt_list, ans, sk_model, sk_syn1neg, 'sk')
# sk_df.drop(['sk_w_idx'], axis=1, inplace=True)
# sk_df.rename(columns = {'sk_target':'target'}, inplace = True)
# x = pd.concat([cbow_df, sk_df], axis = 1)
# pd.concat([debug, x], axis = 0, ignore_index=True)
# # print(cbow_df)
# # print(sk_df)

In [None]:
# NOTE(review): `model` is not assigned at notebook level in the cells shown
# here (the other cells use cbow_model / sk_model) — confirm which model is meant.
model.most_similar(['松烟文创'])

In [None]:
# Join every vocabulary word into one space-separated string for text search.
# NOTE(review): `model` is not assigned at notebook level in the cells shown
# here — confirm which model is meant.
word_dict = ' '.join(list(model.vocab))

In [None]:
# Print every vocabulary word that contains '文创'.
# The pattern matches the whole whitespace-free token on its own. Requiring a
# whitespace character on both sides (as before) consumed the separator shared
# with the next word, so adjacent hits and the first/last word were missed.
for match in re.findall(r'\S*文创\S*', word_dict):
    print(match)
    

In [15]:
# model[model.index2word[1]]
# NOTE(review): vocab_size is only bound as a local variable inside
# load_model, so evaluating it at notebook level raises the NameError
# recorded in this cell's output.
vocab_size

NameError: name 'vocab_size' is not defined