In [108]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from os import listdir
from os.path import isfile, join
import re
import jieba
import gensim
from smart_open import smart_open
from sklearn.externals import joblib
import xgboost as xgb

# Answer letters, in the same order as the candidate words of a question;
# generate_feature uses ANS.index(letter) to look up a word in its option list.
ANS = ['a', 'b', 'c', 'd', 'e']
# Column-name suffixes of the per-option feature table. generate_feature
# prepends a model prefix to each and leaves out the last entry ('target').
FEATURES = ['no', 'w_idx', 'word', 'cos_ref','cos_syn1', 'cos_syn0', 'dist_syn0', 'target']
# Context radius used by build_estimate_samples when slicing tokens around a blank.
WINDOW = 10
# Length of the summed context vector built in generate_feature.
# NOTE(review): assumed to equal the dimensionality of the loaded word vectors - confirm.
VEC_SIZE = 100

def simple_preprocess(content):
    """Tokenize a question and locate its blanks.

    Every blank marker in ``content`` (the full form '︽⊙＿⊙︽' or the shorter
    '︽⊙＿⊙') is swapped for a placeholder word, the text is segmented with
    jieba, and each token equal to the placeholder is rewritten to '*'.

    Args:
        content: question text; surrounding whitespace is stripped first.

    Returns:
        A tuple ``(wlist, qidx)``: the token list with blanks shown as '*',
        and the list of token positions at which those blanks sit.
    """
    placeholder = '龐燮傍謝'
    text = content.strip()
    # Replace the long marker before the short one: the short form is a prefix
    # of the long form, and a question may mix both spellings.
    for pat in ('︽⊙＿⊙︽', '︽⊙＿⊙'):
        text = text.replace(pat, placeholder)

    # NOTE(review): a blank is only detected when jieba emits the placeholder
    # as one token; a disabled jieba.add_word(...) call elsewhere in this
    # notebook suggests the word may need registering - confirm.
    wlist = list(jieba.cut(text))
    qidx = []
    for i, w in enumerate(wlist):
        if w == placeholder:
            wlist[i] = '*'
            qidx.append(i)
    return (wlist, qidx)

def normalize_vec(vec):
    """Scale ``vec`` to unit Euclidean length.

    Args:
        vec: numpy array.

    Returns:
        A new array equal to ``vec`` divided by its magnitude. When the
        magnitude is exactly zero the result is an all-zero array rather than
        the NaNs a division by zero would produce.
    """
    mag = ((vec * vec).sum()) ** 0.5
    if mag == 0:
        # Nothing to scale; dividing by 1.0 still returns a fresh float array.
        return vec / 1.0
    return vec / mag

def build_estimate_samples(wlist, qidx, window = None):
    """Collect the context tokens around each blank.

    Args:
        wlist: token list of the whole question.
        qidx: positions in ``wlist`` that hold a blank.
        window: context radius; ``None`` (the default) uses the module-level
            ``WINDOW``.

    Returns:
        One list per entry of ``qidx`` containing the tokens
        ``wlist[i - window : i]`` followed by ``wlist[i + 1 : i + window]``,
        both clipped to the bounds of ``wlist``. The blank itself is excluded.
    """
    if window is None:
        window = WINDOW
    sen_len = len(wlist)
    est_sen = []
    for i in qidx:
        head = max(i - window, 0)
        # NOTE(review): the right-hand slice stops at i + window (exclusive),
        # so it holds at most window - 1 tokens while the left side holds up
        # to window. Kept as-is because downstream models may have been
        # fitted on features built this way - confirm before changing.
        tail = min(i + window, sen_len)
        est_sen.append(wlist[head : i] + wlist[i + 1 : tail])
    return est_sen

def generate_feature(no, w_list, opt_list, syn0_model, syn1, prefix, ans = None):
    """Build the per-option feature table for one context window.

    Example inputs:
        w_list = ['高雄','转','144','次','自强号','1700','高雄','开','1923','到','1940','到','台北']
        opt_list = ['两用', '阿明', '员林', '碎屑', '精力']
        ans = 'c'
        prefix = 'cbow'

    Args:
        no: question identifier, copied into every row.
        w_list: context tokens. '*' entries and tokens not contained in
            ``syn0_model`` contribute nothing.
        opt_list: candidate words; one output row is produced per candidate.
        syn0_model: word-vector model supporting ``w in model``, ``model[w]``
            and ``model.vocab[w].index``.
        syn1: second weight matrix, looked up by vocabulary index
            (``syn1[w_idx]``).
        prefix: text prepended (followed by '_') to every column name.
        ans: optional answer letter from ``ANS``. It is resolved to the
            corresponding candidate word but not used afterwards, because no
            target column is produced.

    Returns:
        A DataFrame with the columns ``<prefix>_no``, ``<prefix>_w_idx``,
        ``<prefix>_word``, ``<prefix>_cos_ref``, ``<prefix>_cos_syn1``,
        ``<prefix>_cos_syn0`` and ``<prefix>_dist_syn0``. The last three are
        z-scored across the candidates. A candidate missing from
        ``syn0_model`` is reported on stdout and gets the row
        ``[no, 0, '</s>', 0, 0, 0, 0]``.
    """
    if ans: ans = opt_list[ANS.index(ans)]  # ans: 'c' --> '员林'

    # Sum of the vectors of all known context words.
    hidd_vec = np.zeros(VEC_SIZE)
    for w in w_list:
        if w in syn0_model and w != u'*': hidd_vec += syn0_model[w]

    feats = []
    for w in opt_list:
        if w in syn0_model:
            w_idx = syn0_model.vocab[w].index
            # Raw dot product of the candidate's own two vectors (not normalised).
            cos_ref = np.dot(syn0_model[w], syn1[w_idx])
            # Cosine between the context sum and each of the candidate's vectors.
            cos_syn1 = np.dot(normalize_vec(hidd_vec), normalize_vec(syn1[w_idx]))
            cos_syn0 = np.dot(normalize_vec(hidd_vec), normalize_vec(syn0_model[w]))
            # Squared Euclidean distance to the un-normalised context sum.
            dist_syn0 = sum((hidd_vec - syn0_model[w]) ** 2)
            feats.append([no, w_idx, w, cos_ref, cos_syn1, cos_syn0, dist_syn0])
        else:
            feats.append([no, 0, '</s>', 0, 0, 0, 0])
            print(no, w, opt_list)

    df = pd.DataFrame(feats, columns=[prefix + '_' + f for f in FEATURES[:-1]])
    cols_to_norm = [prefix + '_' + f for f in FEATURES[4:-1]]
    centered = df[cols_to_norm] - df[cols_to_norm].mean()
    spread = df[cols_to_norm].std()
    # A constant column has std 0 and a single-row table has std NaN; dividing
    # by either would fill the column with NaN. Use 1.0 there so the column
    # simply becomes zeros.
    spread = spread.where(spread > 0, 1.0)
    df[cols_to_norm] = centered / spread
    return df


def load_model(path, prefix):
    """Load a word-vector model together with its second weight matrix.

    Example inputs:
        path = 'w2v-experiment/model/'
        prefix = 'sk'

    Args:
        path: directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.
        prefix: model name; the files ``<prefix>-syn0.bin`` and
            ``<prefix>-syn1neg.bin`` are read.

    Returns:
        A tuple ``(model, syn1neg)`` where ``model`` is the object returned by
        gensim for the syn0 file and ``syn1neg`` is a float32 array with the
        same shape as ``model.syn0``, filled row by row from the syn1neg file.

    Raises:
        ValueError: if the syn1neg file ends before every row has been read.
    """
    # NOTE(review): Word2Vec.load_word2vec_format and .syn0 belong to older
    # gensim releases; left untouched to match the installed version - confirm
    # before upgrading gensim.
    model = gensim.models.Word2Vec.load_word2vec_format(path + prefix + '-syn0.bin', binary = True)
    vocab_size, vector_size = model.syn0.shape
    syn1neg = np.zeros((vocab_size, vector_size), dtype=np.float32)
    binary_len = np.dtype(np.float32).itemsize * vector_size
    syn1_path = path + prefix + '-syn1neg.bin'
    with smart_open(syn1_path) as fin:
        for i in range(vocab_size):
            raw = fin.read(binary_len)
            if len(raw) != binary_len:
                raise ValueError(
                    '%s is truncated: row %d has %d bytes, expected %d'
                    % (syn1_path, i, len(raw), binary_len))
            # np.frombuffer replaces the deprecated binary mode of
            # np.fromstring; the assignment copies the values into syn1neg.
            syn1neg[i] = np.frombuffer(raw, dtype=np.float32)
    return (model, syn1neg)

In [22]:
# Session setup. The evaluation cell reads cbow_model, cbow_syn1neg, sk_model
# and sk_syn1neg, so these loads must run for the notebook to work on a fresh
# kernel (they were previously commented out and only existed as leftover
# kernel state).

# Register the blank placeholder so jieba keeps it as a single token;
# simple_preprocess compares whole tokens against it.
jieba.add_word('龐燮傍謝', freq=10, tag='xx')

# Optional user dictionaries, left disabled.
# NOTE(review): confirm which of these, if any, were loaded when the models
# and the classifier were built, and enable the same ones here.
# jieba.load_userdict('data/zhwiki-cn-clean')
# jieba.load_userdict('data/dict-txt-big')

path = 'w2v-experiment/model/'
cbow_model, cbow_syn1neg = load_model(path, 'cbow')
sk_model, sk_syn1neg = load_model(path, 'sk')

In [87]:
# Classifier whose predict_proba scores the candidate options in the
# evaluation cell (presumably a fitted LogisticRegression - confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
lrmodel = joblib.load('w2v-experiment/model/lrmodel_cbow+sk.pkl')
# XGBoost booster configured with 4 threads; not used by any active code
# visible in this notebook.
bst = xgb.Booster({'nthread':4})
bst.load_model('w2v-experiment/model/xgb.model')

In [114]:
# Evaluate one question file: predict an answer letter per question, then
# score the predictions against the reference answers stored in the same file.
round_no = 3
pre_path = 'question_samples/battle/preliminary/'
pre_file = 'round' + str(round_no) + '-cn.txt'
# Alternative input (final round):
# pre_path = 'question_samples/battle/final/'
# pre_file = 'all-cn.txt'

# A question line looks like "[no]text### a:.., b:.., c:.., d:.., e:..[end]".
QUESTION_RE = re.compile(r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[end\]')

ANSWER_LIST = []
ref_line = ''

# NOTE(review): UTF-8 is assumed for the question file; without an explicit
# encoding the result depends on the platform locale - confirm.
with open(pre_path + pre_file, 'r', encoding='utf-8') as f:
    for line in f:
        match = QUESTION_RE.search(line.lower())
        if match is None:
            # The first line that is not a question ends the question list;
            # it is taken as the whitespace-separated reference answers.
            ref_line = line
            break
        no, content, a, b, c, d, e = match.groups()
        wlist, qidx = simple_preprocess(content.strip())
        opt_list = [a.strip(), b.strip(), c.strip(), d.strip(), e.strip()]
        sen_list = build_estimate_samples(wlist, qidx)
        n_sen = len(sen_list)
        # One row per context window (blank), one column per option a-e.
        PREDICT = np.zeros((n_sen, 5))
        for s_idx in range(n_sen):
            cbow_df = generate_feature(no, sen_list[s_idx], opt_list, cbow_model, cbow_syn1neg, 'cbow')
            cbow_df = cbow_df.drop(['cbow_no', 'cbow_word'], axis=1).rename(columns={'cbow_w_idx': 'w_idx'})
            sk_df = generate_feature(no, sen_list[s_idx], opt_list, sk_model, sk_syn1neg, 'sk')
            sk_df = sk_df.drop(['sk_w_idx', 'sk_no', 'sk_word'], axis=1)
            # Column order fed to the classifier: w_idx, the four cbow_*
            # features, then the four sk_* features.
            df = pd.concat([cbow_df, sk_df], axis=1)
            w_idx = df['w_idx']
            w_std = w_idx.std()
            # Guard the z-score: identical indices give std 0 and would
            # otherwise turn the whole column into NaN.
            df['w_idx'] = (w_idx - w_idx.mean()) / (w_std if w_std > 0 else 1.0)
            PREDICT[s_idx, :] = lrmodel.predict_proba(df.values)[:, 1]
        # Average the windows and pick the most probable option.
        # NOTE(review): when no blank was found (n_sen == 0) the mean is NaN
        # and argmax falls back to index 0, i.e. answer 'a'.
        ANSWER_LIST.append(ANS[PREDICT.mean(axis=0).argmax()])

n = len(ANSWER_LIST)
REF_LIST = ref_line.split()
if len(REF_LIST) < n:
    raise ValueError('found %d reference answers for %d questions in %s'
                     % (len(REF_LIST), n, pre_path + pre_file))
n_correct = sum(1 for ref, ans in zip(REF_LIST, ANSWER_LIST) if ref == ans)
# Percentage of questions answered correctly.
score = 100.0 * n_correct / n if n else 0
print(score)

1 蚊液 ['纪念堂', '脾气', '茶道', 'line', '蚊液']
1 蚊液 ['纪念堂', '脾气', '茶道', 'line', '蚊液']
75.0


In [97]:
# Probabilities left over from the last question processed by the evaluation
# cell: one row per context window, one column per option a-e.
PREDICT

array([[  6.30329689e-05,   9.69766385e-01,   5.15767759e-03,
          2.63725882e-02,   2.26159962e-02]])

In [None]:
# in final round
# OK-Computer got 4 wrong answers out of 22 questions