In [31]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import re
# import jieba
import gensim
from smart_open import smart_open
import codecs
import random
from RetrievalTask import parallel_retrieve
import time
from celery import group
import logging
# Answer labels; index i is the label reported for the i-th entry of a question's option list.
ANS = ['a', 'b', 'c', 'd', 'e']
# FEATURES = ['no', 'w_idx', 'word', 'cos_ref','cos_syn1', 'cos_syn0', 'dist_syn0', 'target']
# NOTE(review): WINDOW and VEC_SIZE are not referenced by any code visible in this notebook;
# presumably they mirror the word2vec training settings -- confirm or remove.
WINDOW = 10
VEC_SIZE = 100

class KeyRetrieval(object):
    """Guess the text hidden behind a blank marker by regex retrieval.

    The question text contains one or more blank markers (``PLACEHOLDER``).
    Random snippets around a marker are turned into regular expressions in
    which the marker becomes a capture group; the snippets are then dispatched
    to ``parallel_retrieve`` tasks and the most frequent non-empty result is
    taken as the answer key.

    Parameters
    ----------
    raw_content : str
        Question text containing the blank marker(s).
    query_num : int
        Number of random queries generated per call to ``add_rand_query``.
    query_len : int
        Maximum number of context characters taken on each side of a marker.
    """

    # Marker that stands for the blanked-out text in a question.
    PLACEHOLDER = '︽⊙＿⊙︽'

    def __init__(self, raw_content, query_num = 7, query_len = 20):
        self.query_num = query_num
        self.query_len = query_len
        self.raw_content = raw_content
        self.rand_query = []
        self.add_rand_query()

    def add_rand_query(self, content = None):
        """Append ``query_num`` random regex queries built from ``content``.

        ``content`` defaults to ``self.raw_content``.  Each query is a slice of
        the content around one randomly chosen marker, clipped so that it never
        overlaps the neighbouring markers.  When the content holds no marker at
        all, nothing is appended (previously this raised ``IndexError``).
        """
        if content is None: content = self.raw_content
        match = list(re.finditer(self.PLACEHOLDER, content))
        match_num = len(match)
        if match_num == 0:
            # No blank to build a query around.
            return
        content_len = len(content)
        for _ in range(self.query_num):
            k = random.choice(range(match_num))
            start, end = match[k].span()

            # Random left edge within query_len characters before the marker.
            if start > 0:
                ihead = random.choice(range(max(0, start - self.query_len), start))
            else:
                ihead = 0
            # Random right edge (inclusive) within query_len characters after the marker.
            if end < content_len:
                iend = random.choice(range(end, min(content_len, end + self.query_len)))
            else:
                iend = content_len - 1

            # Keep the snippet from running into the previous / next marker.
            if k > 0:
                ihead = max(ihead, match[k - 1].span()[1])
            if k < match_num - 1:
                iend = min(iend, match[k + 1].span()[0] - 1)

            q = content[ihead : iend + 1]
            # A marker touching either end of the content is anchored with a newline.
            if start == 0:
                q = '\n' + q
            if end == content_len:
                q = q + '\n'
            self.rand_query.append(self.format_query(q))

    def set_rand_query(self):
        """Discard the current queries and generate a fresh random set."""
        self.rand_query = []
        self.add_rand_query()

    def retrieve(self):
        """Run every query and return the most frequent non-empty result.

        Returns ``None`` when there are no queries or when every task result is
        empty.  Prints the non-empty results and logs the elapsed time.
        """
        if not self.rand_query:
            # Nothing to dispatch (e.g. the content had no blank marker).
            return None

        start_time = time.time()
        key = []
        # NOTE(review): parallel_retrieve is defined in RetrievalTask, not shown here;
        # presumably it applies the regex to a sentence pool on a worker -- confirm.
        jobs = group(parallel_retrieve.s(q) for q in self.rand_query)
        result = jobs.apply_async()
        # The return values of these two calls are ignored; result.get() below
        # is what collects the task outputs.
        result.ready()
        result.successful()
        all_match = result.get()

        for match in all_match:
            if len(match) > 0:
                print(match,',',end='')
                key.append(match)
        print('')

        logging.warning('elapsed time:' + str(time.time() - start_time))
        if len(key) == 0:
            return None

        # Majority vote over the retrieved results.
        count = {}
        for w in key:
            count[w] = count.get(w, 0) + 1
        return max(count.items(), key=lambda tup: tup[1])[0]

    def format_query(self, raw_query = None):
        """Escape regex metacharacters and turn each marker into a capture group.

        ``raw_query`` defaults to ``self.raw_content``.  Every marker becomes
        ``(.{2,10})``, i.e. a group capturing 2 to 10 characters.
        """
        if raw_query is None: raw_query = self.raw_content
        escaped = re.sub(r'([\.\^\$\|\*\+\?\\\{\}\[\]\(\)])', r'\\\1', raw_query)
        return escaped.replace(self.PLACEHOLDER, '(.{2,10})')
        

def load_model(path, prefix):
    """Load a word2vec model together with its output-layer (syn1neg) weights.

    Reads ``<path><prefix>-syn0.bin`` through gensim's word2vec-format loader
    and ``<path><prefix>-syn1neg.bin`` as consecutive float32 rows, one row of
    ``vector_size`` values per vocabulary entry.

    Example arguments: ``path = 'w2v-experiment/model/'``, ``prefix = 'sk'``.

    Row 0 of both matrices is overwritten with the mean of rows
    ``1 .. vocab_size // 2 - 1``; ``w2v_cosine`` uses row 0 as the stand-in
    vector for options that are missing from the vocabulary.

    Returns
    -------
    tuple
        ``(model, syn1neg)`` where ``syn1neg`` is a float32 array of shape
        ``(vocab_size, vector_size)``.
    """
    model = gensim.models.Word2Vec.load_word2vec_format(path + prefix + '-syn0.bin', binary = True)
    vocab_size, vector_size = model.syn0.shape
    syn1neg = np.zeros((vocab_size, vector_size), dtype=np.float32)
    # Number of bytes occupied by one float32 row.
    binary_len = np.dtype(np.float32).itemsize * vector_size
    # NOTE(review): relies on smart_open's default mode yielding bytes -- confirm.
    with smart_open(path + prefix + '-syn1neg.bin') as fin:
        for i in range(vocab_size):
            # frombuffer replaces the deprecated binary mode of np.fromstring.
            syn1neg[i] = np.frombuffer(fin.read(binary_len), dtype=np.float32)
    # Integer division: a float slice bound is rejected under Python 3.
    half = vocab_size // 2
    syn1neg[0, :] = syn1neg[1 : half, :].mean(axis=0)
    model.syn0[0, :] = model.syn0[1 : half, :].mean(axis=0)
    return (model, syn1neg)


def simple_preprocess_A(line):
    """Parse one line of a question file.

    A question line has the form
    ``[<no>]<content>### a:<..>, b:<..>, c:<..>, d:<..>, e:<..>[end]`` and is
    matched case-insensitively by lower-casing it first.

    Returns
    -------
    dict
        ``{'no', 'content', 'opt_list'}`` for a question line, where
        ``opt_list`` holds the five stripped options in a-e order; otherwise
        ``{'ans': line.split()}`` (the whitespace-separated tokens of the
        original, non-lower-cased line).
    """
    found = re.search(r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[end\]', line.lower())
    if found is None:
        # Not a question line: treat it as the answer line.  Checking the match
        # explicitly (instead of a bare except) keeps unrelated errors visible.
        return {'ans': line.split()}
    groups = found.groups()
    return {'no': groups[0], 'content': groups[1], 'opt_list': [opt.strip() for opt in groups[2:]]}

def normalize_vec(vec):
    """Return ``vec`` divided by its Euclidean length."""
    squared = vec * vec
    length = squared.sum() ** 0.5
    return vec / length


def w2v_cosine(key, opt_list, syn0_model, syn1neg):
    """Pick the option whose word vectors are closest to ``key``.

    Each option is scored as the sum of two cosines with the normalised input
    vector of ``key``: one against the option's input vector (``syn0_model``)
    and one against its output vector (the matching row of ``syn1neg``).
    Options missing from the vocabulary are scored with row 0 of both matrices.

    Parameters
    ----------
    key : str
        Word to compare the options against.
    opt_list : sequence of str
        Candidate options, in the same order as ``ANS``.
    syn0_model
        Word2vec model supporting ``in``, ``[]``, ``.vocab[word].index`` and ``.syn0``.
    syn1neg
        Output-layer weight matrix indexed by vocabulary index.

    Returns
    -------
    str
        The ``ANS`` label of the best-scoring option, or ``'0'`` when ``key``
        itself is not in the vocabulary.
    """
    if key not in syn0_model:
        return '0'

    hidd_vec = normalize_vec(syn0_model[key])
    arr = np.zeros(len(opt_list))
    # The out-of-vocabulary score does not depend on the option, so it is
    # computed at most once.
    oov_score = None
    for i, opt in enumerate(opt_list):
        if opt in syn0_model:
            w_idx = syn0_model.vocab[opt].index
            arr[i] = np.dot(hidd_vec, normalize_vec(syn0_model[opt])) \
                   + np.dot(hidd_vec, normalize_vec(syn1neg[w_idx]))
        else:
            if oov_score is None:
                oov_score = np.dot(hidd_vec, normalize_vec(syn0_model.syn0[0])) \
                          + np.dot(hidd_vec, normalize_vec(syn1neg[0]))
                # Diagnostic that used to be an unconditional print().
                logging.debug('w2v_cosine out-of-vocabulary score: %s', oov_score)
            arr[i] = oov_score
    return ANS[arr.argmax()]

In [2]:
# Load the 'cbow' model files from the model/ directory; both returned objects are
# later passed to w2v_cosine by the evaluation cell.
path = 'model/'
cbow_model, cbow_syn1neg = load_model(path, 'cbow')

In [26]:
# SEN_POOL = ''
# with codecs.open('../data/content-cn-lower.txt', 'r', encoding='utf-8') as f:
#     outer_buf = []
#     inner_buf = [''] * 10000
#     buf_idx = 0
#     for line in f:
#         inner_buf[buf_idx] = line
#         buf_idx += 1
#         if buf_idx >= 10000: 
#             buf_idx = 0
#             outer_buf.append(''.join(inner_buf))
#     outer_buf.append(''.join(inner_buf[:buf_idx]))

# print(len(outer_buf))
# SEN_POOL = ''.join(outer_buf)
# print(repr(SEN_POOL[:100]))

1168
'integrate 根根飞扬睫毛膏 升级版 6g $300\n这是跟integrate合作试用的最后一篇啦 v(｡･ω･｡)ｨｪｨ\n旧款的我没有用过\n不过这次用的新款给我的感觉还蛮好的\n深紫色的管身包装'


In [1]:
# Scratch check: run one hand-written query regex directly against the sentence pool.
# NOTE(review): this cell needs `re` to be imported and a global SEN_POOL; the only
# assignments to SEN_POOL in this notebook are commented out, so it fails on a fresh kernel.
# sample = '发亮的程度直接可以媲美我以/前大费周章的用隔离霜+︽⊙＿⊙︽+蜜粉+打亮修容'
# sample = re.sub(r'([\.\^\$\*\+\?\\\{\}\[\]\(\)])', r'\\\1',sample)
sample = '\n(.{2,9})这双2014秋冬的simone过膝靴是笔挺率性的风格。如果想要笔直长靴的造型，simone是满分。皮又厚又挺，穿过也不皱不变形。而且雾面和剪裁都很修饰腿型，也格外适合搭裤装'
# q = KeyRetrieval(sample)
# q.rand_query
m = re.findall(sample, SEN_POOL)[0]
# print(str(m))
# k = SEN_POOL.find(sample)
# SEN_POOL[k - 20 : k + len(sample) + 10]

NameError: name 're' is not defined

In [40]:
# Answer every question of one preliminary-round file and collect the predictions.
# PRED_LIST receives one (content length, predicted label) tuple per question line.
# After the loop, `d` still holds the parse of the LAST line read; the scoring cell
# reads d['ans'] from it, so it assumes the file ends with the answer line.
i = 1  # round number used to build the file name (the name `i` is reused by the option loop below)
pre_path = '../question_samples/battle/preliminary/'
pre_file = 'round' + str(i) + '-cn.txt'
# pre_path = '../question_samples/battle/final/'
# pre_file = 'all-cn.txt'
PRED_LIST = []
with codecs.open(pre_path + pre_file, 'r', encoding='utf-8') as f:
    j = 1  # line counter, only used by the commented-out early break
    for line in f:
        d = simple_preprocess_A(line.lower())
        if 'content' in d:
            content = d['content']
#             print(d['no'], d['content'])
            # Retrieve the most frequent fill-in for the blank (None if nothing was found).
            q = KeyRetrieval(content)
            key = q.retrieve()
            print(d['no'], key)
            if key:
                # First choice: an option that contains, or is contained in, the retrieved key.
                found = False
                for i in range(len(d['opt_list'])):
                    if d['opt_list'][i] in key or key in d['opt_list'][i]:
                        PRED_LIST.append((len(content), ANS[i]))
                        found = True
                        break
                # Otherwise fall back to word-vector similarity between the key and the options.
                if not found: PRED_LIST.append((len(content), w2v_cosine(key, d['opt_list'], cbow_model, cbow_syn1neg)))
            else:
                # Nothing retrieved: guess a label at random.
                print('dunt no')
                PRED_LIST.append((len(content), random.choice(ANS)))
        j += 1
#         if j > 13: break
# key



洗面乳 ,洗面乳 ,洗面乳 ,洗面乳 ,洗面乳 ,洗面乳 ,洗面乳 ,
1 洗面乳




三芝 ,三芝 ,三芝 ,三芝 ,三芝 ,三芝 ,三芝 ,
2 三芝




的沾取棒 ,精华液 ,赖床粉底一上市, ,精华液 ,发膜是 ,
3 精华液




苏澳 ,苏澳 ,苏澳 ,苏澳 ,苏澳 ,用餐 ,苏澳 ,
4 苏澳




恒春 ,恒春 ,恒春 ,恒春 ,恒春 ,恒春 ,
5 恒春




花莲市 ,花莲市 ,花莲市 ,花莲市 ,花莲市 ,花莲市 ,花莲市 ,
6 花莲市




山上 ,山上 ,山上 ,山上 ,山上 ,山上 ,山上 ,
7 山上




保湿精华 ,保湿精华 ,保湿精华 ,保湿精华 ,保湿精华 ,保湿精华 ,
8 保湿精华




粉底液 ,粉底液 ,粉底液 ,
9 粉底液




蜜粉有没 ,蜜粉 ,蜜粉 ,
10 蜜粉




遮瑕 ,遮瑕 ,遮瑕 ,
11 遮瑕




玉里 ,玉里 ,个旅游城 ,玉里 ,玉里 ,玉里 ,玉里 ,
12 玉里




乌来 ,乌来 ,乌来 ,乌来 ,乌来 ,乌来 ,乌来 ,
13 乌来




书店 ,书店 ,书店 ,书店 ,书店 ,书店 ,
14 书店




稻田中 ,稻田中 ,稻田中 ,稻田中 ,稻田中 ,稻田中 ,稻田中 ,
15 稻田中




gucci ,gucci ,gucci ,gucci ,gucci ,gucci ,gucci ,
16 gucci




台南 ,台南 ,台南 ,台南 ,
17 台南





18 None
dunt no




ladurÉe ,ladurÉe ,ladurÉe ,ladurÉe ,ladurÉe ,ladurÉe ,ladurÉe ,
19 ladurÉe




孔庙 ,孔庙 ,孔庙 ,孔庙 ,孔庙 ,孔庙 ,
20 孔庙
0.119266480207


In [41]:
# Score the predictions against the answer line: every correct label contributes an
# equal share of 100 points.  `d` is expected to be the answer-line parse left behind
# by the evaluation loop.
pred = [entry[1] for entry in PRED_LIST]
score = 0
for i, guess in enumerate(pred):
    if guess == d['ans'][i]:
        score += (100. / len(pred))
print(pred)
print(d['ans'])
print(score)

['c', 'a', 'b', 'c', 'd', 'b', 'b', 'b', 'e', 'e', 'd', 'c', 'b', 'b', 'c', 'd', 'd', 'e', '0', 'd']
['c', 'a', 'b', 'c', 'd', 'b', 'b', 'b', 'e', 'e', 'd', 'c', 'b', 'b', 'c', 'd', 'd', 'b', 'c', 'd']
90.0


In [186]:
# Load the raw question samples and preview their shape and last rows.
sample_df = pd.read_csv('../question_samples/raw_samples_cn.csv')
print(sample_df.shape)
sample_df.tail()

(550, 9)


Unnamed: 0,file,no,content,a,b,c,d,e,ans
545,2016-08-10-01-52-38.txt,1,很想约人，但太远，骑机车载人又太冷，找人开车对司机又不好意思，光车程来回6小时，就自己速去速...,西门町,智恩寺,脆口,妈妈,湿地,a
546,2016-08-10-01-52-38.txt,2,︽⊙＿⊙︽的燕子口有个印地安人头像，这儿则据说有︽⊙＿⊙︽酋长头像，但我们并未找到正确位置,野口,缺点,台塑,太鲁阁,金属,d
547,2016-08-10-01-52-38.txt,3,或是擦除不小心沾到︽⊙＿⊙︽的皮肤也可以直接使用mdmmd.极致水漾除彩液，还蛮好清除的,prada,靴子,指甲油,cleansing,蛋型,c
548,2016-08-10-01-52-38.txt,4,************************ 本文为︽⊙＿⊙︽邀稿 **********...,小木马,理想大地,叶记,新湖,阿灶伯,b
549,2016-08-10-01-52-38.txt,5,我们住在埔里︽⊙＿⊙︽的稻湘村民宿，司机驾驶功力一流，狭窄的山路依然开得很妥当，一到民宿我们...,储水,外婆,桥头,锦记,山上,e


In [187]:
# Load the combined feature table for the official questions and preview its shape and last rows.
official_df = pd.read_csv('../question_official/overall_df.csv')
print(official_df.shape)
official_df.tail()

(5726, 12)


Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_no,sk_word,sk_cos_ref,sk_cos_syn1,sk_cos_syn0,sk_dist_syn0,target
5721,10438,3.212975,-0.044324,0.853694,-0.839728,999,两用,4.656869,-0.257106,-0.291752,0.330412,0
5722,15661,4.379284,0.123536,-0.253477,0.213438,999,阿明,4.713268,0.607256,0.958114,-0.750441,0
5723,3171,2.885983,1.39245,1.179961,-1.19706,999,员林,6.344345,1.368895,0.831401,-1.211119,1
5724,30408,3.043319,-1.428999,-1.229303,1.189205,999,碎屑,2.994729,-1.186405,-1.518305,1.330615,0
5725,17496,2.659255,-0.042663,-0.550874,0.634144,999,精力,2.87372,-0.53264,0.020542,0.300534,0


In [46]:
# Locate the first blank marker in a sample sentence and print its start and end offsets.
sample = '发亮的程度直接可以媲美我以︽⊙＿⊙︽前大费周章的用隔离霜+︽⊙＿⊙︽.'
m = re.search('︽⊙＿⊙︽', sample)
print(*m.span())

13 18


In [29]:
# Scratch check of random.choice on the marker matches.
# NOTE(review): the recorded output is a single match object, which implies `m` was a
# sequence of matches when this cell ran; in the cell above `m` is one re.search result,
# so confirm the intended input before re-running.
import random
random.choice(m)

<_sre.SRE_Match object; span=(29, 34), match='︽⊙＿⊙︽'>

In [177]:
# Scratch check of the "key with the highest count" expression used by KeyRetrieval.retrieve.
count = {'a':6, 'b':4,"c": 9}
sorted(count.items(), reverse=True, key=lambda tup: tup[1])[0][0]

'c'