In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import xgboost as xgb
from os import listdir
from os.path import isfile, join
import re
import jieba
import gensim
from smart_open import smart_open

ANS = ['a', 'b', 'c', 'd', 'e']
FEATURES = ['no', 'w_idx', 'word', 'cos_ref','cos_syn1', 'cos_syn0', 'dist_syn0', 'target']
WINDOW = 10
VEC_SIZE = 100

def simple_preprocess(content):
    pat = '︽⊙＿⊙︽'
    if content.find(pat) < 0: pat = '︽⊙＿⊙'
    
    content = content.strip().replace(pat, '龐燮傍謝')
    wlist = list(jieba.cut(content))
    qidx = []
    i = 0
    for w in wlist:
        if w == '龐燮傍謝':
            wlist[i] = '*'
            qidx.append(i)
        i += 1
    return (wlist, qidx)

def normalize_vec(vec):
    mag = ((vec * vec).sum()) ** 0.5
    return vec / mag

def build_estimate_samples(wlist, qidx):
    global WINDOW
    temp = wlist[:]
    est_sen = []
    sen_len = len(wlist)
    for i in qidx:
        head = max(i - WINDOW, 0)
        tail = min(i + WINDOW, sen_len)
        est_sen.append(wlist[head : i] + wlist[i + 1 : tail])
    return est_sen

def generate_feature(no, w_list, opt_list, ans, syn0_model, syn1, prefix):
    # input sample: 
    # w_list = ['高雄','转','144','次','自强号','1700','高雄','开','1923','到','1940','到','台北']
    # opt_list = ['两用', '阿明', '员林', '碎屑', '精力']
    # ans = 'c'
    # prefix = 'cbow'
    opt_num = len(opt_list)
    ans = opt_list[ANS.index(ans)]  # ans: 'c' --> '员林'
    hidd_vec = np.zeros(VEC_SIZE)
    for w in w_list:
        if w in syn0_model and w != u'*': hidd_vec += syn0_model[w]
    feats = []
    for w in opt_list:
        if w in syn0_model: 
            w_idx = syn0_model.vocab[w].index
            cos_ref = np.dot(syn0_model[w], syn1[w_idx])
#             cos_syn1 = np.dot(hidd_vec, syn1[w_idx])
            cos_syn1 = np.dot(normalize_vec(hidd_vec), normalize_vec(syn1[w_idx]))
#             cos_syn0 = np.dot(hidd_vec, syn0_model[w])
            cos_syn0 = np.dot(normalize_vec(hidd_vec), normalize_vec(syn0_model[w]))
            dist_syn0 = sum((hidd_vec - syn0_model[w]) ** 2)
            if w == ans:
                feats.append([no, w_idx, w, cos_ref, cos_syn1, cos_syn0, dist_syn0, 1])
            else:
                feats.append([no, w_idx, w, cos_ref, cos_syn1, cos_syn0, dist_syn0, 0])
        else:
            pass
    df = pd.DataFrame(feats, columns=[prefix + '_' + f for f in FEATURES])
    cols_to_norm = [prefix + '_' + f for f in FEATURES[4:-1]]
    df[cols_to_norm] = (df[cols_to_norm] - df[cols_to_norm].mean()) / df[cols_to_norm].std()
#     df[cols_to_norm] = (df[cols_to_norm] - df[cols_to_norm].min()) / (df[cols_to_norm].max() - df[cols_to_norm].min())
    return df


def load_model(path, prefix):
    # input sample:
    # path = 'w2v-experiment/model/'
    # prefix = 'sk'
    model = gensim.models.Word2Vec.load_word2vec_format(path + prefix + '-syn0.bin', binary = True)
    vocab_size, vector_size = model.syn0.shape
    syn1neg = np.zeros((vocab_size, vector_size), dtype=np.float32)
    binary_len = np.dtype(np.float32).itemsize * vector_size
    with smart_open(path + prefix + '-syn1neg.bin') as fin:
        for i in range(vocab_size):
            weights = np.fromstring(fin.read(binary_len), dtype=np.float32)
            syn1neg[i] = weights
    return (model, syn1neg)

In [2]:
overall_df = pd.read_csv('question_official/overall_df.csv')
print(overall_df.shape)
overall_df.tail()

(5726, 12)


Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_no,sk_word,sk_cos_ref,sk_cos_syn1,sk_cos_syn0,sk_dist_syn0,target
5721,10438,3.212975,-0.044324,0.853694,-0.839728,999,两用,4.656869,-0.257106,-0.291752,0.330412,0
5722,15661,4.379284,0.123536,-0.253477,0.213438,999,阿明,4.713268,0.607256,0.958114,-0.750441,0
5723,3171,2.885983,1.39245,1.179961,-1.19706,999,员林,6.344345,1.368895,0.831401,-1.211119,1
5724,30408,3.043319,-1.428999,-1.229303,1.189205,999,碎屑,2.994729,-1.186405,-1.518305,1.330615,0
5725,17496,2.659255,-0.042663,-0.550874,0.634144,999,精力,2.87372,-0.53264,0.020542,0.300534,0


In [3]:
overall_df.drop(['sk_no', 'sk_word', 'sk_cos_ref', 'sk_dist_syn0'], axis=1, inplace=True)
print(overall_df.shape)
overall_df.tail()

(5726, 8)


Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_cos_syn1,sk_cos_syn0,target
5721,10438,3.212975,-0.044324,0.853694,-0.839728,-0.257106,-0.291752,0
5722,15661,4.379284,0.123536,-0.253477,0.213438,0.607256,0.958114,0
5723,3171,2.885983,1.39245,1.179961,-1.19706,1.368895,0.831401,1
5724,30408,3.043319,-1.428999,-1.229303,1.189205,-1.186405,-1.518305,0
5725,17496,2.659255,-0.042663,-0.550874,0.634144,-0.53264,0.020542,0


In [4]:
overall_df[['w_idx']] = (overall_df[['w_idx']] - overall_df[['w_idx']].mean()) / overall_df[['w_idx']].std()
overall_df.tail()

Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_cos_syn1,sk_cos_syn0,target
5721,-0.353894,3.212975,-0.044324,0.853694,-0.839728,-0.257106,-0.291752,0
5722,-0.294318,4.379284,0.123536,-0.253477,0.213438,0.607256,0.958114,0
5723,-0.436784,2.885983,1.39245,1.179961,-1.19706,1.368895,0.831401,1
5724,-0.126108,3.043319,-1.428999,-1.229303,1.189205,-1.186405,-1.518305,0
5725,-0.273387,2.659255,-0.042663,-0.550874,0.634144,-0.53264,0.020542,0


In [5]:
data, target = overall_df.drop(['target'],axis=1).values, overall_df['target'].values


In [6]:
# logistic regression approach
clf = LogisticRegression()
scores = cross_validation.cross_val_score(clf, data, target, cv=5)
print(scores)

[ 0.96073298  0.95811518  0.9510917   0.92227074  0.90996503]


In [12]:
# xgboost regression approach
param = {'max_depth':4, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
dtrain = xgb.DMatrix( data, label=target)
num_round = 3
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=2,
       metrics={'error'}, seed = 0,
       callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

running cross validation
[0]	train-error:0.0548375+0.0017465	test-error:0.0644425+0.0012225
[1]	train-error:0.0501225+0.0026195	test-error:0.064792+0.003318
[2]	train-error:0.046455+0.001048	test-error:0.063919+0.001397


Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.064442,0.001223,0.054837,0.001747
1,0.064792,0.003318,0.050123,0.00262
2,0.063919,0.001397,0.046455,0.001048


In [75]:
path = 'question_samples/'
qdf = pd.DataFrame()
onlyfiles = [f for f in listdir(path) if f[0] == '2']
for file in onlyfiles:
    samples = []
    i = 0
    with open(path + file, 'r') as f:
        for line in f:
            i += 1
            try:
                no, content, a, b, c, d, e = re.findall(r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[END\]', line)[0]
                samples.append([file, no, content.strip(), a.strip(), b.strip(), c.strip(), d.strip(), e.strip()])
            except:
                pass
    print('{}, '.format(i), end="")
    _qdf = pd.DataFrame(samples, columns=['file', 'no', 'content', 'a', 'b', 'c', 'd', 'e'])
    _ans = line.split()
    for _a in _ans:
        if _a not in ANS:
#             print(file)
            continue
    if len(_ans) != i - 1:
#         print(file)
        continue
    ans = [_ans[int(s)-1] for s in _qdf.no.values]
    _qdf = pd.concat([_qdf, pd.Series(ans, name='ans')], axis=1)
    qdf = pd.concat([qdf, _qdf], axis=0, ignore_index=True)
print('')

print(qdf.shape)
clean_qdf = qdf.drop_duplicates(subset=['content', 'a', 'b', 'c', 'd', 'e', 'ans'])
print(clean_qdf.shape)
clean_qdf.tail()

6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 21, 11, 11, 11, 11, 
(1366, 9)
(550, 9)


Unnamed: 0,file,no,content,a,b,c,d,e,ans
1095,2016-08-10-01-52-38.txt,1,很想約人，但太遠，騎機車載人又太冷，找人開車對司機又不好意思，光車程來回6小時，就自己速去速...,西門町,智恩寺,脆口,媽媽,濕地,a
1096,2016-08-10-01-52-38.txt,2,︽⊙＿⊙︽的燕子口有個印地安人頭像，這兒則據說有︽⊙＿⊙︽酋長頭像，但我們並未找到正確位置,野口,缺點,台塑,太魯閣,金屬,d
1097,2016-08-10-01-52-38.txt,3,或是擦除不小心沾到︽⊙＿⊙︽的皮膚也可以直接使用mdmmd.極緻水漾除彩液，還蠻好清除的,prada,靴子,指甲油,cleansing,蛋型,c
1098,2016-08-10-01-52-38.txt,4,************************ 本文為︽⊙＿⊙︽邀稿 **********...,小木馬,理想大地,葉記,新湖,阿灶伯,b
1099,2016-08-10-01-52-38.txt,5,我們住在埔里︽⊙＿⊙︽的稻湘村民宿，司機駕駛功力一流，狹窄的山路依然開得很妥當，一到民宿我們...,儲水,外婆,橋頭,錦記,山上,e


In [76]:
clean_qdf.to_csv('question_samples/raw_samples.csv', index=False)

In [77]:
sample_df = pd.read_csv('question_samples/raw_samples_cn.csv')
print(sample_df.shape)
path = 'question_samples/'
qdf = pd.DataFrame()

print(sample_df[['a','b','c','d','e']].iloc[0].tolist())
print(sample_df['ans'].iloc[0])
wlist, qidx = simple_preprocess(sample_df['content'].iloc[0])
print(wlist, qidx)
# build_estimate_samples(wlist, qidx)

(550, 9)
['国道', '小肚', 'gucci', '秋吉', '硫酸']
c
['竹节', '包', '的', '出现', '是', '来自', '战后', '物质', '匮乏', '的', '1947', '年', '\u3000', '*', '工匠', '们', '独具', '巧心', '和', '创意', '的', '创作'] [13]


In [78]:
nrow, ncol = sample_df.shape
train_df = pd.DataFrame()
for i in range(nrow):
    no = sample_df['no'].iloc[i]
    wlist, qidx = simple_preprocess(sample_df['content'].iloc[i])
    opt_list = sample_df[['a','b','c','d','e']].iloc[i].tolist()
    ans = sample_df['ans'].iloc[i]
    sen_list = build_estimate_samples(wlist, qidx)
    for w_list in sen_list:
        cbow_df = generate_feature(no, w_list, opt_list, ans, cbow_model, cbow_syn1neg, 'cbow')
        cbow_df.drop(['cbow_target', 'cbow_no', 'cbow_word'], axis=1, inplace=True)
        cbow_df.rename(columns = {'cbow_w_idx':'w_idx'}, inplace = True)
        sk_df = generate_feature(no, w_list, opt_list, ans, sk_model, sk_syn1neg, 'sk')
        sk_df.drop(['sk_w_idx'], axis=1, inplace=True)
        sk_df.rename(columns = {'sk_target':'target'}, inplace = True)
        df = pd.concat([cbow_df, sk_df], axis = 1)
        train_df = pd.concat([train_df, df], axis = 0, ignore_index=True)
print(train_df.shape)
train_df.tail()

(3182, 12)


Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_no,sk_word,sk_cos_ref,sk_cos_syn1,sk_cos_syn0,sk_dist_syn0,target
3177,37218,5.432349,-0.206887,-0.272326,0.562308,5,储水,5.660539,0.016829,-0.863556,0.306192,0
3178,12982,2.979897,-0.650059,-0.087021,0.002072,5,外婆,1.508111,-1.007808,0.257491,0.652748,0
3179,11768,3.416854,-0.180517,-1.277707,1.09554,5,桥头,4.009564,-0.492371,-0.115242,0.112729,0
3180,163229,3.471575,-0.698616,0.130863,-0.086343,5,锦记,5.576158,-0.164263,-0.846522,0.666917,0
3181,3641,2.041267,1.736079,1.50619,-1.573576,5,山上,1.661321,1.647613,1.567829,-1.738586,1


In [79]:
sample_df.iloc[i]
# print(wlist)
# sample_df[sample_df['file'] == '2016-08-12-18-06-27.txt']

file                                 2016-08-10-01-52-38.txt
no                                                         5
content    我们住在埔里︽⊙＿⊙︽的稻湘村民宿，司机驾驶功力一流，狭窄的山路依然开得很妥当，一到民宿我们...
a                                                         储水
b                                                         外婆
c                                                         桥头
d                                                         锦记
e                                                         山上
ans                                                        e
Name: 549, dtype: object

In [81]:
# train_df.to_csv('question_samples/train_df.csv', index=False)

In [17]:
train_df = pd.read_csv('question_samples/train_df.csv')
train_df.drop(['sk_no', 'sk_word'], axis=1, inplace=True)
train_df[['w_idx']] = (train_df[['w_idx']] - train_df[['w_idx']].mean()) / train_df[['w_idx']].std()
data, target = train_df.drop(['target'],axis=1).values, train_df['target'].values
print(train_df.shape)
train_df.tail()

(3182, 10)


Unnamed: 0,w_idx,cbow_cos_ref,cbow_cos_syn1,cbow_cos_syn0,cbow_dist_syn0,sk_cos_ref,sk_cos_syn1,sk_cos_syn0,sk_dist_syn0,target
3177,-0.095949,5.432349,-0.206887,-0.272326,0.562308,5.660539,0.016829,-0.863556,0.306192,0
3178,-0.372958,2.979897,-0.650059,-0.087021,0.002072,1.508111,-1.007808,0.257491,0.652748,0
3179,-0.386834,3.416854,-0.180517,-1.277707,1.09554,4.009564,-0.492371,-0.115242,0.112729,0
3180,1.344312,3.471575,-0.698616,0.130863,-0.086343,5.576158,-0.164263,-0.846522,0.666917,0
3181,-0.479723,2.041267,1.736079,1.50619,-1.573576,1.661321,1.647613,1.567829,-1.738586,1


In [33]:
# xgboost regression approach
param = {'max_depth':3, 'eta':0.1, 'silent':1, 'objective':'binary:logistic'}
dtrain = xgb.DMatrix( data, label=target)
num_round = 10
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=2,
       metrics={'error'}, seed = 0,
       callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

running cross validation
[0]	train-error:0.0461975+0.0009425	test-error:0.067253+0
[1]	train-error:0.0461975+0.0009425	test-error:0.067253+0
[2]	train-error:0.0461975+0.0009425	test-error:0.065682+0.001571
[3]	train-error:0.045569+0.000943	test-error:0.062225+0.005028
[4]	train-error:0.0452545+0.0006285	test-error:0.059711+0.005028
[5]	train-error:0.0449405+0.0003145	test-error:0.060654+0.003457
[6]	train-error:0.0452545+0.0006285	test-error:0.060968+0.003143
[7]	train-error:0.043997+0	test-error:0.0600255+0.0040855
[8]	train-error:0.043683+0.000314	test-error:0.0603395+0.0031425
[9]	train-error:0.042112+0	test-error:0.0603395+0.0031425


Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.067253,0.0,0.046198,0.000943
1,0.067253,0.0,0.046198,0.000943
2,0.065682,0.001571,0.046198,0.000943
3,0.062225,0.005028,0.045569,0.000943
4,0.059711,0.005028,0.045255,0.000629
5,0.060654,0.003457,0.04494,0.000314
6,0.060968,0.003143,0.045255,0.000629
7,0.060025,0.004086,0.043997,0.0
8,0.060339,0.003142,0.043683,0.000314
9,0.060339,0.003142,0.042112,0.0


In [19]:
# logistic regression approach: cross validation
clf = LogisticRegression()
scores = cross_validation.cross_val_score(clf, data, target, cv=5)
print(scores)
# print()

[ 0.94034537  0.94191523  0.94191523  0.94496855  0.9480315 ]


In [20]:
# logistic regression approach: train model
clf = LogisticRegression()
clf.fit(data, target)
clf.coef_

array([[-0.31425972, -0.17796383,  2.5085388 , -1.79239388, -1.95315814,
        -0.02313124,  1.88269079, -0.23159453, -0.18966862]])

In [22]:
# save model
from sklearn.externals import joblib
joblib.dump(clf, 'w2v-experiment/model/lrmodel_cbow+sk.pkl') 


['w2v-experiment/model/lrmodel_cbow+sk.pkl',
 'w2v-experiment/model/lrmodel_cbow+sk.pkl_01.npy',
 'w2v-experiment/model/lrmodel_cbow+sk.pkl_02.npy',
 'w2v-experiment/model/lrmodel_cbow+sk.pkl_03.npy',
 'w2v-experiment/model/lrmodel_cbow+sk.pkl_04.npy']

In [23]:
clf = joblib.load('w2v-experiment/model/lrmodel_cbow+sk.pkl')
a = clf.predict_proba(data)
print(target[:10])
print(a[:10,1].shape)


[0 0 1 0 0 0 1 0 0 0]
(10,)


In [34]:
# xgboost regression approach
param = {'max_depth':3, 'eta':0.1, 'silent':1, 'objective':'binary:logistic'}
dtrain = xgb.DMatrix( data, label=target)
bst = xgb.train(param, dtrain, 10)

In [35]:
# bst.save_model('w2v-experiment/model/xgb.model')

In [36]:
bst = xgb.Booster({'nthread':4})
bst.load_model('w2v-experiment/model/xgb.model')

In [37]:
a = bst.predict(dtrain)
print(a[:10])
print(target[:10])
print(data.shape)

[ 0.18864925  0.18864925  0.54521072  0.20367649  0.18864925  0.18864925
  0.77695411  0.30488086  0.19640976  0.18864925]
[0 0 1 0 0 0 1 0 0 0]
(3182, 9)
