In [1]:
import sys
import collections
import mxnet as mx
from mxnet import autograd, gluon, init, metric, nd
from mxnet.gluon import loss as gloss, nn, rnn
from mxnet.contrib import text
import os
import random
import zipfile
from sklearn.model_selection import train_test_split
import spacy
import time
from time import strftime
from gensim import corpora
from gensim import models

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
ROOT_PATH  = "data/"

In [3]:
def noation_replace(line):
    notations = ['?', '¿', ',', '.', '¡','!', ':', ",", ';', '-', '']
    output = line.strip()
    for i in notations:
        output = output.replace(i, '')
    return output.split('\t')

In [4]:
es_e_l = []
es_s_l = []
es_e_r = []
es_s_r = []
es_labels = []
## english-spanish text 
with open(os.path.join(ROOT_PATH, "cikm_english_train_20180516.txt"), 'r', encoding='utf-8') as esf:
    for line in esf:
        segs = noation_replace(line)
        es_e_l.append(segs[0].lower())
        es_e_r.append(segs[2].lower())
        es_s_l.append(segs[1].lower())
        es_s_r.append(segs[3].lower())
        es_labels.append(int(segs[4]))
        
se_e_l = []
se_s_l = []
se_e_r = []
se_s_r = []
se_labels = []
## spanish-english text
with open(os.path.join(ROOT_PATH, "cikm_spanish_train_20180516.txt"), 'r', encoding='utf-8') as ssf:
    for line in ssf:
        segs = noation_replace(line)
        se_s_l.append(segs[0].lower())
        se_s_r.append(segs[2].lower())
        se_e_l.append(segs[1].lower())
        se_e_r.append(segs[3].lower())
        se_labels.append(int(segs[4]))

test_s_1 = []
test_s_2 = []
## spanish test file
with open(os.path.join(ROOT_PATH, "cikm_test_a_20180516.txt"), 'r', encoding='utf-8') as tef:
    for line in tef:
        segs = noation_replace(line)
        test_s_1.append(segs[0].lower())
        test_s_2.append(segs[1].lower())

print("es data size:", len(es_s_l))
print("se data size:", len(se_e_l))
print("test data size:", len(test_s_1))

es data size: 20000
se data size: 1400
test data size: 5000


In [5]:
##add data sets (s0, s1, y) + (s1, s0, y)
left_texts = es_s_l + se_s_l
right_texts = es_s_r + se_s_r
y = es_labels + se_labels

print("left data size:", len(left_texts))
print("right data size:", len(right_texts))
print("label size:", len(y))

left data size: 21400
right data size: 21400
label size: 21400


In [7]:
span_sw = []
with open(os.path.join(ROOT_PATH, "spanish.txt"), 'r', encoding='utf-8') as swf:
    for line in swf:
        word = line.strip()
        span_sw.append(word)

In [8]:
spacy_en = spacy.load('en')
spacy_es = spacy.load('es')

def en_tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def es_tokenizer(text):
    return [tok.text for tok in spacy_es.tokenizer(text) if tok.text not in span_sw]

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]


In [9]:
left_tokenized = []
right_tokenized = []
test_left_tokenized = []
test_right_tokenized = []

for token in left_texts:
    left_tokenized.append(es_tokenizer(token))

for token in right_texts:
    right_tokenized.append(es_tokenizer(token))
    
for token in test_s_1:
    test_left_tokenized.append(es_tokenizer(token))

for token in test_s_2:
    test_right_tokenized.append(es_tokenizer(token))
    
    
token_counter = collections.Counter()
for sample in left_tokenized:
    for token in sample:
        if token not in token_counter:
            token_counter[token] = 1
        else:
            token_counter[token] += 1
            
for sample in right_tokenized:
    for token in sample:
        if token not in token_counter:
            token_counter[token] = 1
        else:
            token_counter[token] += 1
            
for sample in test_left_tokenized:
    for token in sample:
        if token not in token_counter:
            token_counter[token] = 1
        else:
            token_counter[token] += 1

for sample in test_right_tokenized:
    for token in sample:
        if token not in token_counter:
            token_counter[token] = 1
        else:
            token_counter[token] += 1
            
print("unique token count :", len(token_counter))

unique token count : 5824


In [10]:
# 根据词典，将数据转换成特征向量。
def encode_samples(tokenized_samples, vocab):
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in vocab.token_to_idx:
                feature.append(vocab.token_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)         
    return features

def pad_samples(features, maxlen=500, padding=0):
    padded_features = []
    for feature in features:
        if len(feature) > maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            # 添加 PAD 符号使每个序列等长（长度为 maxlen ）。
            while len(padded_feature) < maxlen:
                padded_feature.append(padding)
        padded_features.append(padded_feature)
    return padded_features

In [11]:
vocab = text.vocab.Vocabulary(token_counter, unknown_token='<unk>', reserved_tokens=None)

left_texts_features = encode_samples(left_tokenized, vocab)
right_texts_featrues = encode_samples(right_tokenized, vocab)

left_padded_features = pad_samples(left_texts_features, 30, 0)
right_texts_featrues = pad_samples(right_texts_featrues, 30, 0)

test_left_features = encode_samples(test_left_tokenized, vocab)
test_right_featrues = encode_samples(test_right_tokenized, vocab)

test_left_padded_features = pad_samples(test_left_features, 30, 0)
test_right_texts_featrues = pad_samples(test_right_featrues, 30, 0)

In [12]:
num_validation_samples = int(0.2 * len(left_padded_features))

left_x_train = left_padded_features[:-num_validation_samples]
left_x_val = left_padded_features[-num_validation_samples:]

right_x_train = right_texts_featrues[:-num_validation_samples]
right_x_val = right_texts_featrues[-num_validation_samples:]

y_train = y[:-num_validation_samples]
y_val = y[-num_validation_samples:]

In [29]:
train_inputs = []
for i in range(len(left_x_train)):
    train_inputs.append(left_x_train[i] + right_x_train[i])
    
val_inputs = []
for i in range(len(left_x_val)):
    val_inputs.append(left_x_val[i] + right_x_val[i])

test_inputs = []
for i in range(len(test_left_padded_features)):
    test_inputs.append([test_left_padded_features[i], test_right_texts_featrues[i]])

In [16]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [18]:
len(train_inputs), len(y)

(17120, 21400)

In [30]:
n_splits = 10
seed = 7

kfold = KFold(n_splits=n_splits, random_state=seed)
model = SVR()
scoring = 'neg_log_loss'
model.fit(train_inputs, y_train)
result = model.predict(val_inputs)

print("svm done")

svm done


In [32]:
len(result)

4280

In [44]:
import numpy as np

# new metric
def logloss(y_true, y_pred):
    return (y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))

In [49]:
y = logloss(np.array(y_val), np.array(result))

  """


In [58]:
for i in result:
    print(i)

0.2854065727553324
0.9000490455436165
0.2854065727553324
0.2854065727553324
0.9001161539169347
0.2854065727552919
0.2854065727553324
0.2854065727553324
0.28540663232296015
0.2854065727553324
0.09979831028889843
0.2854065727553324
0.2854065727553324
0.28540642147027984
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.9004146172842789
0.2854065727553324
0.08853585249719167
0.28434941456815627
0.2854065727553324
0.28533043564082217
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.09996246573786294
0.2854065727553324
0.10009470043830873
0.2854065727553324
0.09981997678618318
0.2854065727553324
0.2854065727553324
0.2854065727487728
0.2854065727553324
0.8999877708265487
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.6

0.10024012963941081
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.09956778250893758
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553566
0.9004085961288095
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2876040212461357
0.12595306123744657
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2851086668289438
0.2854065727553324
0.28540481781863897
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.10014978090206333
0.25576751836790723
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.10011909322748752
0.2854065727553324
0.10316358104309648
0.09989454217062499
0.2854065727553324
0.2854065727553324
0.2

0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.16069550095862084
0.2853864409521719
0.2854065727553324
0.28390881740643065
0.28540657275533227
0.2854065727553324
0.28540657229710875
0.4447470112503954
0.2854065830940838
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.28540657275528675
0.2854065727553324
0.2854065727553183
0.2854065727553324
0.6227280407428395
0.10014590101670084
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.10005607485871362
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.8999467189625321
0.2893564677005996
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.8996073809260567
0.2854065727553324
0.2854065727553324
0.285

0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.10014602505694523
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.1837646238454162
0.22177837652905963
0.09966992643456413
0.2854065727553324
0.6039532154240657
0.2854065727553324
0.2854065727553324
0.2854065727546052
0.2854065464814272
0.28540657275533265
0.2854065727553324
0.2854065727553324
0.2854065727555616
0.2854065727553324
0.09998086149791277
0.2854065727553324
0.2854065727553324
0.10015036582482079
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.43449549737024706
0.0999496529294113
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.899581933687575
0.2854065727553324
0.9000

0.2854065727553324
0.27832299204506494
0.2854065727553324
0.1001179792591462
0.2854065727553324
0.9001909851354386
0.2854056251258595
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.285406572755345
0.2854065727553324
0.099683453375884
0.28456862993637244
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.900306959266302
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2817099436556305
0.6172396265995302
0.2854065727553324
0.2854065727553324
0.2854065725486155
0.2854065727553324
0.2854065727553324
0.10014999068000008
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.7562551554732793
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.8147492014247828
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065744

0.10026040802289635
0.2854065727553324
0.0996789930734884
0.2854065753043938
0.2854065727553562
0.2854065727553324
0.2854065727553324
0.9003807188778681
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727061756
0.2854065727553324
0.2854065727553324
0.1001524975291622
0.10051819091438638
0.2854065727553324
0.2854065727553324
0.10052218196013177
0.2854065727553324
0.09959470964807413
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.10045345899988828
0.28540657275520603
0.2850113295092885
0.2854061207029729
0.09978274075840912
0.1002307319552864
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854054560102244
0.281049722027361
0.2854065727553324
0.2854065727552728
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.29616066962678417
0.099

0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.28540664309803637
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.285406572

0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2852219538324927
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065730088362
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.2854065727553324
0.28540656383966023
0.285406572

In [53]:
output

nan