First we import the modules we need; the data paths used below are provided by `config`.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np

from feature import get_feature, get_tokenizer
from model import train_model
from config import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load

Then we read the training data (labelled and unlabelled) and the test data from the input files.

In [3]:
# Column layouts of the labelled, tab-separated training files. Column names are
# supplied here, so the files are read as header-less TSV.
en_first_columns = ['en0', 'es0', 'en1', 'es1', 'label']
es_first_columns = ['es0', 'en0', 'es1', 'en1', 'label']

# Labelled sentence pairs (English-first and Spanish-first files).
df_en_train = pd.read_csv(english_train_path, names=en_first_columns, sep='\t')
df_es_train = pd.read_csv(spanish_train_path, names=es_first_columns, sep='\t')

# Unlabelled Spanish/English sentences and the Spanish-only test pairs.
df_es2en = pd.read_csv(unlabel_spanish_train_path, names=['es', 'en'], sep='\t')
df_test = pd.read_csv(test_path, names=['es0', 'es1'], sep='\t')

In [4]:
# Row counts of the labelled Spanish training set and of the test set.
df_es_train.shape[0], df_test.shape[0]

(1400, 5000)

In [5]:
# Every Spanish sentence column from both the training and the test frames is handed
# to get_tokenizer, so a single tokenizer is built from all four columns.
spanish_text_columns = [
    df_es_train['es0'],
    df_es_train['es1'],
    df_test['es0'],
    df_test['es1'],
]
tokenizer = get_tokenizer(spanish_text_columns)

## Feature

Now we begin to generate some features.

In [6]:
from feature import *

In [7]:
# Add the feature columns (token sequences, word2vec vectors, similarity scores)
# for the Spanish training pairs.
# NOTE(review): the result overwrites `df_es_train`, so re-running this cell feeds an
# already-featurised frame back into get_feature — presumably harmless, but confirm.
df_es_train = get_feature(df_es_train, tokenizer)

max_features: 3000 len(word_index): 4479


In [8]:
# Preview only the first rows: dumping the whole training frame bloats the notebook
# and hides the narrative; ten rows are enough to check the generated columns.
df_es_train.head(10)

Unnamed: 0,es0,en0,es1,en1,label,seq_es0,seq_es1,word2vec_es0,word2vec_es1,word2vec_dot,word2vec_minkowski_1,word2vec_minkowski_2,ratio,partial_ratio,token_sort_ratio,token_set_ratio,jaccard
0,"?No he podido pagar con mi tarjeta, que debo h...","I have not been able to pay with my card, what...",No puedo pagar mi pedido con mi tarjeta.,I can not pay for my order with my card.,1,"[3, 14, 501, 34, 13, 4, 39, 6, 134, 30]","[3, 11, 34, 4, 5, 13, 4, 39]","[-0.09848111020401121, -0.3257103390060365, -0...","[-0.08322976168710738, -0.34989594703074545, -...",0.925530,12.955,0.925,66,75,60,83,0.416667
1,"?Por qué aparece ""no pagado"" cuando el pago ya...","Why does it appear ""not paid"" when the payment...","He pagado por transferencia bancaria, pero el ...","I paid by bank transfer, but the payment has n...",0,"[17, 25, 70, 3, 61, 79, 1, 21, 40, 19, 56, 657...","[14, 61, 17, 182, 149, 23, 1, 21, 3, 19, 56, 28]","[-0.08723050633755823, -0.15971393374105294, -...","[-0.061844552925322205, -0.1655001516919583, -...",0.906559,11.997,0.866,58,61,57,68,0.350000
2,?Cuándo recibiré mi reembolso si pago con tarj...,When will I receive my refund if I pay by cred...,¿Cuándo recibiré el reembolso si cancelo mi pe...,When will I receive the refund if I cancel my ...,0,"[209, 192, 4, 27, 37, 21, 13, 39, 2, 67]","[126, 192, 1, 27, 37, 174, 4, 5]","[-0.11918800044804811, -0.15604545725509525, -...","[-0.021451888023875654, -0.25099424785003066, ...",0.865520,16.677,1.222,69,75,54,65,0.285714
3,?Qué pasará después de abrir una disputa?,What will happen after opening a dispute?,¿Qué pasará después de haber enviado mi solici...,What will happen after I have sent my Warranty...,0,"[25, 791, 105, 2, 59, 15, 24]","[49, 791, 105, 2, 362, 80, 4, 348, 2, 183, 2, ...","[-0.01832228366817747, -0.0315937123128346, -0...","[0.008612751339872679, -0.11249358331163724, -...",0.795804,17.385,1.238,58,72,55,60,0.214286
4,El producto que he recibido no corresponde con...,The product I received does not correspond wit...,He recibido un producto que no funciona.,I received a product that does not work.,0,"[1, 31, 6, 14, 28, 3, 836, 13, 9, 792, 25, 11,...","[14, 28, 7, 31, 6, 3, 151]","[-0.08463253940527256, -0.14742532143226036, -...","[-0.026241430746657506, -0.1894456585869193, -...",0.899695,11.822,0.875,47,65,59,80,0.333333
5,?Por qué se ha cerrado mi pedido?,Why has my order closed?,¿Por qué mi pedido está cerrado por Pedido rec...,Why is my order closed by rejected Order?,0,"[17, 25, 22, 19, 129, 4, 5]","[44, 25, 4, 5, 51, 129, 17, 5, 769]","[0.03419442927198751, -0.140893142670393, -0.1...","[0.03805022334886922, -0.17470577731728554, -0...",0.942257,9.976,0.744,68,70,59,81,0.500000
6,?Por qué no puedo realizar un pedido?,Why can not I place an order?,¿Puedo cancelar un pedido pagado por adelantado?,Can I cancel an order paid in advance?,0,"[17, 25, 3, 11, 132, 7, 5]","[78, 89, 7, 5, 61, 17, 1296]","[-0.06148357183805534, -0.1879522848342146, -0...","[-0.029315711398209845, -0.10635985913021224, ...",0.836799,17.403,1.264,41,38,53,58,0.272727
7,?Por qué mi pago está deducido incorrectamente?,Why is my payment incorrectly deducted?,¿Por qué mi pago está deducido incorrectamente?,Why is my payment incorrectly deducted?,1,"[17, 25, 4, 21, 51, 657, 1362]","[44, 25, 4, 21, 51, 657, 1362]","[-0.08008042696331229, -0.16622642720384256, -...","[-0.08259456977248192, -0.1654464259211506, -0...",0.965622,8.375,0.608,93,93,86,93,0.750000
8,?Como puedo pagar con mi tarjeta de credito/de...,How can I pay with my credit / debit card?,No puedo pagar mi pedido con tarjeta Visa de d...,I can not pay for my order with a Visa debit c...,0,"[38, 11, 34, 13, 4, 39, 2, 305, 753]","[3, 11, 34, 4, 5, 13, 39, 169, 2, 199, 49, 11,...","[-0.10482134266446035, -0.2628592876717448, -0...","[-0.10374700686392876, -0.2647035817544048, -0...",0.922838,12.597,0.896,60,64,65,73,0.400000
9,?Por qué mi pedido está en verificación tras e...,Why is my order in verification after payment?,¿En qué etapa está mi pedido?,At what stage is my order?,0,"[17, 25, 4, 5, 51, 12, 367, 344, 1, 21]","[451, 25, 51, 4, 5]","[-0.042997299786657096, -0.12093540951609612, ...","[0.03579020202159881, -0.19327099844813347, -0...",0.831887,20.238,1.462,55,63,54,82,0.363636


In [9]:
def _word2vec_distance(row):
    """Return the Euclidean (L2) norm of the difference between a row's two sentence vectors."""
    return np.linalg.norm(row['word2vec_es1'] - row['word2vec_es0'])


# Row-wise distance between the word2vec vectors of both sentences.
# NOTE(review): this column is only added to the training frame and is not listed in
# `predictors`; compute it for the test features as well before using it in the model.
df_es_train['word2vec_euclidean'] = df_es_train.apply(_word2vec_distance, axis=1)


In [10]:
# List the word2vec dot-product scores that fall below 0.5.
low_dot_mask = df_es_train['word2vec_dot'] < 0.5
df_es_train.loc[low_dot_mask, 'word2vec_dot'].tolist()

[0.4400021691476747, 0.43348914781443015]

## Model

In [16]:
from model import lgb_modelfit_nocv, train_model

In [17]:
# Train on a single feature: the word2vec dot-product score.
predictors = ['word2vec_dot']

# train_model returns the fitted model together with the iteration to predict with.
best_model, best_iteration = train_model(df_es_train, predictors)

preparing validation datasets
Training until validation scores don't improve for 30 rounds.
[10]	valid's binary_logloss: 0.562413
[20]	valid's binary_logloss: 0.762468
[30]	valid's binary_logloss: 0.948914
Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.408277

Model Report
bst1.best_iteration:  1
binary_logloss: 0.4082766336763148


In [13]:
# Run the test pairs through the same feature function and tokenizer as the training
# data, so the columns named in `predictors` are available for prediction.
feature_test = get_feature(df_test, tokenizer)

max_features: 3000 len(word_index): 4479


In [14]:
# Score the test features with the selected iteration and collect the predictions
# in a one-column frame named 'result'.
test_scores = best_model.predict(feature_test[predictors], num_iteration=best_iteration)
sub = pd.DataFrame({'result': test_scores})

In [15]:
# Write the predictions as bare values (no index, no header) with nine decimals.
submission_path = '../output/submission.txt'
sub.to_csv(submission_path, index=False, header=False, float_format='%.9f')