# Notebook para STS 2017

Experimentos de evaluación y etiquetado de datos

In [2]:
import time
import os
import random
import re
import codecs
import numpy as np
from collections import Counter

def verbose(*args):
    print(" ".join([str(a) for a in args]))

class Opts:
    verbose=False
    filter_test=".*"
    
opts=Opts()

# Código para EMBEDINGS

Este código carga los embeddings de Glove y declara una función para crear la matrix sincronizada despues

* 0 para padding
* 1 para inición de oración
* 2 Para palabras desconocidas (OOV)
* 3 inicia la numeración de palabras

In [3]:
EMBEDDING_DIM=300
GLOVE_DIR='.'
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.42B.'+str(EMBEDDING_DIM)+'d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
embeddings_index['###'] = np.zeros(100)

def create_embedding_matrix(word_index,edim,embeddings_index,nb_words):
    # prepare embedding matrix
    embedding_matrix = np.zeros((nb_words + 4, edim))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i+3] = embedding_vector

    return embedding_matrix

Found 1917494 word vectors.


# Diferentes funciones para cargar datos STS1

In [5]:
def load_phrases_from_file(dirname,filename,format='2017',translation=False):
    re_file=re.compile('.*\.input\..*\.txt$')                                        
    re_file_translation=re.compile('.*\.input\..*\.translation.txt$')
    
    if translation:                                                              
        re_file=re_file_translation
        
    phrases=[]                                                                   
    if not re_file.match(filename):                                              
        return []                                                                
    with codecs.open(os.path.join(dirname,filename),encoding='utf-8') as data:
        for line in data:
            bits=line.strip().split('\t')
            if len(bits)>=2 or len(bits)<=4:                                     
                if not format:                                                   
                    phrases.append((bits[0],bits[1]))                            
                elif format=="2017":                                             
                    phrases.append((bits[0],bits[1]))
    return phrases 

def load_gs_from_file(dirname,filename,format=None):
    re_gs=re.compile('.*\.gs\..*\.(txt|ascii)$')
    re_file_translation=re.compile('.*\.input\..*\.translation.txt$')
    gs=[]
    if not format:
        if not re_gs.match(filename):
            return []
    elif format=="2017":
        if not re_file_translation.match(filename):
            return []

    with open(os.path.join(dirname,filename)) as data:
        for line in data:
            line=line.strip()
            try:
                if not format:
                    gs.append(float(line))
                elif format=="2017":
                    bits=line.strip().split('\t')
                    if len(bits)>=2 or len(bits)<=4:                                                                                    
                        gs.append(float(bits[1])) 
            except ValueError:
                gs.append(None)
    return gs

def load_all_phrases(dirname,filter=".",format=None,translation=False):          
    all_phrases=[]                                                               
    filter_dirs=re.compile(filter)
    for filename in os.listdir(dirname):
        if not filter_dirs.search(filename):                                     
            continue
        phrases=load_phrases_from_file(dirname,filename,format=format,translation=translation)
        if len(phrases)>0: 
            all_phrases.append((filename,phrases))                               
    return all_phrases

def load_all_gs(dirname,format=None):
    all_gs=[]
    for filename in os.listdir(dirname):
        gs=load_gs_from_file(dirname,filename,format=format)
        if len(gs)>0:
            all_gs.append((filename,gs))
    return all_gs

def load_train_dirs(dirs,dir="train"):
    train_data=[]
    gs_data=[]
    for directory,format,translation in dirs: 
        verbose('Starting load '+dir+'ing',directory)
        train_data_=load_all_phrases(os.path.join(directory,dir),format=format,translation=translation)
        gs_data_=dict(load_all_gs(os.path.join(directory,dir),format=format))
        for (n,d) in train_data_:
            if not format:
                n_=n.replace('input', 'gs')
            else:
                n_=n
            if translation and not format:
                n_=n_.replace('.translation', '')
            for i,s in enumerate(d):
                if gs_data_[n_][i]:
                    train_data.append(s[0])
                    train_data.append(s[1])
                    gs_data.append(gs_data_[n_][i])
            verbose("Phrases in",n,len(d),len(gs_data_[n_]))
        verbose('Total train phrases',directory,sum([len(d) for n,d in train_data_]))
        verbose('Total train phrases',len(train_data))
    return train_data,gs_data

def infer_test_file(dirname_gs,filename_sys):
    filename=os.path.basename(filename_sys)
    bits=filename.split('.')
    h,t=os.path.split(dirname_gs)
    h,year=os.path.split(h)
    filename_gs=os.path.join(dirname_gs,
                bits[0]+'.'+year+'.gs.'+bits[3]+'.txt'
        )
    return filename_gs


def load_test_dirs(dirs,dir="test"):
    train_data={}
    gs_data=[]
    gs_files={}
    for directory,format,translation in dirs:
        verbose('Starting loading test',directory,translation)
        train_data_=load_all_phrases(os.path.join(directory,'test'),format=format,translation=translation)
        for n,d in train_data_:
            train_data[n]=[]
            gs_files[n]=infer_test_file(os.path.join(directory,'test'),n)
            for i,s in enumerate(d):
                train_data[n].append(s[0])
                train_data[n].append(s[1])
            verbose("Total phraes in",n,len(d))
        verbose('Total test phrases',directory,sum([len(d) for n,d in train_data_]))
    return train_data,gs_files
        
def prepare_data_concatenation(data_,gs_=None,test=False):
    train_data=[]
    for i in range(int(len(data_)/2)):
        train_data.append(data_[i*2]+" ### "+data_[i*2+1])
        #if not test:
        #    train_data.append(data_[i*2+1]+" ### "+data_[i*2])
    if not gs_:
        return train_data
    else:
        gs_data=[]
        for i in range(len(gs_)):
            gs_data.append(gs_[i])
            if not test:
                gs_data.append(gs_[i])
        return train_data,gs_data

import re
punct=re.compile(r"([,.?\)\(!-;\"'])")

def prepare_sentence(sntc):
    return punct.sub(" punt\\1 ",sntc)
    
    
def prepare_data_separated(data_,gs_=None,test=False):
    train_data1=[]
    train_data2=[]
    for i in range(int(len(data_)/2)):
        train_data1.append(prepare_sentence(data_[i*2]))
        train_data2.append(prepare_sentence(data_[i*2+1]))
        if not test:
            train_data1.append(prepare_sentence(data_[i*2+1]))
            train_data2.append(prepare_sentence(data_[i*2]))
    if not gs_:
        return train_data1,train_data2
    else:
        gs_data=[]
        for i in range(len(gs_)):
            gs_data.append(gs_[i])
            if not test:
                gs_data.append(gs_[i])
        return train_data1,train_data2,gs_data

# Declaración de variables

Dos secciones E para evaluación con datos de 2016 y data 2017

In [6]:
EYEAR="2016"
YEAR="2017"
ETRAIN_DIRS=[
    ("../english_testbed/data/"+EYEAR,None,False)]

ETEST_DIRS=[
    ("../english_testbed/data/"+EYEAR,None,False)]


TRAIN_DIRS=[
    ("../arabic_testbed/data/"+YEAR,"2017",True),
    ("../english_arabic_testbed/data/"+YEAR,"2017",True),
    ("../english_spanish_testbed/data/"+YEAR,None,True),
    ("../english_testbed/data/"+YEAR,None,False),
    ("../spanish_testbed/data/"+YEAR,None,True),
    ]
TEST_DIRS=[
    ("../arabic_testbed/data/"+YEAR,"2017",True),
    ("../english_arabic_testbed/data/"+YEAR,"2017",True),
    ("../english_spanish_testbed/data/"+YEAR,None,True),
    ("../english_testbed/data/"+YEAR,None,False),
    ("../english_turkish_testbed/data/"+YEAR,"2017",True),
    ("../spanish_testbed/data/"+YEAR,None,True),
]

GS_FILES={}

# Carga training para evaluación (desarrollo)

In [7]:
# LOADING EVALUATION TRAINING
etrain_data_,egs_data_=load_train_dirs(ETRAIN_DIRS)

print("Avg size:",np.mean([len(x.split()) for x in etrain_data_]))
print("Max size:",np.max([len(x.split()) for x in etrain_data_]))
print("Min size:",np.min([len(x.split()) for x in etrain_data_]))

etrain_data1,etrain_data2,egs_data=prepare_data_separated(etrain_data_,egs_data_)
    
print("Avg size after merging:",np.mean([len(x.split()) for x in etrain_data1]))
print("Max size after merging:",np.max([len(x.split()) for x in etrain_data1]))
print("Min size after merging:",np.min([len(x.split()) for x in etrain_data1]))
print("Total examples",len(etrain_data1))
print("Total labels",len(egs_data))


Starting load training ../english_testbed/data/2016
Phrases in STS.2013.test.input.headlines.txt 750 750
Phrases in STS.2012.train.input.SMTeuroparl.txt 734 734
Phrases in STS.2015.input.images.txt 1500 1500
Phrases in STS.2014.test.input.images.txt 750 750
Phrases in STS.2015.input.answers-students.txt 1500 1500
Phrases in STS.2015.input.headlines.txt 1500 1500
Phrases in STS.2012.test.input.MSRvid.txt 750 750
Phrases in STS.2012.test.input.MSRpar.txt 750 750
Phrases in STS.2012.test.input.SMTeuroparl.txt 459 459
Phrases in STS.2012.train.input.MSRpar.txt 750 750
Phrases in STS.2014.test.input.OnWN.txt 750 750
Phrases in STS.2012.train.input.MSRvid.txt 750 750
Phrases in STS.2012.test.input.surprise.SMTnews.txt 399 399
Phrases in STS.2014.test.input.deft-news.txt 300 300
Phrases in STS.2013.test.input.OnWN.txt 561 561
Phrases in STS.2014.test.input.headlines.txt 750 750
Phrases in STS.2013.test.input.FNWN.txt 189 189
Phrases in STS.2014.test.input.deft-forum.txt 450 450
Phrases in STS

# Carga test para evaluación (desarrollo)

In [8]:
# LOADING EVALUATION TESTING
etest_data_, egs_files=load_test_dirs(ETEST_DIRS,dir="test")
    
print("Total evaluating files",len(etest_data_))
print("Total candidates gs files",len(egs_files))

Starting loading test ../english_testbed/data/2016 False
Total phraes in STS.2016.input.postediting.txt 3287
Total phraes in STS.2016.input.question-question.txt 1555
Total phraes in STS.2016.input.answer-answer.txt 1572
Total phraes in STS.2016.input.plagiarism.txt 1271
Total phraes in STS.2016.input.headlines.txt 1498
Total test phrases ../english_testbed/data/2016 9183
Total evaluating files 5
Total candidates gs files 5


# Carga datos de entrenamiento (2017)

In [9]:
# LOADING TRAINING
train_data_,gs_data_=load_train_dirs(TRAIN_DIRS)

print("Avg size:",np.mean([len(x.split()) for x in train_data_]))
print("Max size:",np.max([len(x.split()) for x in train_data_]))
print("Min size:",np.min([len(x.split()) for x in train_data_]))

train_data1,train_data2,gs_data=prepare_data_separated(train_data_,gs_data_)
    
print("Avg size after merging:",np.mean([len(x.split()) for x in train_data1]))
print("Max size after merging:",np.max([len(x.split()) for x in train_data1]))
print("Min size after merging:",np.min([len(x.split()) for x in train_data1]))
print("Total examples",len(train_data1))
print("Total labels",len(gs_data))

Starting load training ../arabic_testbed/data/2017
Phrases in STS.2017.input.SMTeuroparl.translation.txt 203 203
Phrases in STS.2017.input.MSRpar.translation.txt 510 510
Total train phrases ../arabic_testbed/data/2017 713
Total train phrases 0
Starting load training ../english_arabic_testbed/data/2017
Phrases in STS.2017.input.SMTeuroparl.translation.txt 406 406
Phrases in STS.2017.input.MSRpar.translation.txt 1020 1020
Phrases in STS.2017.input.MSRvid.translation.txt 736 736
Total train phrases ../english_arabic_testbed/data/2017 2162
Total train phrases 0
Starting load training ../english_spanish_testbed/data/2017
Phrases in STS.2016.input.multisource.translation.txt 294 294
Phrases in STS.2016.input.news.translation.txt 301 301
Total train phrases ../english_spanish_testbed/data/2017 595
Total train phrases 1034
Starting load training ../english_testbed/data/2017
Phrases in STS.2013.test.input.headlines.txt 750 750
Phrases in STS.2012.train.input.SMTeuroparl.txt 734 734
Phrases in S

# Carga datos de prueba 2017

In [10]:
test_data_, gs_files =load_test_dirs(TEST_DIRS)
print("Total examples",len(test_data_))

Starting loading test ../arabic_testbed/data/2017 True
Total phraes in STS.2017.input.track1.ar-ar.translation.txt 250
Total test phrases ../arabic_testbed/data/2017 250
Starting loading test ../english_arabic_testbed/data/2017 True
Total phraes in STS.2017.input.track2.ar-en.translation.txt 250
Total test phrases ../english_arabic_testbed/data/2017 250
Starting loading test ../english_spanish_testbed/data/2017 True
Total phraes in STS.2017.input.track4a.es-en.translation.txt 250
Total phraes in STS.2017.input.track4b.es-en.translation.txt 250
Total test phrases ../english_spanish_testbed/data/2017 500
Starting loading test ../english_testbed/data/2017 False
Total phraes in STS.2017.input.track5.en-en.txt 250
Total test phrases ../english_testbed/data/2017 250
Starting loading test ../english_turkish_testbed/data/2017 True
Total phraes in STS.2017.input.track6.tr-en.translation.txt 500
Total test phrases ../english_turkish_testbed/data/2017 500
Starting loading test ../spanish_testbed/

# Función para preparar secuencias en Keras

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

MAX_NB_WORDS=15000
MAX_SEQUENCE_LENGTH=30
VALIDATION_SPLIT=0.1
LSTM_DIM=100

def texts_to_sequences(word_index,nb_words,xs):
    nb_words = min(nb_words, len(word_index))
    xs_=[]
    for x in xs:
        x_=[1]
        for w in x.lower().split():
            i=word_index.get(w)
            if i: 
                if i>nb_words:
                    x_.append(2)
                else:
                    x_.append(i+3)
            else:
                x_.append(2)
        xs_.append(x_)
    return xs_,nb_words
    
def skipodd(vals):
    return vals[range(0,vals.shape[0],2)]
        
def prepare_keras_data(train_data1,train_data2,validation_split=VALIDATION_SPLIT,tokenizer=None,gs_data=None,nb_words=None):
    print('Shape of data1 tensor:', len(train_data1))
    print('Shape of data2 tensor:', len(train_data2))
    if not tokenizer:
        tokenizer = Tokenizer(nb_words=nb_words)
        tokenizer.fit_on_texts(train_data1+train_data2)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    
    sequences1,nb_words = texts_to_sequences(word_index,MAX_NB_WORDS,train_data1)
    sequences2,nb_words = texts_to_sequences(word_index,MAX_NB_WORDS,train_data2)
    print("Avg seq 1",np.mean([len(seq) for seq in sequences1]))
    print("Avg seq 2",np.mean([len(seq) for seq in sequences2]))
    print("Max seq 1",np.max([len(seq) for seq in sequences1]))
    print("Max seq 2",np.max([len(seq) for seq in sequences2]))   
    
    data1 = pad_sequences(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
    data2 = pad_sequences(sequences2, maxlen=MAX_SEQUENCE_LENGTH)
    #data=np.zeros((data_.shape[0]/2,data_.shape[1]*2))
    #for i in range(data_.shape[0]/2):
    #    data[i,:data_.shape[1]]=data_[2*i]
    #    data[i,data_.shape[1]:]=data_[2*i+1]

    print('Shape of data1 tensor:', data1.shape)
    print('Shape of data2 tensor:', data2.shape)
    if gs_data:
        print(gs_data[:10])
        #labels = to_categorical(np.asarray(gs_data))
        labels = np.asarray(gs_data)
        print('Shape of label tensor:', labels.shape)

    
    if validation_split!= 0.0:
        # split the data into a training set and a validation set
        indices_ = np.arange(int(data1.shape[0]/2))
        np.random.shuffle(indices_)
        indices=[]
        for i in indices_:
            indices.append(2*indices_[i])
            indices.append(2*indices_[i]+1)
        print("Indices",indices[:10])
        data1 = data1[indices]
        data2 = data2[indices]
        if gs_data:
            labels = labels[indices]
        nb_validation_samples = int(validation_split * data1.shape[0])
        indices_ = np.arange(len(data1)-nb_validation_samples)
        np.random.shuffle(indices_)
        x1_train = data1[:-nb_validation_samples][indices_]
        x2_train = data2[:-nb_validation_samples][indices_]
        y_train = labels[:-nb_validation_samples][indices_]
        x1_val = skipodd(data1[-nb_validation_samples:])
        x2_val = skipodd(data2[-nb_validation_samples:])
        y_val = skipodd(labels[-nb_validation_samples:])

        print('Shape of X1 train:',x1_train.shape)
        print('Shape of X2 train:',x2_train.shape)
        print('Shape of y train:',y_train.shape)
        print('Shape of X1 test :',x1_val.shape)
        print('Shape of X2 test :',x2_val.shape)
        print( 'Shape of y test :',y_val.shape)
    
        return x1_train,x2_train,y_train,x1_val,x2_val,y_val, word_index, tokenizer, nb_words
    else:
        x1_train = data1
        x2_train = data2
        print('Shape of X1 train:'),x1_train.shape
        print('Shape of X2 train:'),x2_train.shape
        return x1_train,x2_train, word_index, tokenizer, nb_words
        

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, cuDNN not available)


# Llamado a funicón para preparar datos

In [12]:
x1_etrain,x2_etrain,y_etrain,x1_eval,x2_eval,y_eval, eword_index, etok, nb_ewords =\
    prepare_keras_data(etrain_data1,etrain_data2,gs_data=egs_data,nb_words=MAX_NB_WORDS)

Shape of data1 tensor: 26958
Shape of data2 tensor: 26958
Found 17861 unique tokens.
Avg seq 1 15.2497959789
Avg seq 2 15.2497959789
Max seq 1 115
Max seq 2 115
Shape of data1 tensor: (26958, 30)
Shape of data2 tensor: (26958, 30)
[2.6, 2.6, 4.4, 4.4, 2.6, 2.6, 3.8, 3.8, 4.2, 4.2]
Shape of label tensor: (26958,)
Indices [3274, 3275, 656, 657, 13834, 13835, 12282, 12283, 9608, 9609]
Shape of X1 train: (24263, 30)
Shape of X2 train: (24263, 30)
Shape of y train: (24263,)
Shape of X1 test : (1348, 30)
Shape of X2 test : (1348, 30)
Shape of y test : (1348,)


# Llamado de función para preparar matrix de datos de embedding

In [13]:
ematrix=create_embedding_matrix(eword_index,EMBEDDING_DIM,embeddings_index,nb_ewords)
print("Embedding matrix evaluation", ematrix.shape)
#ematrix= np.zeros((nb_ewords + 4, EMBEDDING_DIM))

Embedding matrix evaluation (15004, 300)


# Modelo de STS en keras

In [14]:
from keras.layers import Embedding,Merge
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM,Bidirectional, Flatten, Lambda, MaxoutDense, Activation, Reshape, Dropout
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU

def create_model(word_index,matrix,nb_words):
    embedding_layer1 = Embedding(nb_words + 4,
                           EMBEDDING_DIM,
                            weights=[matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            dropout=0.2,
                            trainable=False)
    embedding_layer2 = Embedding(nb_words + 4,
                            EMBEDDING_DIM,
                            weights=[matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            dropout=0.2,
                            trainable=False)
    
    sentence_A = Sequential()
    sentence_A.add(embedding_layer1)
    #sentence_A.add(Conv1D(128,3,activation='relu'))
    sentence_A.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
                             dropout_W=0.2,return_sequences=True,
                             dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    sentence_A.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
                             dropout_W=0.2,#return_sequences=True,
                             dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_A.add(Conv1D(64,3,activation='relu'))
    #sentence_A.add(MaxPooling1D(2))
    #sentence_A.add(Bidirectional(LSTM(output_dim=int(LSTM_DIM/4),
    #                         dropout_W=0.2,return_sequences=True,
    #                         dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_A.add(Flatten())

    #sentence_A.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
    #                         dropout_W=0.2,
    #                         dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_A.add(Dense(80))                  


    sentence_B = Sequential()
    sentence_B.add(embedding_layer2)
    #sentence_B.add(Conv1D(128,3,activation='relu'))
    sentence_B.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
                             dropout_W=0.2,return_sequences=True,
                             dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    sentence_B.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
                             dropout_W=0.2,#return_sequences=True,
                             dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_B.add(Conv1D(64,3,activation='relu'))
    #sentence_B.add(MaxPooling1D(2))
    #sentence_B.add(Bidirectional(LSTM(output_dim=int(LSTM_DIM/4),
    #                         dropout_W=0.2,return_sequences=True,
    #                         dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_B.add(Flatten())
    
    #sentence_B.add(Bidirectional(LSTM(output_dim=LSTM_DIM,
    #                         dropout_W=0.2,
    #                         dropout_U=0.2,input_length=MAX_SEQUENCE_LENGTH)))
    #sentence_B.add(Dense(80))                      

    pair_sent = Merge([sentence_A, sentence_B],mode='concat')
    

    similarity = Sequential()
    similarity.add(pair_sent)
    #similarity.add(Dense(100))
    similarity.add(MaxoutDense(300))
    similarity.add(MaxoutDense(100))
    similarity.add(MaxoutDense(1))
    #similarity.add(PReLU())
    
    #similarity.add(Dense(1))
    #similarity.add(Activation('linear'))
    similarity.compile(loss='mean_absolute_error', optimizer='rmsprop', metrics=['mean_absolute_error','mean_squared_error'])
    #similarity.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    print(similarity.summary())

    
    #model = Sequential()
    #model.add(embedding_layer)
    #model.add(Conv1D(128,5,activation='relu'))
    #model.add(Conv1D(32,3,activation='relu'))
    #model.add(MaxPooling1D(5))
    #model.add(Dropout(0.2))
    #model.add(Bidirectional(LSTM(100, dropout_W=0.2, dropout_U=0.2)))
    #model.add(Dense(6, activation='softmax'))
    #model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    #print(model.summary())
    return similarity,sentence_A,sentence_B


# Creación de modelo para evaluación (desarrollo)

In [15]:
import gc
for i in range(3): gc.collect()
# happy learning!
model,p1,p2 = create_model(eword_index,ematrix,nb_ewords)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 30, 300)       4501200                                      
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 30, 200)       320800                                       
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 200)           240800                                       
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 30, 300)       4501200                                      
___________________________________________________________________________________________

# Entrenamiento para evaluación (desarrollo)

In [16]:
model.fit([x1_etrain,x2_etrain], y_etrain, validation_data=([x1_eval,x2_eval], y_eval),
          nb_epoch=5, batch_size=1024)

Train on 24263 samples, validate on 1348 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa625f096a0>

# Evaluación sobre datos de prueba (2016 inglés)

In [17]:
# EVALUATING
cmd=["perl",
    "../english_testbed/data/2015/evaluate/correlation-noconfidence.pl"]
from subprocess import Popen, PIPE, STDOUT

def eval_tmp(cmd,filename_gs,filename_sys): 
    cmd=cmd+[filename_gs,filename_sys]     
    p = Popen(cmd,  stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    res=stdout.decode("utf-8") 
    res=res.replace('Pearson: ','').strip()
    return float(res)

for dir,data in etest_data_.items():
    print("Evaluating",dir,len(data))
    data1,data2=prepare_data_separated (data,test=True)
    print("Size data1",len(data1))
    print("Size data2",len(data2))
    print(data1[28])
    print(data2[28])
    x1_test_eval,x2_test_eval, eword_index, etok, _ = prepare_keras_data(data1,data2,tokenizer=etok, validation_split=0.0)
    print("Size data",len(data1))
    print(x1_test_eval[4])
    print(x2_test_eval[4])
    res=model.predict([x1_test_eval,x2_test_eval])
    res=np.clip(res,0,5)
    #res=[np.argmax(x) for x in res]
    print(res[:10])
    filename=os.path.join('.',dir)
    fn=open(filename,'w')
    for num in res:
        print( "{0:1.1f}".format(num[0]),file=fn)
    fn.close()
    print(eval_tmp(cmd,egs_files[dir],filename))


Evaluating STS.2016.input.answer-answer.txt 3144
Size data1 1572
Size data2 1572
There punt' s also what the string is made of punt. 
There is also a Youtube punt- Version of the film punt. 
Shape of data1 tensor: 1572
Shape of data2 tensor: 1572
Found 17861 unique tokens.
Avg seq 1 12.6444020356
Avg seq 2 12.596692112
Max seq 1 32
Max seq 2 31
Shape of data1 tensor: (1572, 30)
Shape of data2 tensor: (1572, 30)
Shape of X1 train:
Shape of X2 train:
Size data 1572
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   1  32  13 727 103  29 100 540   2]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   1  32 103   5 100 540  26  46   2]
[[ 2.38190007]
 [ 2.61398244]
 [ 2.80528879]
 [ 2.82091236]
 [ 3.12403798]
 [ 2.55776191]
 [ 3.18180346]
 [ 3.271029  ]
 [ 2.50552034]
 [ 2.88644123]]
0.13761
Evaluating STS.2016.input.question-question.txt 3110
Size data1 1555
Size data2 1555
How does arbitration work in baseball punt

# Se preparan datos para training (2017)

In [18]:
print("Loading data for trainig")
x1_train,x2_train,y_train,x1_val,x2_val,y_val, word_index, tok, nb_words =\
    prepare_keras_data(train_data1,train_data2,gs_data=gs_data,nb_words=MAX_NB_WORDS)

matrix=create_embedding_matrix(word_index,EMBEDDING_DIM,embeddings_index,nb_words)
print("Embedding matrix evaluation", ematrix.shape)

Loading data for trainig
Shape of data1 tensor: 42906
Shape of data2 tensor: 42906
Found 22899 unique tokens.
Avg seq 1 17.6131310306
Avg seq 2 17.6131310306
Max seq 1 115
Max seq 2 115
Shape of data1 tensor: (42906, 30)
Shape of data2 tensor: (42906, 30)
[1.0, 1.0, 1.4, 1.4, 2.2, 2.2, 2.4, 2.4, 0.2, 0.2]
Shape of label tensor: (42906,)
Indices [18036, 18037, 35608, 35609, 26902, 26903, 33584, 33585, 12764, 12765]
Shape of X1 train: (38616, 30)
Shape of X2 train: (38616, 30)
Shape of y train: (38616,)
Shape of X1 test : (2145, 30)
Shape of X2 test : (2145, 30)
Shape of y test : (2145,)
Embedding matrix evaluation (15004, 300)


# Se carga modelo y se entrena con (2017)

In [20]:
print("Trainig")
import gc
for i in range(3): gc.collect()
# happy learning!
model_final,_,_ = create_model(word_index,matrix,nb_words)

model_final.fit([x1_train,x2_train], y_train, validation_data=([x1_val,x2_val], y_val),
          nb_epoch=5, batch_size=1024)

Trainig
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 30, 300)       4501200                                      
____________________________________________________________________________________________________
bidirectional_5 (Bidirectional)  (None, 30, 200)       320800                                       
____________________________________________________________________________________________________
bidirectional_6 (Bidirectional)  (None, 200)           240800                                       
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 30, 300)       4501200                                      
___________________________________________________________________________________

<keras.callbacks.History at 0x7fa61038e208>

# Se etiquetan los datos

In [21]:
for dir,data in test_data_.items():
    print("Evaluating",dir,len(data))
    data1_test,data2_test=prepare_data_separated (data,test=True)
    print("Size data1",len(data1_test))
    print("Size data2",len(data2_test))
    print("Sntc 30:",data1_test[30])
    print("Sntc 30:",data2_test[30])
    x1_test,x2_test, word_index, tok, _ = prepare_keras_data(data1_test,data2_test,tokenizer=tok, validation_split=0.0)
    res=model.predict([x1_test_eval,x2_test_eval])
    print("Sequence 30:",x1_test[30])
    print("Sequence 30:",x2_test[30])
    res=model.predict([x1_test,x2_test])
    res=np.clip(res,0,5)
    #res=[np.argmax(x) for x in res]
    print(res[:10])
    filename=os.path.join('.',dir)
    fn=open(filename,'w')
    for num in res:
        print( "{0:1.1f}".format(num[0]),file=fn)
    fn.close()

Evaluating STS.2017.input.track4a.es-en.translation.txt 500
Size data1 250
Size data2 250
Sntc 30: A woman looks at another person punt. 
Sntc 30: A woman is looking at a person punt. 
Shape of data1 tensor: 250
Shape of data2 tensor: 250
Found 22899 unique tokens.
Avg seq 1 10.212
Avg seq 2 10.68
Max seq 1 23
Max seq 2 24
Shape of data1 tensor: (250, 30)
Shape of data2 tensor: (250, 30)
Shape of X1 train:
Shape of X2 train:
Sequence 30: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    1    6   53 2393   30  163  124    2]
Sequence 30: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   1   6  53  11 472  30   6 124   2]
[[ 3.06108546]
 [ 2.16144609]
 [ 2.7401576 ]
 [ 2.6038816 ]
 [ 2.3957541 ]
 [ 2.14492369]
 [ 2.89793825]
 [ 2.54378891]
 [ 3.12350321]
 [ 2.7526865 ]]
Evaluating STS.2017.input.track5.en-en.txt 500
Size data1 250
Size data2 250
Sntc 30: A woman drives a golf cart punt.