In [1]:
#Import libraries, prepare verbose and set options
import time
import os
import random
import re
import codecs
import numpy as np
from collections import Counter

def verbose(*args):
    print(" ".join([str(a) for a in args]))

class Opts:
    verbose=False
    filter_test=".*"
    
opts=Opts()

In [2]:
GLOVE_DIR='.'
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
embeddings_index['###'] = np.zeros(100)

def create_embedding_matrix(word_index,edim,embeddings_index):
    embedding_matrix = np.zeros((len(word_index) + 1, edim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

Found 400000 word vectors.


In [3]:
def load_phrases_from_file(dirname,filename,format='2017',translation=False):
    re_file=re.compile('.*\.input\..*\.txt$')                                        
    re_file_translation=re.compile('.*\.input\..*\.translation.txt$')
    
    if translation:                                                              
        re_file=re_file_translation
        
    phrases=[]                                                                   
    if not re_file.match(filename):                                              
        return []                                                                
    with codecs.open(os.path.join(dirname,filename),encoding='utf-8') as data:
        for line in data:
            bits=line.strip().split('\t')
            if len(bits)>=2 or len(bits)<=4:                                     
                if not format:                                                   
                    phrases.append((bits[0],bits[1]))                            
                elif format=="2017":                                             
                    phrases.append((bits[2],bits[3]))
    return phrases 

def load_gs_from_file(dirname,filename):
    re_gs=re.compile('.*\.gs\..*\.(txt|ascii)$')
    gs=[]
    if not re_gs.match(filename):
        return []

    with open(os.path.join(dirname,filename)) as data:
        for line in data:
            line=line.strip()
            try:
                gs.append(float(line))
            except ValueError:
                gs.append(0.0)
    return gs

def load_all_phrases(dirname,filter=".",format=None,translation=False):          
    all_phrases=[]                                                               
    filter_dirs=re.compile(filter)
    for filename in os.listdir(dirname):
        if not filter_dirs.search(filename):                                     
            continue
        phrases=load_phrases_from_file(dirname,filename,format=format,translation=translation)
        if len(phrases)>0:                                                       
            all_phrases.append((filename,phrases))                               
    return all_phrases

def load_all_gs(dirname):
    all_gs=[]
    for filename in os.listdir(dirname):
        gs=load_gs_from_file(dirname,filename)
        if len(gs)>0:
            all_gs.append((filename,gs))
    return all_gs

def load_train_dirs(dirs,dir="train"):
    train_data=[]
    gs_data=[]
    for directory,format,translation in dirs: 
        verbose('Starting load '+dir+'ing',directory)
        train_data_=load_all_phrases(os.path.join(directory,dir),format=format,translation=translation)
        gs_data_=dict(load_all_gs(os.path.join(directory,dir)))

        for (n,d) in train_data_:
            n_=n.replace('input', 'gs')
            if translation:
                n_=n_.replace('.translation', '')
            for i,s in enumerate(d):
                train_data.append(s[0])
                train_data.append(s[1])
                gs_data.append(gs_data_[n_][i])
            verbose("Phrases in",n,len(d),len(gs_data_[n_]))
        verbose('Total train phrases',directory,sum([len(d) for n,d in train_data_]))
        verbose('Total train phrases',len(train_data))
    return train_data,gs_data

def infer_test_file(dirname_gs,filename_sys):
    filename=os.path.basename(filename_sys)
    bits=filename.split('.')
    h,t=os.path.split(dirname_gs)
    h,year=os.path.split(h)
    filename_gs=os.path.join(dirname_gs,
                bits[0]+'.'+year+'.gs.'+bits[3]+'.txt'
        )
    return filename_gs


def load_test_dirs(dirs,dir="test"):
    train_data={}
    gs_data=[]
    gs_files={}
    for directory,format,translation in dirs:
        verbose('Starting loading test',directory,translation)
        train_data_=load_all_phrases(os.path.join(directory,'test'),format=format,translation=translation)
        for n,d in train_data_:
            train_data[n]=[]
            gs_files[n]=infer_test_file(os.path.join(directory,'test'),n)
            for i,s in enumerate(d):
                train_data[n].append(s[0])
                train_data[n].append(s[1])
            verbose("Total phraes in",n,len(d))
        verbose('Total test phrases',directory,sum([len(d) for n,d in train_data_]))
    return train_data,gs_files
        
def prepare_data_concatenation(data_,gs_=None,test=False):
    train_data=[]
    for i in range(int(len(data_)/2)):
        train_data.append(data_[i*2]+" ### "+data_[i*2+1])
        if not test:
            train_data.append(data_[i*2+1]+" ### "+data_[i*2])
    if not gs_:
        return train_data
    else:
        gs_data=[]
        for i in range(len(gs_)):
            gs_data.append(gs_[i])
            if not test:
                gs_data.append(gs_[i])
        return train_data,gs_data
    
def prepare_data_separated(data_,gs_=None,test=False):
    train_data1=[]
    train_data2=[]
    for i in range(int(len(data_)/2)):
        train_data1.append(data_[i*2])
        train_data2.append(data_[i*2+1])
        if not test:
            train_data1.append(data_[i*2+1])
            train_data2.append(data_[i*2])
    if not gs_:
        return train_data1,train_data2
    else:
        gs_data=[]
        for i in range(len(gs_)):
            gs_data.append(gs_[i])
            if not test:
                gs_data.append(gs_[i])
        return train_data1,train_data2,gs_data

In [4]:
EYEAR="2016"
YEAR="2017"
ETRAIN_DIRS=[
    ("../english_testbed/data/"+EYEAR,None,False)]

ETEST_DIRS=[
    ("../english_testbed/data/"+EYEAR,None,False)]


TRAIN_DIRS=[
    ("../spanish_testbed/data/"+YEAR,None,True),
    ("../english_testbed/data/"+YEAR,None,False)]
TEST_DIRS=[
    ("../spanish_testbed/data/"+YEAR,None,True),
    ("../english_testbed/data/"+YEAR,None,False)]

GS_FILES={}

In [6]:
# LOADING EVALUATION TRAINING
etrain_data_,egs_data_=load_train_dirs(ETRAIN_DIRS)

print("Avg size:",np.mean([len(x.split()) for x in etrain_data_]))
print("Max size:",np.max([len(x.split()) for x in etrain_data_]))
print("Min size:",np.min([len(x.split()) for x in etrain_data_]))

etrain_data1,etrain_data2,egs_data=prepare_data_separated(etrain_data_,egs_data_)
    
print("Avg size after merging:",np.mean([len(x.split()) for x in etrain_data1]))
print("Max size after merging:",np.max([len(x.split()) for x in etrain_data1]))
print("Min size after merging:",np.min([len(x.split()) for x in etrain_data1]))
print("Total examples",len(etrain_data1))
print("Total labels",len(egs_data))


Starting load training ../english_testbed/data/2016
Phrases in STS.2013.test.input.headlines.txt 750 750
Phrases in STS.2012.train.input.SMTeuroparl.txt 734 734
Phrases in STS.2015.input.images.txt 1500 1500
Phrases in STS.2014.test.input.images.txt 750 750
Phrases in STS.2015.input.answers-students.txt 1500 1500
Phrases in STS.2015.input.headlines.txt 1500 1500
Phrases in STS.2012.test.input.MSRvid.txt 750 750
Phrases in STS.2012.test.input.MSRpar.txt 750 750
Phrases in STS.2012.test.input.SMTeuroparl.txt 459 459
Phrases in STS.2012.train.input.MSRpar.txt 750 750
Phrases in STS.2014.test.input.OnWN.txt 750 750
Phrases in STS.2012.train.input.MSRvid.txt 750 750
Phrases in STS.2012.test.input.surprise.SMTnews.txt 399 399
Phrases in STS.2014.test.input.deft-news.txt 300 300
Phrases in STS.2013.test.input.OnWN.txt 561 561
Phrases in STS.2014.test.input.headlines.txt 750 750
Phrases in STS.2013.test.input.FNWN.txt 189 189
Phrases in STS.2014.test.input.deft-forum.txt 450 450
Phrases in STS

In [7]:
# LOADING EVALUATION TESTING
etest_data_, egs_files=load_test_dirs(ETEST_DIRS,dir="test")
    
print("Total evaluating files",len(etest_data_))
print("Total candidates gs files",len(egs_files))

Starting loading test ../english_testbed/data/2016 False
Total phraes in STS.2016.input.postediting.txt 3287
Total phraes in STS.2016.input.question-question.txt 1555
Total phraes in STS.2016.input.answer-answer.txt 1572
Total phraes in STS.2016.input.plagiarism.txt 1271
Total phraes in STS.2016.input.headlines.txt 1498
Total test phrases ../english_testbed/data/2016 9183
Total evaluating files 5
Total candidates gs files 5


In [8]:
# LOADING TRAINING
train_data_,gs_data_=load_train_dirs(TRAIN_DIRS)

print("Avg size:",np.mean([len(x.split()) for x in train_data_]))
print("Max size:",np.max([len(x.split()) for x in train_data_]))
print("Min size:",np.min([len(x.split()) for x in train_data_]))

train_data1,train_data2,gs_data=prepare_data_separated(train_data_,gs_data_)
    
print("Avg size after merging:",np.mean([len(x.split()) for x in train_data1]))
print("Max size after merging:",np.max([len(x.split()) for x in train_data1]))
print("Min size after merging:",np.min([len(x.split()) for x in train_data1]))
print("Total examples",len(train_data1))
print("Total labels",len(gs_data))

Starting load training ../spanish_testbed/data/2017
Phrases in STS.2012.test.input.MSRvid.translation.txt 750 750
Phrases in STS.2014.test.input.news.translation.txt 480 480
Phrases in STS.2013.test.input.FNWN.translation.txt 189 189
Phrases in STS.2012.test.input.MSRpar.translation.txt 750 750
Phrases in STS.2013.test.input.headlines.translation.txt 619 750
Phrases in STS.2012.test.input.SMTeuroparl.translation.txt 459 459
Phrases in STS.2012.test.input.surprise.SMTnews.translation.txt 399 399
Phrases in STS.2012.train.input.SMTeuroparl.translation.txt 734 734
Phrases in STS.2014.input.local2.sentences.translation.txt 100 101
Phrases in STS.2012.train.input.MSRpar.translation.txt 750 750
Phrases in STS.2014.test.input.wikipedia.translation.txt 324 324
Phrases in STS.2015.train.input.wikipedia.translation.txt 251 251
Phrases in STS.2014.train.input.li65.translation.txt 65 65
Phrases in STS.2012.train.input.MSRvid.translation.txt 90 750
Phrases in STS.2014.input.local.sentences.translat

In [9]:
test_data_ =load_test_dirs(TEST_DIRS)
print("Total examples",len(test_data_))

Starting loading test ../spanish_testbed/data/2017 True
Total phraes in STS.2017.input.track3.es-es.translation.txt 250
Total test phrases ../spanish_testbed/data/2017 250
Starting loading test ../english_testbed/data/2017 False
Total phraes in STS.2017.input.track5.en-en.txt 250
Total test phrases ../english_testbed/data/2017 250
Total examples 2


In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

MAX_NB_WORDS=1000
MAX_SEQUENCE_LENGTH=40
VALIDATION_SPLIT=0.1
EMBEDDING_DIM=100

def prepare_keras_data(train_data1,train_data2,validation_split=VALIDATION_SPLIT,tokenizer=None,gs_data=None):
    print('Shape of data1 tensor:', len(train_data1))
    print('Shape of data2 tensor:', len(train_data2))
    if not tokenizer:
        tokenizer = Tokenizer(nb_words=MAX_NB_WORDS,lower=False)
        tokenizer.fit_on_texts(train_data1+train_data2)
    sequences1 = tokenizer.texts_to_sequences(train_data1)
    sequences2 = tokenizer.texts_to_sequences(train_data2)
    
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
    data2 = pad_sequences(sequences2, maxlen=MAX_SEQUENCE_LENGTH)
    #data=np.zeros((data_.shape[0]/2,data_.shape[1]*2))
    #for i in range(data_.shape[0]/2):
    #    data[i,:data_.shape[1]]=data_[2*i]
    #    data[i,data_.shape[1]:]=data_[2*i+1]

    print('Shape of data1 tensor:', data1.shape)
    print('Shape of data2 tensor:', data2.shape)
    if gs_data:
        labels = to_categorical(np.asarray(gs_data))
        print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    indices_ = np.arange(int(data1.shape[0]/2))
    np.random.shuffle(indices_)
    indices=[]
    for i in indices_:
        indices.append(2*indices_[i])
        indices.append(2*indices_[i]+1)
    data1 = data1[indices]
    data2 = data2[indices]
    if gs_data:
        labels = labels[indices]
    if validation_split!= 0.0:
        nb_validation_samples = int(validation_split * data1.shape[0])

        x1_train = data1[:-nb_validation_samples]
        x2_train = data2[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x1_val = data1[-nb_validation_samples:]
        x2_val = data2[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]

        print('Shape of X1 train:',x1_train.shape)
        print('Shape of X2 train:',x2_train.shape)
        print('Shape of y train:',y_train.shape)
        print('Shape of X1 test :',x1_val.shape)
        print('Shape of X2 test :',x2_val.shape)
        print( 'Shape of y test :',y_val.shape)
    
        return x1_train,x2_train,y_train,x1_val,x2_val,y_val, word_index, tokenizer
    else:
        x1_train = data1
        x2_train = data2
        print('Shape of X1 train:'),x1_train.shape
        print('Shape of X2 train:'),x2_train.shape
        return x1_train,x2_train, word_index, tokenizer
        

In [18]:
x1_etrain,x2_etrain,y_etrain,x1_eval,x2_eval,y_eval, eword_index, etok = prepare_keras_data(etrain_data1,etrain_data2,gs_data=egs_data)

Shape of data1 tensor: 39684
Shape of data2 tensor: 39684
Found 29683 unique tokens.
Shape of data1 tensor: (39684, 40)
Shape of data2 tensor: (39684, 40)
Shape of label tensor: (39684, 6)
Shape of X1 train: (35716, 40)
Shape of X2 train: (35716, 40)
Shape of y train: (35716, 6)
Shape of X1 test : (3968, 40)
Shape of X2 test : (3968, 40)
Shape of y test : (3968, 6)


In [19]:
ematrix=create_embedding_matrix(eword_index,EMBEDDING_DIM,embeddings_index)
print("Embedding matrix evaluation", ematrix.shape)

Embedding matrix evaluation (29684, 100)


In [20]:
from keras.layers import Embedding,Merge
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM,Bidirectional


def create_model(word_index,matrix):
    embedding_layer1 = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            dropout=0.2,
                            trainable=False)
    embedding_layer2 = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            dropout=0.2,
                            trainable=False)

    sentence_A = Sequential()
    sentence_A.add(embedding_layer1)            
    sentence_A.add(Bidirectional(LSTM(output_dim=EMBEDDING_DIM,
                             dropout_W=0.2,
                             dropout_U=0.2)))
    #sentence_A.add(Dense(80))                  


    sentence_B = Sequential()
    sentence_B.add(embedding_layer2)              
    sentence_B.add(Bidirectional(LSTM(output_dim=EMBEDDING_DIM,
                             dropout_W=0.2,
                             dropout_U=0.2)))  
    #sentence_B.add(Dense(80))                      

    pair_sent = Merge([sentence_A, sentence_B], mode="concat")

    similarity = Sequential()
    similarity.add(pair_sent)
    similarity.add(Dense(6, activation='softmax')) 

    similarity.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    print(similarity.summary())

    
    #model = Sequential()
    #model.add(embedding_layer)
    #model.add(Conv1D(128,5,activation='relu'))
    #model.add(Conv1D(32,3,activation='relu'))
    #model.add(MaxPooling1D(5))
    #model.add(Dropout(0.2))
    #model.add(Bidirectional(LSTM(100, dropout_W=0.2, dropout_U=0.2)))
    #model.add(Dense(6, activation='softmax'))
    #model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    #print(model.summary())
    return similarity


In [21]:
# happy learning!
model = create_model(eword_index,ematrix)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 40, 100)       2968400                                      
____________________________________________________________________________________________________
bidirectional_3 (Bidirectional)  (None, 200)           160800                                       
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 40, 100)       2968400                                      
____________________________________________________________________________________________________
bidirectional_4 (Bidirectional)  (None, 200)           160800                                       
___________________________________________________________________________________________

In [None]:
model.fit([x1_etrain,x2_etrain], y_etrain, validation_data=([x1_eval,x2_eval], y_eval),
          nb_epoch=10, batch_size=1024)

Train on 35716 samples, validate on 3968 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [16]:
# EVALUATING
cmd=["perl",
    "../english_testbed/data/2015/evaluate/correlation-noconfidence.pl"]
from subprocess import Popen, PIPE, STDOUT


def eval_tmp(cmd,filename_gs,filename_sys): 
    cmd=cmd+[filename_gs,filename_sys]     
    p = Popen(cmd,  stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    res=stdout.decode("utf-8") 
    res=res.replace('Pearson: ','').strip()
    return float(res)

for dir,data in etest_data_.items():
    print("Evaluating",dir,len(data))
    data1,data2=prepare_data_separated (data,test=True)
    print("Size data1",len(data1))
    print("Size data2",len(data2))
    x1_eval,x2_eval, eword_index, etok = prepare_keras_data(data1,data2,tokenizer=etok, validation_split=0.0)
    print("Size data",len(data1))
    res=model.predict([x1_eval,x2_eval])
    res=[np.argmax(x) for x in res]
    print(res[:10])
    filename=os.path.join('.',dir)
    fn=open(filename,'w')
    for num in res:
        print( "{0:1.1f}".format(num),file=fn)
    fn.close()
    print(eval_tmp(cmd,egs_files[dir],filename))


Evaluating STS.2016.input.answer-answer.txt 3144
Size data1 1572
Size data2 1572
Shape of data1 tensor: 1572
Shape of data2 tensor: 1572
Found 29683 unique tokens.
Shape of data1 tensor: (1572, 20)
Shape of data2 tensor: (1572, 20)
Shape of X1 train:
Shape of X2 train:
Size data 1572
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0.01686
Evaluating STS.2016.input.headlines.txt 2996
Size data1 1498
Size data2 1498
Shape of data1 tensor: 1498
Shape of data2 tensor: 1498
Found 29683 unique tokens.
Shape of data1 tensor: (1498, 20)
Shape of data2 tensor: (1498, 20)
Shape of X1 train:
Shape of X2 train:
Size data 1498
[0, 0, 0, 4, 0, 0, 3, 0, 0, 4]
0.10432
Evaluating STS.2016.input.plagiarism.txt 2542
Size data1 1271
Size data2 1271
Shape of data1 tensor: 1271
Shape of data2 tensor: 1271
Found 29683 unique tokens.
Shape of data1 tensor: (1271, 20)
Shape of data2 tensor: (1271, 20)
Shape of X1 train:
Shape of X2 train:
Size data 1271
[0, 4, 0, 0, 0, 0, 0, 0, 4, 4]
0.02829
Evaluating STS.2016.input.question-