In [7]:
# Import libraries, define the verbose() helper and set the run options
import time
import os
import random
import re
import codecs
import numpy as np
from collections import Counter

def verbose(*args):
    """Print all arguments on one line, separated by single spaces.

    Every argument is converted with ``str`` before joining, so any mix of
    strings and numbers can be passed. Called without arguments it prints an
    empty line.
    """
    # A parenthesised print with a single argument behaves the same as a
    # Python 2 print statement and as the Python 3 print function.
    print(" ".join([str(a) for a in args]))

class Opts:
    """Plain namespace that keeps the run options as class attributes."""

    # Regular-expression pattern; the default matches any string.
    filter_test = ".*"
    # Verbosity switch, off by default (not read by the cells visible here).
    verbose = False


# Shared options instance.
opts = Opts()

In [8]:
def load_phrases_from_file(dirname,filename,format='2017',translation=False):
    """Load the sentence pairs stored in one tab-separated input file.

    Parameters:
        dirname: directory that contains the file.
        filename: name of the file. It must look like ``*.input.*.txt`` (or
            ``*.input.*.translation.txt`` when ``translation`` is true);
            any other name is ignored without touching the disk.
        format: column layout. A false value (e.g. ``None``) takes the pair
            from columns 0 and 1; ``"2017"`` takes it from columns 2 and 3.
            Any other value yields no phrases.
        translation: when true, only ``.translation.txt`` files are accepted.

    Returns:
        A list of ``(first_sentence, second_sentence)`` tuples decoded as
        UTF-8; empty when the file name does not match.

    Lines are stripped and split on tabs. Only lines with 2 to 4 fields are
    considered, and a line is skipped when it lacks the columns the selected
    layout reads.
    """
    # Raw strings keep the regex escapes intact, and the dot before "txt" is
    # escaped so that it only matches a literal dot.
    if translation:
        re_file=re.compile(r'.*\.input\..*\.translation\.txt$')
    else:
        re_file=re.compile(r'.*\.input\..*\.txt$')

    if not re_file.match(filename):
        return []

    phrases=[]
    with codecs.open(os.path.join(dirname,filename),encoding='utf-8') as data:
        for line in data:
            bits=line.strip().split('\t')
            # Both bounds must hold; with "or" the check was always true and
            # blank or short lines ended in an IndexError below.
            if not 2<=len(bits)<=4:
                continue
            if not format:
                phrases.append((bits[0],bits[1]))
            elif format=="2017" and len(bits)==4:
                # This layout reads columns 2 and 3, so all 4 fields are needed.
                phrases.append((bits[2],bits[3]))
    return phrases

def load_gs_from_file(dirname,filename):
    """Read one gold-standard file and return its scores as floats.

    Only names of the form ``*.gs.*.txt`` or ``*.gs.*.ascii`` are read; any
    other name gives an empty list. Each line contributes one score, and a
    line that cannot be parsed as a float (for instance an empty one) counts
    as 0.0.
    """
    name_pattern=re.compile('.*\.gs\..*\.(txt|ascii)$')
    if name_pattern.match(filename) is None:
        return []

    def _score(text):
        # Unparseable entries fall back to 0.0 instead of aborting the load.
        try:
            return float(text)
        except ValueError:
            return 0.0

    with open(os.path.join(dirname,filename)) as handle:
        return [_score(row.strip()) for row in handle]

def load_all_phrases(dirname,filter=".",format=None,translation=False):
    """Collect the sentence pairs of every matching file in a directory.

    ``filter`` is a regular expression searched in each file name; names
    without a match are skipped. ``format`` and ``translation`` are passed on
    to ``load_phrases_from_file``. Returns a list of ``(filename, phrases)``
    tuples, omitting files that produced no phrases.
    """
    name_filter=re.compile(filter)
    selected=[entry for entry in os.listdir(dirname) if name_filter.search(entry)]
    loaded=[
        (entry,load_phrases_from_file(dirname,entry,format=format,translation=translation))
        for entry in selected
    ]
    return [(entry,pairs) for entry,pairs in loaded if len(pairs)>0]

def load_all_gs(dirname):
    """Collect the gold-standard scores of every file in a directory.

    Each directory entry is handed to ``load_gs_from_file``. Returns a list of
    ``(filename, scores)`` tuples, omitting entries that produced no scores.
    """
    candidates=((entry,load_gs_from_file(dirname,entry)) for entry in os.listdir(dirname))
    return [(entry,scores) for entry,scores in candidates if len(scores)>0]

In [9]:
# Data edition; selects the sub-directory referenced by TRAIN_DIRS.
YEAR = "2017"

# Tokenizer and sequence settings.
MAX_NB_WORDS = 5000          # passed to the Tokenizer as nb_words
MAX_SEQUENCE_LENGTH = 30     # padded length of a single sentence
VALIDATION_SPLIT = 0.30      # fraction of the pairs held out for validation

# Pretrained embeddings.
GLOVE_DIR = '.'              # directory that holds the GloVe text file
EMBEDDING_DIM = 100          # width of each embedding vector

# One (directory, format, translation) tuple per data source.
TRAIN_DIRS = [("../spanish_testbed/data/" + YEAR, None, True)]

In [10]:
def load_train_dirs(dirs):
    """Load the training sentences and gold scores of several data directories.

    Parameters:
        dirs: iterable of ``(directory, format, translation)`` tuples. The
            files are read from the ``train`` sub-directory of ``directory``;
            ``format`` and ``translation`` are passed to ``load_all_phrases``.

    Returns:
        ``(train_data, gs_data)``. ``train_data`` is a flat list of UTF-8
        encoded sentences in which the two sentences of a pair are adjacent;
        ``gs_data`` holds one gold score per pair, in the same order.

    Raises:
        KeyError: an input file has no gold-standard file with a matching name.
        IndexError: an input file has more pairs than its gold file has scores.
    """
    train_data=[]
    gs_data=[]
    for directory,fmt,translation in dirs:
        verbose('Starting training')
        train_dir=os.path.join(directory,'train')
        # Honour the per-directory flag; it used to be forced to True here.
        train_data_=load_all_phrases(train_dir,format=fmt,translation=translation)
        gs_data_=dict(load_all_gs(train_dir))

        for (n,d) in train_data_:
            # The gold file shares the input file name, with "input" replaced
            # by "gs" and, for translated inputs, without ".translation".
            n_=n.replace('input', 'gs')
            if translation:
                n_=n_.replace('.translation', '')
            scores=gs_data_[n_]
            # NOTE(review): pairs and scores are matched by position only; the
            # counts printed below can differ, so confirm the files line up.
            for i,s in enumerate(d):
                train_data.append(s[0].encode('utf-8'))
                train_data.append(s[1].encode('utf-8'))
                gs_data.append(scores[i])
            verbose("Phrases in",n,len(d),len(scores))
        verbose('Total train phrases',directory,sum([len(d) for n,d in train_data_]))

        # Running total of sentences (two per pair) over the directories so far.
        verbose('Total train phrases',len(train_data))
    return train_data,gs_data
    
# Load every source listed in TRAIN_DIRS. train_data holds two consecutive
# entries (one per sentence of a pair) for each gold score in gs_data.
train_data,gs_data=load_train_dirs(TRAIN_DIRS)

Starting training
Phrases in STS.2013.test.input.headlines.translation.txt 619 750
Phrases in STS.2014.train.input.li65.translation.txt 65 65
Phrases in STS.2012.test.input.SMTeuroparl.translation.txt 459 459
Phrases in STS.2014.test.input.news.translation.txt 480 480
Phrases in STS.2013.test.input.OnWN.translation.txt 212 561
Phrases in STS.2014.input.local2.sentences.translation.txt 100 101
Phrases in STS.2014.input.local.pairs-journals.translation.txt 106 133
Phrases in STS.2012.train.input.SMTeuroparl.translation.txt 734 734
Phrases in STS.2015.train.input.wikipedia.translation.txt 251 251
Phrases in STS.2012.train.input.MSRvid.translation.txt 90 750
Phrases in STS.2012.test.input.surprise.SMTnews.translation.txt 399 399
Phrases in STS.2012.train.input.MSRpar.translation.txt 750 750
Phrases in STS.2014.input.local.sentences.translation.txt 101 101
Phrases in STS.2014.test.input.wikipedia.translation.txt 324 324
Phrases in STS.2013.test.input.FNWN.translation.txt 189 189
Phrases in 

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Build the vocabulary from every sentence and turn each sentence into a
# sequence of word indices.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data)
sequences = tokenizer.texts_to_sequences(train_data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# train_data interleaves the two sentences of each pair, so rows 2*i and 2*i+1
# of the padded matrix are joined side by side into one row per pair.
data_ = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Floor division keeps the pair count an integer on Python 3 as well.
nb_pairs = data_.shape[0] // 2
data = np.zeros((nb_pairs, data_.shape[1] * 2))

for i in range(nb_pairs):
    data[i, :data_.shape[1]] = data_[2 * i]
    data[i, data_.shape[1]:] = data_[2 * i + 1]

# NOTE(review): the gold scores are parsed as floats; confirm that the way
# to_categorical maps fractional scores to classes is what is intended.
labels = to_categorical(np.asarray(gs_data))
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary: data[:-0] would leave the training set empty
# whenever the validation count rounds down to zero.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print('Shape of x_train: %s' % (x_train.shape,))
print('Shape of y_train: %s' % (y_train.shape,))
print('Shape of x_val: %s' % (x_val.shape,))
print('Shape of y_val: %s' % (y_val.shape,))

Found 16065 unique tokens.
('Shape of data tensor:', (6879, 60))
('Shape of label tensor:', (6879, 6))
('Shape of train:', (4816, 60))
('Shape of train:', (4816, 6))
('Shape of train:', (2063, 60))
('Shape of train:', (2063, 6))


In [12]:
# Map every word of the pretrained GloVe file to its vector. Each line holds
# the word followed by its coefficients, separated by whitespace.
# NOTE: the file name fixes the vector width at 100; it has to agree with
# EMBEDDING_DIM, which sizes the embedding matrix built from these vectors.
embeddings_index = {}
# The context manager closes the file even when a malformed line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line carries no word; skip it instead of failing.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
# Row i holds the pretrained vector of the word whose tokenizer index is i.
# The matrix has one row more than there are words; presumably index 0 is
# never assigned to a word, so that row stays zero (TODO confirm).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words missing from the embedding index keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Parenthesised so the cell runs under both Python 2 and Python 3.
print(embedding_matrix)

[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [-0.038194   -0.24487001  0.72812003 ..., -0.1459      0.82779998
   0.27061999]
 [-0.1529     -0.24279     0.89837003 ..., -0.59100002  1.00390005
   0.20664001]
 ..., 
 [-0.010992    0.54471999 -0.18803    ..., -0.49559999 -0.065065   -0.036046  ]
 [-0.66437    -0.50608999 -0.28353    ..., -0.84451002 -0.17738999
   0.25242001]
 [-0.11792     0.53746003  0.90812999 ..., -0.025289    0.45811999
   0.74392998]]


In [14]:
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM

# Embedding layer initialised with the pretrained matrix and kept frozen
# (trainable=False). The input length is twice MAX_SEQUENCE_LENGTH because
# every input row is two padded sentences placed side by side.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH*2,
    dropout=0.2,
    trainable=False,
)

#sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH*2,), dtype='int32')
#embedded_sequences = embedding_layer(sequence_input)
#x = Conv1D(128, 5, activation='relu')(embedded_sequences)
#x = MaxPooling1D(5)(x)
#x = Conv1D(96, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = Conv1D(86, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)  # global max pooling
#x = Flatten()(x)
#x = Dense(64, activation='relu')(x)
#preds = Dense(6, activation='softmax')(x)
#model = Model(sequence_input, preds)
#model.compile(loss='categorical_crossentropy',
#              optimizer='rmsprop',
#              metrics=['acc'])


# Sentence-pair classifier: frozen embeddings -> LSTM -> dense -> softmax.
model = Sequential()
model.add(embedding_layer)
#model.add(Conv1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
#model.add(MaxPooling1D(pool_length=2))
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(200))
# One output unit per column of the one-hot label matrix (6 in the recorded run).
model.add(Dense(6, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# happy learning!
# Keep the History object so the per-epoch metrics can be inspected later
# instead of only showing its repr as the cell output.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    nb_epoch=30, batch_size=128)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 60, 100)       1606600     embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           80400       embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 200)           20200       lstm_1[0][0]                     
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 6)             1206        dense_1[0][0]                    
Total params: 1,708,406
Trainable params: 101,806
Non-trainable params: 1,606,600
_________

<keras.callbacks.History at 0x7fbc82655190>