In [1]:
from functions import *
from twitterTokenizer import Tokenizer
import numpy as np, random
import subprocess 
# Seed BOTH generators used by this notebook: NumPy's (mini-batch sampling) and the
# stdlib `random` module (task selection in the training loop), for reproducibility.
np.random.seed(1337)
random.seed(1337)
from keras.layers.normalization  import BatchNormalization
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional, LSTM, Input, merge

Using Theano backend.


In [2]:
# Load the raw SemEval data; every loader result is unpacked into an (inputs, labels) pair.
# NOTE(review): the loaders are not defined in this notebook -- presumably they come from
# `from functions import *`; their exact return types should be confirmed there.
# Training data for the 5-class task (labels are later binarised over [-2, -1, 0, 1, 2]).
X_train, y_train = load_SemEval_from_file('./data/subtaskCE.train_dev.tsv')
# Development split, used for model selection in the training loop.
X_dev, y_dev = load_SemEval_from_file('./data/subtaskCE.devtest.tsv')
# Test split: the loader takes two files (presumably tweets and gold labels -- TODO confirm).
X_test, y_test = load_SemEval_SubTaskCE_Test('./data/SemEval2016-task4-test.subtask-BCDE.txt', './data/SemEval2016_task4_subtaskC_test_gold.txt')
# Auxiliary 3-class data (labels are later binarised over [-1, 0, 1]).
X_train_ternary, y_train_ternary = load_SemEval_subtaskA('./data/subtaskA.downloaded.tsv')

In [3]:
# Pre-computed extra feature matrices, one per data split. They are stored sparse
# (later cells call .todense() on them before feeding the network).
X_train_additional = load_sparse_csr('./additional_features/X_train_additional.npz')
X_dev_additional = load_sparse_csr('./additional_features/X_dev_additional.npz')
X_test_additional = load_sparse_csr('./additional_features/X_test_additional.npz')
X_ternary_additional = load_sparse_csr('./additional_features/X_ternary_additional.npz')

# Cell output: the four shapes, as a quick check that row counts match the text splits.
(
    X_train_additional.shape,
    X_dev_additional.shape,
    X_test_additional.shape,
    X_ternary_additional.shape,
)

((7292, 1368), (1778, 1368), (20632, 1368), (5500, 1368))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Hyper-parameters read by later cells.
# NOTE(review): MAX_FEATURES is not passed to the vectoriser in this cell, and
# BATCH_SIZE is reassigned in the training cell -- confirm both are intentional.
MAX_FEATURES = 11000
MAX_LEN = 30
BATCH_SIZE = 64

tokenizer = Tokenizer(preserve_case=False)

# Unigram vocabulary learned from both training corpora (5-class + ternary), using
# the Twitter-aware tokenizer instead of scikit-learn's default token pattern.
vec = CountVectorizer(
    analyzer='word',
    tokenizer=tokenizer.tokenize,
    ngram_range=(1, 1),
    stop_words=None,
)
vec.fit(X_train + X_train_ternary)

# Apply the fitted vocabulary to every split (same order as the names on the left).
x_train, x_train_ternary, x_dev, x_test = map(
    vec.transform, (X_train, X_train_ternary, X_dev, X_test)
)

print("Train shape", x_train.shape, "Dev shape", x_dev.shape, "Test shape", x_test.shape, "%d vocabulary terms found"%len(vec.vocabulary_))

('Train shape', (7292, 14356), 'Dev shape', (1778, 14356), 'Test shape', (20632, 14356), '14356 vocabulary terms found')


In [5]:
def _column_ids_per_row(csr):
    """Cut a CSR matrix's flat `indices` array into one array of column ids per row."""
    # indptr[1:-1] holds the interior row boundaries inside `indices`.
    return np.split(csr.indices, csr.indptr[1:-1])


# One array of vocabulary ids per tweet, later padded and fed to the embedding layer.
# NOTE(review): a CSR row stores each non-zero column once, so repeated tokens collapse
# and the original token order is presumably not preserved -- confirm that this
# "set of ids" input is what the LSTM is meant to consume.
x_train_nn = _column_ids_per_row(x_train)
x_train_ternary_nn = _column_ids_per_row(x_train_ternary)
x_dev_nn = _column_ids_per_row(x_dev)
x_test_nn = _column_ids_per_row(x_test)

In [6]:
def _pad_to_max_len(id_sequences):
    """Bring every id sequence to exactly MAX_LEN entries with keras' pad_sequences defaults."""
    return sequence.pad_sequences(id_sequences, maxlen=MAX_LEN)


print('Pad sequences (samples x time)')
x_train_nn = _pad_to_max_len(x_train_nn)
x_train_ternary_nn = _pad_to_max_len(x_train_ternary_nn)
x_dev_nn = _pad_to_max_len(x_dev_nn)
x_test_nn = _pad_to_max_len(x_test_nn)

# Report the resulting (samples, MAX_LEN) shapes.
print('X_train shape:', x_train_nn.shape)
print('X_ternary shape:', x_train_ternary_nn.shape)
print('X_dev shape:', x_dev_nn.shape)
print('X_test shape:', x_test_nn.shape)

Pad sequences (samples x time)
('X_train shape:', (7292, 30))
('X_ternary shape:', (5500, 30))
('X_dev shape:', (1778, 30))
('X_test shape:', (20632, 30))


In [7]:
import os, sys
import io

EMBEDDING_DIM = 50

# Parse the pre-trained GloVe Twitter vectors into {word: float32 vector}.
# The file is decoded explicitly as UTF-8 so that the keys are text strings, like the
# vocabulary keys looked up below; with a plain byte-mode read under Python 2,
# non-ASCII words could never match. `with` guarantees the handle is closed even if
# a line fails to parse.
embeddings_index = {}
glove_path = os.path.join("./data/", 'glove.twitter.27B.50d.txt')
# newline='\n' keeps the original line framing (split on '\n' only).
with io.open(glove_path, encoding='utf-8', newline='\n') as f:
    for line in f:
        # Fields are separated by single spaces; split on ' ' only so that a token
        # consisting of an exotic Unicode whitespace character is not swallowed.
        values = line.rstrip().split(u' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Initial weights for the Embedding layer: row i holds the GloVe vector of the word
# whose vocabulary id is i. Words not found in the embedding index stay all-zeros.
# NOTE(review): the matrix has one more row than the vocabulary, but ids are used
# unshifted (0..V-1), so the last row is never filled and id 0 is shared with the
# padding value that pad_sequences inserts by default -- confirm whether ids were
# meant to be shifted by one.
embedding_matrix = np.zeros((len(vec.vocabulary_) + 1, EMBEDDING_DIM))
for key, val in vec.vocabulary_.items():
    embedding_vector = embeddings_index.get(key)
    if embedding_vector is not None:
        embedding_matrix[val] = embedding_vector
print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer


def _as_singleton_sets(labels):
    """Wrap every scalar label in a one-element list (the binarizer takes label collections)."""
    return [[label] for label in labels]


# One indicator column per class of the 5-point scale, in the order given by `classes`.
mlb = MultiLabelBinarizer(classes=[-2, -1, 0, 1, 2])
y_train_nn = mlb.fit_transform(_as_singleton_sets(y_train))
y_test_nn = mlb.transform(_as_singleton_sets(y_test))

# Same encoding for the auxiliary 3-class labels.
mlb2 = MultiLabelBinarizer(classes=[-1, 0, 1])
y_train_nn_ternary = mlb2.fit_transform(_as_singleton_sets(y_train_ternary))
# y_test_nn = mlb.transform([[y] for y in y_train_task2])

In [9]:
from sklearn import utils

# 'balanced' weights for the five classes, estimated from the training labels.
_balanced_weights = utils.compute_class_weight('balanced', [-2, -1, 0, 1, 2], y_train)
# Re-key by position 0..4, i.e. the column order of the binarised 5-class labels.
class_weights = dict(zip(range(5), _balanced_weights))

In [12]:
# Build two Keras models that share one trunk: a 5-class head (task 1) and a
# 3-class head (task 2) on top of the same text + extra-feature representation.
# NOTE(review): this cell uses Keras 1.x argument names (merge, output_dim, dropout_W,
# Model(input=..., output=...)); statement order is kept as-is because layers are
# created, and therefore initialised, in this order.
print('Build models...')


# Text branch: MAX_LEN vocabulary ids per tweet.
main_input = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')

# Embedding initialised from the GloVe-based embedding_matrix and left trainable.
x = Embedding(input_dim = len(vec.vocabulary_)+1, output_dim = EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LEN, trainable=True, dropout=0.3)(main_input)
x = BatchNormalization()(x)

# Bidirectional LSTM (50 units per direction) over the embedded sequence.
lstm_out = Bidirectional(LSTM(output_dim = 50, input_dim = EMBEDDING_DIM, dropout_W=0.3, dropout_U=0.3) )(x)


# Extra-feature branch: 1368 is hard-coded and must equal the column count of the
# X_*_additional matrices.
auxiliary_input = Input(shape=(1368,), name='aux_input')
t_auxiliary_input = Dense(256, activation='tanh')(auxiliary_input)
t_auxiliary_input = Dropout(0.5)(t_auxiliary_input)

# Concatenate the LSTM output with the transformed extra features.
x = merge([lstm_out, t_auxiliary_input], mode='concat')


# Shared hidden layer feeding both output heads.
x = Dense(30, activation='tanh', )(x)
x = Dropout(0.5)(x)

# Two softmax heads on the same hidden representation: 5 classes and 3 classes.
task1_output = Dense(5, activation='softmax', name='main_output')(x)
task2_output = Dense(3, activation='softmax', name='aux_output')(x)


# Both models take the same two inputs and differ only in the output head, so every
# layer up to the shared hidden layer is common to both.
model_task1 = Model(input=[main_input, auxiliary_input], output=[task1_output])
model_task2 = Model(input=[main_input, auxiliary_input], output=[task2_output])

model_task1.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_task2.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])
#model_task1.summary()
#model_task2.summary()

Build models...


In [13]:
BATCH_SIZE = 128
N_BATCHES = 600 * 5
# Probability that a step trains the 5-class head (task 1); otherwise the 3-class
# head (task 2) is trained. random.random() is always < 1.0, so at 1.0 the auxiliary
# branch below never runs -- lower this value to enable multi-task training.
TASK1_PROB = 1.0
# Evaluate every EVAL_EVERY batches (presumably ~one pass over the 7292 training
# tweets at batch size 128 -- TODO confirm).
EVAL_EVERY = 57

# The evaluation inputs never change, so densify the extra features once instead of
# on every evaluation round.
dev_inputs = {'main_input': x_dev_nn, 'aux_input': X_dev_additional.todense()}
test_inputs = {'main_input': x_test_nn, 'aux_input': X_test_additional.todense()}

# One [dev macro-MAE, test macro-MAE] pair per evaluation round.
results = []
for batch in range(N_BATCHES):
    if random.random() < TASK1_PROB:
        # Task 1: random mini-batch (sampled with replacement) of 5-class data,
        # trained with balanced class weights.
        sample = np.random.randint(0, len(x_train_nn), BATCH_SIZE)
        x_sampled, y_sampled, x_aux = x_train_nn[sample], y_train_nn[sample], X_train_additional[sample]
        model_task1.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense()}, [y_sampled], class_weight=class_weights, sample_weight=None)
    else:
        # Task 2: random mini-batch of the ternary data, no class weighting.
        sample = np.random.randint(0, len(x_train_ternary_nn), BATCH_SIZE)
        x_sampled, y_sampled, x_aux = x_train_ternary_nn[sample], y_train_nn_ternary[sample], X_ternary_additional[sample]
        model_task2.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense()}, [y_sampled], class_weight=None, sample_weight=None)

    if batch % EVAL_EVERY == 0:
        dev_preds = np.argmax(model_task1.predict(dev_inputs, batch_size=BATCH_SIZE, verbose=0), axis=1)
        test_preds = np.argmax(model_task1.predict(test_inputs, batch_size=BATCH_SIZE, verbose=0), axis=1)
        # argmax yields a column index 0..4; subtract 2 to get back to labels -2..2.
        results.append([macroMAE(y_dev, dev_preds - 2), macroMAE(y_test, test_preds - 2)])
        # Single pre-formatted string: prints identically under Python 2 and 3.
        print("Iteration: %d \tDEV: %s \tTEST: %s" % (batch // EVAL_EVERY + 1, results[-1][0], results[-1][1]))

# Earlier best, without the extra features: iteration 68, DEV 0.775022887016, TEST 0.778202313573.
# Probably need cross-validation or a larger validation set; increasing dropout helped.
# Model selection: report the round with the lowest DEV macro-MAE (lower is better).
best_run = np.argmin(np.asarray(results)[:, 0])
print(results[best_run])

Iteration: 1 	DEV: 1.92295925391 	TEST: 1.93709559081
Iteration: 2 	DEV: 1.18776405748 	TEST: 1.05533295264
Iteration: 3 	DEV: 0.895966946297 	TEST: 0.816812569005
Iteration: 4 	DEV: 1.02211077479 	TEST: 0.892998712215
Iteration: 5 	DEV: 0.954160935923 	TEST: 0.826061924628
Iteration: 6 	DEV: 0.924547925944 	TEST: 0.786129369759
Iteration: 7 	DEV: 0.854076047407 	TEST: 0.735604932049
Iteration: 8 	DEV: 0.799235732629 	TEST: 0.657763888703
Iteration: 9 	DEV: 0.833436881142 	TEST: 0.726908912671
Iteration: 10 	DEV: 0.784366450381 	TEST: 0.682021796296
Iteration: 11 	DEV: 0.755167897014 	TEST: 0.665676846315
Iteration: 12 	DEV: 0.81856971338 	TEST: 0.71937841112
Iteration: 13 	DEV: 0.762962818998 	TEST: 0.658635105364
Iteration: 14 	DEV: 0.773821181079 	TEST: 0.708637987154
Iteration: 15 	DEV: 0.775601371349 	TEST: 0.665637346606
Iteration: 16 	DEV: 0.759631361236 	TEST: 0.703419706797
Iteration: 17 	DEV: 0.787738085984 	TEST: 0.751457377915
Iteration: 18 	DEV: 0.804429359786 	TEST: 0.747