In [1]:
import os
# Enumerate CUDA devices in PCI bus order and expose only device 0 to this
# process. Set before the Keras/TensorFlow import in the next cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
})

In [2]:
import os
import keras.backend as K

from elmo.lm_generator import LMDataGenerator, MTLLMDataGenerator
from elmo.model_girnet import ELMo

Using TensorFlow backend.


## Note
- fasttext: sent_vector: unsup: [0.5040783036202435, 0.44986425566245447]
- fasttext: sent_vector: sup: [0.5628058728541675, 0.5187983749157842]
- fasttext: lstm: unsup: [0.6182707995419401, 0.5768815131210775]
- fasttext: lstm: sup: [0.6705, 0.6624]
- MultiBPEmb: lstm: unsup: 0.52
- MultiBPEmb: lstm: sup: 0.64

### cm_p_test_1000.txt
- vanilla: 46.75967671612614
- mtl: 39.42567465448776
- girnet: 165.677334

### cm_main_test_1000.txt
- vanilla: 73.61951568054178
- mtl: 58.723475872121696
- girnet: 132.512263

Conclusion: SUBWORDS RULE!

In [3]:
# Directory (relative to the notebook's working directory) that the generator
# cells join dataset and vocabulary file names onto.
# NOTE(review): the "twiter_scrapping" spelling presumably mirrors the on-disk
# directory name -- confirm before "correcting" it.
DATA_SET_DIR = "../twiter_scrapping/data/"

In [4]:
# Run configuration: passed whole to ELMo(parameters) and read key-by-key by the
# data-generator cells. The meaning of the model hyper-parameters is defined by
# elmo.model_girnet, not in this notebook.
parameters = {
    'name': 'cm_girnet',
    'multi_processing': True,
    'n_threads': 10,
    # True when the backend reports at least one GPU. The comparison already
    # yields a bool, so no `True if ... else False` wrapper is needed.
    # NOTE(review): _get_available_gpus is a private (underscore-prefixed) Keras
    # backend helper and may not exist in other Keras versions.
    'cuDNN': len(K.tensorflow_backend._get_available_gpus()) > 0,
    # File names below are joined onto DATA_SET_DIR by the generator cells.
    'train_dataset': 'cm_p_train_small.txt',
    'valid_dataset': 'cm_p_test_1000.txt',
    # NOTE(review): the notes cell in this notebook spells this file
    # 'cm_main_test_1000.txt' -- confirm which name exists on disk.
    'test_dataset': 'cm_test_main_1000.txt',
    'vocab': 'vocab',
    'vocab_size': 525133,
    'num_sampled': 8000,
    'charset_size': 262,
    # sentence_maxlen, token_maxlen and token_encoding are also forwarded to
    # the data generators as keyword arguments.
    'sentence_maxlen': 32,
    'token_maxlen': 50,
    'token_encoding': 'word',
    'epochs': 1,
    'patience': 2,
    'batch_size': 32,   # also forwarded to the data generators
    'clip_value': 1,
    'cell_clip': 5,
    'proj_clip': 5,
    'lr': 0.2,
    'shuffle': True,    # also forwarded to the data generators
    'n_lstm_layers': 1,
    'n_highway_layers': 1,
    # Pairs of integers; presumably [filter width, number of filters] for a
    # character CNN -- TODO confirm in elmo.model_girnet.
    'cnn_filters': [[1, 32],
                    [2, 32],
                    [3, 64],
                    [4, 128],
                    [5, 256],
                    [6, 512],
                    [7, 512]
                    ],
    'lstm_units_size': 400,
    'hidden_units_size': 200,
    'char_embedding_size': 16,
    'dropout_rate': 0.1,
    'word_dropout_rate': 0.05,
    'weight_tying': True,
    'unidirectional': True,
}



In [5]:
def _build_mtl_generator(filenames):
    """Build an MTLLMDataGenerator over ``filenames`` with the shared settings.

    Args:
        filenames: File names, each joined onto DATA_SET_DIR; the resulting
            paths are passed to MTLLMDataGenerator as a list, in this order.

    Returns:
        The MTLLMDataGenerator constructed with the vocabulary path and the
        sentence/token length, batch size, shuffle and token-encoding values
        read from ``parameters``.
    """
    return MTLLMDataGenerator(
        [os.path.join(DATA_SET_DIR, filename) for filename in filenames],
        os.path.join(DATA_SET_DIR, parameters['vocab']),
        sentence_maxlen=parameters['sentence_maxlen'],
        token_maxlen=parameters['token_maxlen'],
        batch_size=parameters['batch_size'],
        shuffle=parameters['shuffle'],
        token_encoding=parameters['token_encoding'],
    )


# Three corpora per split, always in en / es / cm order (presumably English,
# Spanish and code-mixed, going by the file-name prefixes -- TODO confirm).
train_generator = _build_mtl_generator(
    ['en_p_train_small.txt', 'es_p_train_small.txt', 'cm_p_train_small.txt'])
val_generator = _build_mtl_generator(
    ['en_p_test_1000.txt', 'es_p_test_1000.txt', 'cm_p_test_1000.txt'])
# The single test file is supplied in all three slots.
test_generator = _build_mtl_generator([parameters['test_dataset']] * 3)

In [45]:
# train_en_generator =  LMDataGenerator(os.path.join(DATA_SET_DIR, 'en_p_train_small.txt'),
#                                   os.path.join(DATA_SET_DIR, parameters['vocab']),
#                                   sentence_maxlen=parameters['sentence_maxlen'],
#                                   token_maxlen=parameters['token_maxlen'],
#                                   batch_size=parameters['batch_size'],
#                                   shuffle=parameters['shuffle'],
#                                   token_encoding=parameters['token_encoding'])

# train_es_generator =  LMDataGenerator(os.path.join(DATA_SET_DIR, 'es_p_train_small.txt'),
#                                   os.path.join(DATA_SET_DIR, parameters['vocab']),
#                                   sentence_maxlen=parameters['sentence_maxlen'],
#                                   token_maxlen=parameters['token_maxlen'],
#                                   batch_size=parameters['batch_size'],
#                                   shuffle=parameters['shuffle'],
#                                   token_encoding=parameters['token_encoding'])

# train_cm_generator =  LMDataGenerator(os.path.join(DATA_SET_DIR, 'cm_p_train_small.txt'),
#                                   os.path.join(DATA_SET_DIR, parameters['vocab']),
#                                   sentence_maxlen=parameters['sentence_maxlen'],
#                                   token_maxlen=parameters['token_maxlen'],
#                                   batch_size=parameters['batch_size'],
#                                   shuffle=parameters['shuffle'],
#                                   token_encoding=parameters['token_encoding'])

# test_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['test_dataset']),
#                                 os.path.join(DATA_SET_DIR, parameters['vocab']),
#                                 sentence_maxlen=parameters['sentence_maxlen'],
#                                 token_maxlen=parameters['token_maxlen'],
#                                 batch_size=parameters['batch_size'],
#                                 shuffle=parameters['shuffle'],
#                                 token_encoding=parameters['token_encoding'])
# val_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['valid_dataset']),
#                                 os.path.join(DATA_SET_DIR, parameters['vocab']),
#                                 sentence_maxlen=parameters['sentence_maxlen'],
#                                 token_maxlen=parameters['token_maxlen'],
#                                 batch_size=parameters['batch_size'],
#                                 shuffle=parameters['shuffle'],
#                                 token_encoding=parameters['token_encoding'])

In [18]:
# Set-up generators: one LMDataGenerator per split, built in train / valid /
# test order from the file names stored in `parameters`.
# NOTE(review): this cell rebinds train_generator, val_generator and
# test_generator, which the MTLLMDataGenerator cell earlier in the notebook
# also assigns. In a top-to-bottom run the later cells receive these
# LMDataGenerator instances -- confirm that is intended for the 'cm_girnet' run.
train_generator, val_generator, test_generator = (
    LMDataGenerator(
        os.path.join(DATA_SET_DIR, parameters[dataset_key]),
        os.path.join(DATA_SET_DIR, parameters['vocab']),
        sentence_maxlen=parameters['sentence_maxlen'],
        token_maxlen=parameters['token_maxlen'],
        batch_size=parameters['batch_size'],
        shuffle=parameters['shuffle'],
        token_encoding=parameters['token_encoding'],
    )
    for dataset_key in ('train_dataset', 'valid_dataset', 'test_dataset')
)

In [6]:
# Compile ELMo
# Builds the ELMo wrapper from the `parameters` dict, then compiles it.
# print_summary=False is passed explicitly (presumably suppresses the layer
# summary printout -- confirm in elmo.model_girnet).
elmo_model = ELMo(parameters)
elmo_model.compile_elmo(print_summary=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [None]:
# Earlier variant, kept for reference: train on each per-language generator in
# turn (those generators are only defined in a commented-out cell).
# for tr in [train_en_generator, train_es_generator, train_cm_generator]:
#     elmo_model.train(train_data=tr, valid_data=val_generator)
# 5.5218 -- NOTE(review): presumably a loss value noted from an earlier run; confirm.
# Train with whichever train_generator / val_generator are currently bound.
# resume=True is passed -- presumably continues from previously saved weights;
# TODO confirm in ELMo.train.
elmo_model.train(train_data=train_generator, valid_data=val_generator, resume=True)

Epoch 1/1
 4246/62500 [=>............................] - ETA: 12:27:07 - loss: 5.6353

In [9]:
# NOTE(review): presumably reloads temporary weights written during training
# into elmo_model -- confirm in elmo.model_girnet.
elmo_model.load_temp()

In [None]:
# Persist the model; sampled_softmax=False and temp=False are passed explicitly.
# NOTE(review): presumably writes the final (non-temporary) weights without the
# sampled-softmax layer -- confirm in ELMo.save.
elmo_model.save(sampled_softmax=False, temp=False)

In [None]:
# Reference figures noted from earlier runs, per test file and model variant.
# NOTE(review): presumably perplexities as reported by evaluate() -- confirm.
# cm_p_test_1000.txt
#   vanilla: 46.75967671612614
#   mtl:     39.42567465448776
#   girnet:  165.677334
# cm_main_test_1000.txt
#   vanilla: 73.61951568054178
#   mtl:     58.723475872121696
#   girnet:  132.512263
# Evaluate on whichever test_generator is currently bound.
elmo_model.evaluate(test_generator)

sup


In [None]:
# Build ELMo meta-model to deploy for production and persist in disk
# elmo_model.wrap_multi_elmo_encoder(print_summary=True, save=True)

In [None]:
# elmo_model.load()

In [None]:
# Get ELMo embeddings to feed as inputs for downstream tasks
# elmo_embeddings = elmo_model.get_outputs(test_generator, output_type='word', state='mean')