In [None]:
import os
from glob import glob
import numpy as np

import tensorflow as tf 
import tensorflow_hub as hub 
import tensorflow.compat.v1 as tf1

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, Input, Lambda

In [None]:
# Switch TensorFlow to TF1 (graph-mode) behaviour. Nothing is downloaded in this
# cell; the training and prediction cells later drive the model through explicit
# tf.compat.v1 sessions, which rely on this setting.

tf1.disable_eager_execution()
tf1.disable_v2_behavior()

In [None]:
file_path = 'GYAFC_Corpus.7z'
if not os.path.isfile(file_path):
    !wget -O GYAFC_Corpus.7z "https://docs.google.com/uc?export=download&id=18KvT3MHnKtlHcFyna0044CxNbdgOLJXU"
    !7z x GYAFC_Corpus.7z

In [None]:
def data_read(data_path):
    """Read every file matching a glob pattern and return their non-empty lines.

    Args:
        data_path: Glob pattern (as understood by ``glob.glob``) selecting the
            text files to read.

    Returns:
        list[str]: Lines of all matching files with the newline removed,
        concatenated file by file in sorted path order. Empty lines are
        skipped. An empty list is returned when no file matches.
    """
    data = []
    # glob() gives no ordering guarantee; sorting keeps the result reproducible.
    for file_name in sorted(glob(data_path)):
        # Explicit encoding: the default depends on the platform locale.
        with open(file_name, encoding='utf-8') as f:
            # A file ending in '\n' makes split('\n') emit a trailing '' entry;
            # such empty "sentences" would otherwise show up in both classes.
            data.extend(line for line in f.read().split('\n') if line)
    return data

In [None]:
# Glob templates for the corpus files. '{}' is filled with the split directory
# name via str.format; the first '*' matches any directory directly under
# GYAFC_Corpus/, and the trailing '*' matches every file whose name starts with
# 'formal' / 'informal'.
path_formal = 'GYAFC_Corpus/*/{}/formal*'
path_inform = 'GYAFC_Corpus/*/{}/informal*'

In [None]:
# Load the formal / informal sentences of every corpus split.
data_train_form = data_read(path_formal.format('train'))
data_train_inform = data_read(path_inform.format('train'))

# 'tune' is the corpus' development split, so it backs the validation variables;
# the held-out 'test' split backs the test variables.
data_valid_form = data_read(path_formal.format('tune'))
data_valid_inform = data_read(path_inform.format('tune'))

data_test_form = data_read(path_formal.format('test'))
data_test_inform = data_read(path_inform.format('test'))

In [None]:
# Load the ELMo v3 module from TF Hub; `elmo` is the callable used inside
# ELMoEmbedding.
# NOTE(review): trainable=True is requested, yet model.summary() reports 0
# parameters for the Lambda layer wrapping this module — confirm whether the
# module's weights are actually updated by model.fit.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [None]:
def prep_dataset(formal, informal, shuffle=True, seed=42):
    """Build a de-duplicated, labelled dataset from formal and informal sentences.

    Args:
        formal: Iterable of formal sentences; each gets label 0.
        informal: Iterable of informal sentences; each gets label 1.
        shuffle: When True (default), sentences and labels are permuted
            together so both classes are spread over the whole dataset.
        seed: Seed of the permutation, so repeated runs give the same order.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a list of sentences and ``y`` is a
        1-D float NumPy array of the same length holding 0.0 / 1.0 labels.
    """
    # dict.fromkeys removes duplicates like set() does but keeps first-seen
    # order, so the result no longer depends on per-process string hashing.
    formal = list(dict.fromkeys(formal))
    informal = list(dict.fromkeys(informal))

    X = formal + informal
    y = np.concatenate((np.zeros(len(formal)), np.ones(len(informal))))

    if shuffle and len(X) > 1:
        # Without shuffling, the data is one block of formal sentences followed
        # by one block of informal ones; Keras' `validation_split` takes the
        # tail of the arrays, which would then contain a single class.
        order = np.random.RandomState(seed).permutation(len(X))
        X = [X[i] for i in order]
        y = y[order]
    return X, y

In [None]:
# Turn each split into (sentences, labels); per prep_dataset, formal sentences
# are labelled 0 and informal sentences 1.
X_train, y_train = prep_dataset(data_train_form, data_train_inform)
X_valid, y_valid = prep_dataset(data_valid_form, data_valid_inform)
X_test, y_test = prep_dataset(data_test_form, data_test_inform)

In [None]:
from cachetools import TTLCache, cached

# Memoisation store for ELMoEmbedding: at most 100 entries, ttl=21600.
# NOTE(review): the cached argument is a tensor, so entries are presumably keyed
# on the tensor object itself — confirm this cache ever produces a hit.
cache = TTLCache(maxsize=100, ttl=21600)


@cached(cache)
def ELMoEmbedding(x):
    """Map a tensor of sentences to ELMo embeddings.

    The input is cast to strings and flattened to one dimension, fed to the
    module-level `elmo` callable with the 'default' signature, and the 'elmo'
    entry of the returned dict is handed back (the model summary shows it as
    (None, None, 1024) when used as a Lambda layer).
    """
    sentences = tf.reshape(tf.cast(x, tf.string), [-1])
    outputs = elmo(sentences, signature='default', as_dict=True)
    return outputs['elmo']

In [None]:
def create_model_architecture():
    """Build and compile the ELMo + BiLSTM binary classifier.

    Architecture: string input of shape (1,) -> ELMo embeddings (Lambda around
    ELMoEmbedding) -> bidirectional LSTM with 1024 units per direction ->
    Dense(512, relu) -> Dropout(0.5) -> Dense(256, relu) -> Dropout(0.5) ->
    Dense(1, sigmoid).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    input_text = Input(shape=(1,), dtype='string', name='input_text')
    # ELMoEmbedding yields one 1024-d vector per token, i.e. a
    # (batch, tokens, 1024) tensor as the model summary shows; output_shape
    # excludes the batch axis, hence (None, 1024) rather than (1024,).
    embedding = Lambda(ELMoEmbedding,
                       output_shape=(None, 1024),
                       name='elmo_embedding')(input_text)
    # return_sequences=False: only the final state of each direction is kept,
    # giving a fixed-size sentence representation.
    x = Bidirectional(LSTM(1024,
                           return_sequences=False,
                           dropout=0.2,
                           recurrent_dropout=0.2,
                           name="BiLSTM"))(embedding)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text], outputs=predictions)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    return model

In [None]:
# Instantiate the compiled classifier once; the later cells reuse this global `model`.
model = create_model_architecture()

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_text (InputLayer)      [(None, 1)]               0         
_________________________________________________________________
elmo_embedding (Lambda)      (None, None, 1024)        0         
_________________________________________________________________
bidirectional (Bidirectional (None, 2048)              16785408  
_________________________________________________________________
dense (Dense)                (None, 512)               1049088   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0     

In [None]:
# Train inside an explicit TF1 session and save the weights before it closes.
with tf1.Session() as sess:
    # Variables and lookup tables are initialised in this session before fitting.
    sess.run(tf1.global_variables_initializer())
    sess.run(tf1.tables_initializer())
    # Keras takes the validation_split fraction from the END of the arrays as
    # passed, before any shuffling, so the order of X_train / y_train matters.
    history = model.fit(
        x=np.asarray(X_train),
        y=y_train,
        batch_size=64,
        epochs=3,
        validation_split=0.05,
    )
    model.save_weights('model_elmo_weights.h5')

Train on 194403 samples, validate on 10232 samples
Epoch 1/4



Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Peek at a single validation prediction.
# NOTE(review): `val_predict` is only assigned in the prediction cell further
# down, so this cell fails on a fresh Restart & Run All — move it below that cell.
val_predict[100]

array([0.51989114], dtype=float32)

In [None]:
# Reload the weights written by the training cell.
# NOTE(review): this call runs outside any `with ... Session()` block, and the
# prediction cell loads the same file again inside its own session — confirm
# this cell is still needed.
model.load_weights("model_elmo_weights.h5")

In [None]:
# Destination folder for backups and the base name shared by the weights file
# and its archive.
# NOTE(review): root_path is a hard-coded '/content/drive/...' location —
# presumably a mounted Google Drive in Colab; no mount step is visible here.
root_path = '/content/drive/MyDrive/Colab Notebooks/Informal to formal/'
name = 'model_elmo_weights'

In [None]:

from shutil import copyfile

In [None]:
!7z a {name} {'./' + name}
copyfile(name + '.7z', root_path + name + '.7z')

In [None]:
def save_on_disk(name, model, tokenizer, root_path):
    if not os.path.isdir(name):
        os.mkdir(name)
    filename = name +'/' + name
    with open(filename + '.json', 'w') as json_file:
        json_file.write(model.to_json())
    model.save_weights(filename + '.h5')
    tokenizer_json = tokenizer.to_json()
    filename = name +'/' + 'tokenizer.json'
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    !7z a {name} {'./' + name}
    copyfile(name + '.7z', root_path + name + '.7z')


def load_from_disk(root_path, name):
    if not os.path.isdir(name):
        copyfile(root_path + name +'.7z', name + '.7z')
        !7z x {name +'.7z'}
    filename = name +'/' + 'tokenizer.json'
    with open(filename) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    name += '/' + name
    json_file = open(name + '.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights(name + '.h5')
    return model, tokenizer

In [None]:
 with tf1.Session() as sess:
     # NOTE(review): the `with` line of this cell starts with one stray leading
     # space; it is kept here as in the original cell.
     # A fresh session starts from newly initialised variables, so the trained
     # weights are loaded from disk before predicting.
     sess.run(tf1.global_variables_initializer())
     sess.run(tf1.tables_initializer())
     model.load_weights("model_elmo_weights.h5")
     # Sigmoid outputs for the validation and test sentences.
     val_predict = model.predict(np.asarray(X_valid))
     test_predict = model.predict(np.asarray(X_test))

In [None]:
# `result` was never assigned before this point (it survived from a deleted
# cell), so the cell failed on a fresh kernel. Recompute it here: loss and
# metrics of the trained model on the validation split, in a fresh session
# with the saved weights loaded.
with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    sess.run(tf1.tables_initializer())
    model.load_weights("model_elmo_weights.h5")
    result = model.evaluate(np.asarray(X_valid), y_valid)

# Pair each metric name with its value for display.
dict(zip(model.metrics_names, result))

{'acc': 0.8682798, 'loss': 0.45343654884963286}

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Round the sigmoid outputs to the nearest integer to obtain hard 0/1 labels.
val_predict_around = np.around(val_predict)

In [None]:
# Per-class precision / recall / F1 on the validation split. The labels get a
# trailing axis so they have the same (n, 1) layout as the rounded predictions.
result = classification_report(
    y_true=np.expand_dims(y_valid, axis=1),
    y_pred=val_predict_around,
    digits=6,
)
print(result)

              precision    recall  f1-score   support

         0.0   0.831293  0.950093  0.886732     10720
         1.0   0.928686  0.771198  0.842646      9034

    accuracy                       0.868280     19754
   macro avg   0.879989  0.860645  0.864689     19754
weighted avg   0.875833  0.868280  0.866570     19754



In [None]:
# Round the sigmoid outputs to the nearest integer to obtain hard 0/1 labels.
test_predict_around = np.around(test_predict)

In [None]:
# Per-class precision / recall / F1 on the test split. The labels get a
# trailing axis so they have the same (n, 1) layout as the rounded predictions.
result = classification_report(
    y_true=np.expand_dims(y_test, axis=1),
    y_pred=test_predict_around,
    digits=6,
)
print(result)

              precision    recall  f1-score   support

         0.0   0.796025  0.956481  0.868907     22151
         1.0   0.935686  0.720931  0.814389     19454

    accuracy                       0.846341     41605
   macro avg   0.865856  0.838706  0.841648     41605
weighted avg   0.861329  0.846341  0.843415     41605

