# Training using ELMO on Sentence Level

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## General Information & Label Map

### Directories

In [2]:
classifier_name = "test_elmo" # Name of the model is at the same time the dir where model files will be stored
classifier_dir = os.path.abspath(classifier_name)
# label_map = "" # path to label map (not implemented yet..)
data_dir = "../data/datasets/daniel_0212" # Where text data are stored
test_dir = "../data/eval_data"

In [3]:
if not os.path.isdir(classifier_dir):
    os.mkdir(classifier_dir)
    print("model dir created!")
else:
    print("model dir already exists")

model dir created!


### Model (Classifier) & Training Parameters

In [4]:
# Training Parameters
training_steps = 1
batch_size = 10

# Model Parameters
num_classes = 4 # ADJUST THAT
hidden_units = [1024,512,256,64] # Iterable of number hidden units per layer. 
activation_fn=tf.nn.relu
optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.001)
dropout = None # None or float 0-1
batch_normalization = False
regularization = False

### Label Map

In [5]:
# Use the label_map class from you package..
def get_label_id(class_name:str):
    if class_name == "clustering":   
        return 0
    if class_name == "association":
        return 1
    if class_name == "regression":
        return 2
    if class_name == "classification":
        return 3

## Environment for TF-HUB Model

In [6]:
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

In [7]:
# I dont know if he overwrites the model during retraining.
embedding_model_url = "https://tfhub.dev/google/elmo/2" 

In [8]:
import hashlib
# The path where tf-hub will cache the model (use an absolute path..) 
os.environ["TFHUB_CACHE_DIR"] = os.path.join(classifier_dir, "embedding_model")

#TF-hub will store the name as hex
embedding_model_hash = hashlib.sha1(classifier_dir.encode("utf8")).hexdigest()
print(embedding_model_hash)

c95bf79028a6eec3f0bfd61ddb866cb183cde303


## Loading Data into Dataframe

In [9]:
from nltk.tokenize import sent_tokenize

In [10]:
def read_data(_dir:str, sentence_level = True):
    data = {}
    data["text"] = []
    data["class"] = []
    for root, dirs, files in os.walk(_dir):
        for _dir in dirs: 
            for txt_file in [x for x in os.listdir(os.path.join(root, _dir)) if x.endswith((".txt", ".TXT"))]:
                # Class name = dir name
                class_name = _dir
                #Read File
                file_name = os.path.join(root, _dir, txt_file)
                file = open(file_name, "r")
                txt = file.read()
                file.close()
                if sentence_level:
                    # Txt to List[Sentences]
                    sentences = sent_tokenize(txt)
                    # Abstracts
                    for sentence in sentences:
                        data["text"].append(sentence)
                        data["class"].append(get_label_id(class_name))
                else:
                    data["text"].append(sentence)
                    data["class"].append(get_label_id(class_name))
    df = pd.DataFrame.from_dict(data)
    del data
    return df

In [11]:
%%time
df = read_data(data_dir)

CPU times: user 3.19 s, sys: 780 ms, total: 3.97 s
Wall time: 4.91 s


## Data Overview

In [12]:
df.sample(frac=1).head()

Unnamed: 0,text,class
5527,Building on the material of Part\none of this ...,3
13058,Recently the deep learning techniques have ach...,3
17019,A 2D laser\ntriangulation scanner was selected...,1
33132,"In this paper, we reveal that the problem is i...",0
16876,This\napproach can be applied progressively to...,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37515 entries, 0 to 37514
Data columns (total 2 columns):
text     37515 non-null object
class    37515 non-null int64
dtypes: int64(1), object(1)
memory usage: 586.2+ KB


## Train and Test Split

In [14]:
df = shuffle(df) # Shuffle the DataFrame
X, Y = train_test_split(df, test_size=0.2, random_state = 101)

In [15]:
X.head(3)

Unnamed: 0,text,class
611,We present an algorithm for randomized model g...,3
25642,(ii) New types of\ndata are added and trained on.,1
25136,"At the same time, the\nlocality-aware query op...",1


In [16]:
Y.head(3)

Unnamed: 0,text,class
19176,We exploit the effective application\nlayer se...,1
33039,While these methods have been applied to both ...,0
28632,In comparison with\nperfect secret sharing it ...,1


In [17]:
print("Train Data: \n")
print(X.info())
print("\n Test Data: \n")
print(Y.info())

Train Data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30012 entries, 611 to 18425
Data columns (total 2 columns):
text     30012 non-null object
class    30012 non-null int64
dtypes: int64(1), object(1)
memory usage: 703.4+ KB
None

 Test Data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7503 entries, 19176 to 5319
Data columns (total 2 columns):
text     7503 non-null object
class    7503 non-null int64
dtypes: int64(1), object(1)
memory usage: 175.9+ KB
None


## Training

### Constructing Feature Columns

In [18]:
%%time
tf.logging.set_verbosity(tf.logging.INFO)
embedded_text_feature_column = hub.text_embedding_column(
    key="text", 
    module_spec=embedding_model_url,
    trainable = True)

INFO:tensorflow:Using /Users/Daniel/PycharmProjects/Recommender-System/notebooks/test_elmo/embedding_model to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/elmo/2'.
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 20.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 40.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 60.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 80.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 100.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 110.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 130.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 150.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 170.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 190.35MB
INFO:tensorflow:Downloading https://tfhub.dev/google/elmo/2: 210.35MB
INFO:tensorflow:Downloading http

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR) # Reduce the stupid tf-warnings

In [None]:
estimator = tf.estimator.DNNClassifier(
    hidden_units= hidden_units,
    feature_columns=[embedded_text_feature_column],
    model_dir = classifier_dir,
    activation_fn=activation_fn,
    n_classes=num_classes,
    dropout = dropout,
    batch_norm=batch_normalization,
    optimizer=optimizer)

### Defining Input Functions

In [None]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    X, 
    X["class"],
    num_epochs=None,
    batch_size= batch_size,
    shuffle = False)

# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    X, X["class"], shuffle=False)

# Prediction on the test set.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    Y, Y["class"], shuffle=False)

### Training of the Classifier

In [None]:
%%time
tf.logging.set_verbosity(tf.logging.INFO)
estimator.train(input_fn=train_input_fn, steps=training_steps);

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /Users/Daniel/PycharmProjects/Recommender-System/notebooks/test_elmo/model.ckpt.
INFO:tensorflow:loss = 13.454466, step = 1
INFO:tensorflow:Saving checkpoints for 1 into /Users/Daniel/PycharmProjects/Recommender-System/notebooks/test_elmo/model.ckpt.
INFO:tensorflow:Loss for final step: 13

## Evaluation

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
def get_predictions(estimator, input_fn):
    return [x["class_ids"][0] for x in estimator.predict(input_fn=input_fn)]

In [None]:
labels = ["clustering", "association", "regression", "classification"]

### Insampling

In [None]:
results = estimator.evaluate(input_fn=predict_train_input_fn)

print("Results: \n")
print(results)

In [None]:
with tf.Graph().as_default():
    cm = tf.confusion_matrix(X["class"], get_predictions(estimator, predict_train_input_fn))
    with tf.Session() as session:
        cm_out = session.run(cm)

# Normalize the confusion matrix so that each row sums to 1.
cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

sns.heatmap(cm_out, annot=True, xticklabels=labels, yticklabels=labels);
plt.xlabel("Predicted");
plt.ylabel("True");

### Outsampling

In [None]:
results = estimator.evaluate(input_fn=predict_test_input_fn)

print("Results: \n")
print(results)

In [None]:
with tf.Graph().as_default():
    cm = tf.confusion_matrix(Y["class"], get_predictions(estimator, predict_test_input_fn))
    with tf.Session() as session:
        cm_out = session.run(cm)

# Normalize the confusion matrix so that each row sums to 1.
cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

sns.heatmap(cm_out, annot=True, xticklabels=labels, yticklabels=labels);
plt.xlabel("Predicted");
plt.ylabel("True");

### Test Set

#### Read Test Data

In [None]:
df_test = read_data(test_dir, sentence_level = False)

In [None]:
shuffle(df_test.head(5))

#### Defining Input Function

In [None]:
# Prediction on the whole training set.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    df_test, df_test["class"], shuffle=True)

#### Evaluation of Test Set

In [None]:
results = estimator.evaluate(input_fn=predict_test_input_fn)

print("Results: \n")
print(results)

In [None]:
with tf.Graph().as_default():
    cm = tf.confusion_matrix(df_test["class"], get_predictions(estimator, predict_test_input_fn))
    with tf.Session() as session:
        cm_out = session.run(cm)

# Normalize the confusion matrix so that each row sums to 1.
cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

sns.heatmap(cm_out, annot=True, xticklabels=labels, yticklabels=labels);
plt.xlabel("Predicted");
plt.ylabel("True");

### Semantic Textual Similarity

#### Function and Sentences for Similarity Matrix

In [None]:
def plot_similarity(labels, features, rotation):
    corr = np.inner(features, features)
    sns.set(font_scale=1.2)
    g = sns.heatmap(
      corr,
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd")
    g.set_xticklabels(labels, rotation=rotation)
    g.set_title("Semantic Textual Similarity")


def run_and_plot(session_, input_tensor_, messages_, encoding_tensor):
    message_embeddings_ = session_.run(
      encoding_tensor, feed_dict={input_tensor_: messages_})
    plot_similarity(messages_, message_embeddings_, 90)

In [None]:
clustering_0 = "finding similar groups"
clustering_1 = "clustering things"
pattern_mining_0 = "analysing sequences"
pattern_mining_1 = "finding pattern"
classification_0 = "predicting categorical types"
classification_1 = "classifing objects"
regression_0 = "predicting prices"
regression_1 = "forecasting numbers"

messages = [clustering_0,
            clustering_1,
            pattern_mining_0,
            pattern_mining_1,
            classification_0,
            classification_1,
            regression_0,
            regression_1]

#### Similarity Matrix

In [None]:
%%time
# Initial download takes a while till the model is downloaded from tf-hub (~1GB)
tf.logging.set_verbosity(tf.logging.INFO)
model = hub.Module(embedding_model_url, trainable = False) # trainable = True for Transfer Learning!!

In [None]:
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = model(similarity_input_placeholder)
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    
    run_and_plot(session, similarity_input_placeholder, messages,
               similarity_message_encodings)

## Inference for each Test Data File

In [None]:
df_inference = pd.DataFrame.from_dict({"text": df_test["text"]})
inference_func = tf.estimator.inputs.pandas_input_fn(df_inference, shuffle=False)

In [None]:
results = estimator.predict(inference_func)
y = []
probs =[]
for x in results:
    y.append(x["class_ids"])
    probs.append(x["probabilities"])
df_test["predicted"] = y
df_test["probs"] = probs

In [None]:
pd.set_option('display.max_colwidth', -1)
df_test