In [1]:
# docker run --gpus all -it -v $(realpath ~/):/tf/All -v /home/rob/Data2:/home/rob/Data2 --env HF_DATASETS_CACHE=/home/rob/Data2/huggingface/datasets --env TRANSFORMERS_CACHE=/home/rob/Data2/huggingface/transformers -p 8888:8888 -p 6006:6006 d139afc9cfb2

# This generates the 'elmo_embeddings.pkl' file into 
# the elmo_embeddings folder.

# 2nd Pass ... 
# Run Date: Thursday, January 19, 2023
# Run Time: 00:07:17

# First pass ... 
# Run Date: Thursday, January 19, 2023
# Run Time: 00:12:18


In [2]:
import time
from datetime import date

startTime = time.time()
todaysDate = date.today()

In [3]:
# only target the 2070 Super ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Using ELMo Embeddings

<table align="left">
    <td>
        <a target="_blank" href="https://colab.research.google.com/github/thushv89/packt_nlp_tensorflow_2/blob/master/Ch04-Advance-Word-Vectors/ch4_elmo_embeddings.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
    </td>
</table>

In [4]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import os
import time
import random
import tensorflow as tf

%env TF_FORCE_GPU_ALLOW_GROWTH=true
# Making sure we cache the models and are not downloaded all the time
%env TFHUB_CACHE_DIR=./tfhub_modules

2023-01-19 17:02:00.370093: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-19 17:02:00.961430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-01-19 17:02:00.961477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


env: TF_FORCE_GPU_ALLOW_GROWTH=true
env: TFHUB_CACHE_DIR=./tfhub_modules


## Using pre-trained ELMo Model

### Downloading the ELMo Model from TFHub

In [5]:
import tensorflow_hub as hub
import tensorflow.keras.backend as K

# Remove any ongoing sessions
K.clear_session()

# Download the ELMo model and save to disk
elmo_layer = hub.KerasLayer("https://tfhub.dev/google/elmo/3", signature="tokens",signature_outputs_as_dict=True)



2023-01-19 17:02:01.927944: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 17:02:01.931836: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 17:02:01.932434: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 17:02:01.933182: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

### Formatting the input for ELMo

ELMo expects the inputs to be in a specific format. Here we write a function to get the input in that format.

In [6]:
def format_text_for_elmo(texts, lower=True, split=" ", max_len=None):
    
    """ Formats a given text for the ELMo model (takes in a list of strings) """
        
    token_inputs = [] # Maintains individual tokens
    token_lengths = [] # Maintains the length of each sequence
    
    max_len_inferred = 0 # We keep a variable to matain the max length of the input
    
    # Go through each text (string)
    for text in texts:    
        
        # Process the text and get a list of tokens
        tokens = tf.keras.preprocessing.text.text_to_word_sequence(text, lower=lower, split=split)
        
        # Add the tokens 
        token_inputs.append(tokens)                   
        
        # Compute the max length for the collection of sequences
        if len(tokens)>max_len_inferred:
            max_len_inferred = len(tokens)
    
    # It's important to make sure the maximum token length is only as large as the longest input in the sequence
    # You can't have arbitrarily large length as the maximum length. Otherwise, you'll get this error.
    #InvalidArgumentError:  Incompatible shapes: [2,6,1] vs. [2,10,1024]
    #    [[node mul (defined at .../python3.6/site-packages/tensorflow_hub/module_v2.py:106) ]] [Op:__inference_pruned_3391]
    
    # Here we make sure max_len is only as large as the longest input
    if max_len and max_len_inferred < max_len:
        max_len = max_len_inferred
    if not max_len:
        max_len = max_len_inferred
    
    # Go through each token sequence and modify sequences to have same length
    for i, token_seq in enumerate(token_inputs):
        
        token_lengths.append(min(len(token_seq), max_len))
        
        # If the maximum length is less than input length, truncate
        if max_len < len(token_seq):
            token_seq = token_seq[:max_len]            
        # If the maximum length is greater than or equal to input length, add padding as needed
        else:            
            token_seq = token_seq+[""]*(max_len-len(token_seq))
                
        assert len(token_seq)==max_len
        
        token_inputs[i] = token_seq
    
    # Return the final output
    return {
        "tokens": tf.constant(token_inputs), 
        "sequence_len": tf.constant(token_lengths)
    }


print(format_text_for_elmo(["the cat sat on the mat", "the mat sat"], max_len=10))

{'tokens': <tf.Tensor: shape=(2, 6), dtype=string, numpy=
array([[b'the', b'cat', b'sat', b'on', b'the', b'mat'],
       [b'the', b'mat', b'sat', b'', b'', b'']], dtype=object)>, 'sequence_len': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([6, 3], dtype=int32)>}


In [7]:
# Titles of 001.txt - 005.txt in bbc/business
elmo_inputs = format_text_for_elmo([
    "Ad sales boost Time Warner profit",
    "Dollar gains on Greenspan speech",
    "Yukos unit buyer faces loan claim",
    "High fuel prices hit BA's profits",
    "Pernod takeover talk lifts Domecq"
])

# Get the result from ELMo
elmo_result = elmo_layer(elmo_inputs)

# Print the result
for k,v in elmo_result.items():    
    print(f"Tensor under key={k} is a {v.shape} shaped Tensor")

2023-01-19 17:02:04.956188: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8101


Tensor under key=lstm_outputs1 is a (5, 6, 1024) shaped Tensor
Tensor under key=elmo is a (5, 6, 1024) shaped Tensor
Tensor under key=lstm_outputs2 is a (5, 6, 1024) shaped Tensor
Tensor under key=sequence_len is a (5,) shaped Tensor
Tensor under key=word_emb is a (5, 6, 512) shaped Tensor
Tensor under key=default is a (5, 1024) shaped Tensor


## Generating Document Embeddings with ELMo

### Downloading the data

This code downloads a [BBC dataset](hhttp://mlg.ucd.ie/files/datasets/bbc-fulltext.zip) consisting of news articles published by BBC. 

In [8]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'


def download_data(url, data_dir):
    """Download a file if not present, and make sure it's the right size."""
    
    # Create the data directory if not exist
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')
    
    # If file doesnt exist, download
    if not os.path.exists(file_path):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print("File already exists")
  
    extract_path = os.path.join(data_dir, 'bbc')
    
    # If data has not been extracted already, extract data
    if not os.path.exists(extract_path):        
        with zipfile.ZipFile(os.path.join(data_dir, 'bbc-fulltext.zip'), 'r') as zipf:
            zipf.extractall(data_dir)
    else:
        print("bbc-fulltext.zip has already been extracted")
    
download_data(url, 'data')

File already exists
bbc-fulltext.zip has already been extracted


### Read Data without Preprocessing 

Here we read all the files and keep them as a list of strings, where each string is a single article

In [9]:
def read_data(data_dir):
    
    # This will contain the full list of stories
    news_stories = []    
    filenames = []
    print("Reading files")
    
    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):
        
        for fi, f in enumerate(files):
            
            # We don't read the readme file
            if 'README' in f:
                continue
            
            # Printing progress
            i += 1
            print("."*i, f, end='\r')
            
            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as text_file:
                
                story = []
                # Read all the lines
                for row in text_file:
                                        
                    story.append(row.strip())
                    
                # Create a single string with all the rows in the doc
                story = ' '.join(story)                        
                # Add that to the list
                news_stories.append(story)  
                filenames.append(os.path.join(root, f))
                
        print('', end='\r')
        
    print(f"\nDetected {len(news_stories)} stories")
    return news_stories, filenames
                
  
news_stories, filenames = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print(f"{sum([len(story.split(' ')) for story in news_stories])} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### Check the length statistics 

Here we look at the 95-percientile in order to decide a good sequence length for inputs.

In [10]:
import pandas as pd

pd.Series([len(x.split(' ')) for x in news_stories]).describe(percentiles=[0.05, 0.95])

count    2225.000000
mean      388.837303
std       241.484273
min        91.000000
5%        164.200000
50%       336.000000
95%       736.800000
max      4489.000000
dtype: float64

### Compute the document embeddings

ELMo provides several outputs as the output (in the form of a dictionary). The most important output is in a key called `default` which is the averaged vector resulting from vectors produced for all the tokens in the input. We will use this as the document embedding.

In [11]:
batch_size = 4

news_elmo_embeddings = []

# Go through batches
for i in range(0, len(news_stories), batch_size):
    
    # Print progress
    print('.', end='')
    # Format ELMo inputs
    elmo_inputs = format_text_for_elmo(news_stories[i: min(i+batch_size, len(news_stories))], max_len=768)    
    # Get the result stored in default
    elmo_result = elmo_layer(elmo_inputs)["default"]
    # Add that to a list
    news_elmo_embeddings.append(elmo_result)

# Create an array
news_elmo_embeddings = np.concatenate(news_elmo_embeddings, axis=0)    

.............................................................................................................................................

2023-01-19 17:04:12.581481: W tensorflow/tsl/framework/bfc_allocator.cc:360] Garbage collection: deallocate free memory regions (i.e., allocations) so that we can re-allocate a larger region to avoid OOM due to memory fragmentation. If you see this message frequently, you are running near the threshold of the available device memory and re-allocation may incur great performance overhead. You may try smaller batch sizes to observe the performance impact. Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to disable this feature.


................................................................................................................................................................................................................................................................................................................................................................................................................................

### Save the embeddings to disk

In [12]:
# Save the data to disk
os.makedirs('elmo_embeddings', exist_ok=True)

pd.DataFrame(
    news_elmo_embeddings, index=filenames
).to_pickle(
    os.path.join('elmo_embeddings', 'elmo_embeddings.pkl')
)

In [13]:
pd.read_pickle(os.path.join('elmo_embeddings', 'elmo_embeddings.pkl'))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
data/bbc/tech/174.txt,0.012744,0.023062,-0.035946,-0.090556,0.144047,0.166769,0.017964,0.342860,-0.161670,-0.082526,...,-0.340656,0.244855,0.030709,0.253689,0.078406,0.044581,0.372656,-0.006501,0.273694,0.057726
data/bbc/tech/170.txt,0.256800,-0.076643,-0.005060,-0.063172,0.138440,0.189914,0.182261,0.326142,-0.117797,0.026969,...,-0.171903,0.165076,-0.113165,0.327153,0.209255,0.082908,0.258461,-0.122383,0.307110,-0.029122
data/bbc/tech/302.txt,0.178142,0.275478,-0.044081,0.069659,0.219834,0.160469,-0.008855,0.358277,0.126847,-0.048421,...,-0.290554,0.097650,-0.033120,0.317354,-0.011647,0.147325,0.441159,-0.065567,0.119717,-0.027908
data/bbc/tech/256.txt,0.113356,-0.150963,-0.054547,0.078308,0.010922,0.414599,-0.085480,0.392278,0.147020,-0.126910,...,-0.291242,0.131670,0.126649,0.161829,0.085308,0.041790,0.132907,-0.010835,0.489633,-0.170547
data/bbc/tech/211.txt,0.161718,0.072896,-0.033089,-0.112782,0.426878,0.299916,0.084742,0.284771,-0.164555,0.051139,...,-0.091179,0.205737,-0.067422,0.088594,0.118239,0.079229,0.402150,-0.011402,0.355481,-0.067534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
data/bbc/entertainment/015.txt,-0.188401,-0.300901,0.023362,-0.128732,0.306804,0.241635,0.068264,0.128954,-0.128111,-0.200275,...,-0.253069,0.376614,-0.215489,0.362154,-0.009835,-0.058118,0.197030,0.056924,0.493857,-0.102432
data/bbc/entertainment/345.txt,-0.178258,-0.212788,0.019289,-0.097152,0.224833,0.122836,-0.037274,-0.269267,-0.330137,-0.047496,...,-0.123795,0.075508,-0.240921,0.167021,0.135244,0.192209,0.115502,-0.237828,0.562652,-0.099549
data/bbc/entertainment/024.txt,-0.376309,-0.317978,0.054523,-0.213948,0.229501,0.119958,0.037880,-0.051839,-0.106681,-0.058162,...,-0.211875,0.328167,-0.178842,0.107765,-0.138602,-0.028439,0.088771,-0.069143,0.444389,-0.067932
data/bbc/entertainment/299.txt,0.129563,-0.149498,-0.086580,-0.084570,0.133638,0.179378,0.126872,0.171507,0.190518,-0.126271,...,-0.183596,0.280337,-0.227419,0.318273,0.012311,0.118652,0.127632,-0.050924,0.435821,-0.108651


In [14]:
endTime = time.time()
elapsedTime = time.strftime("%H:%M:%S", time.gmtime(endTime - startTime))

print(todaysDate.strftime('# Run Date: %A, %B %d, %Y'))
print(f"# Run Time: {elapsedTime}")

# Run Date: Thursday, January 19, 2023
# Run Time: 00:07:17
