# Deep Products: Deep Tag Labeler

This is the first project for the book Deep Products, which is about using NLP and weakly supervised learning to build complete machine learning products. We use the non-code text of Stack Overflow posts (questions and answers) to tag them with a multi-class, multi-label classifier built from LSTMs and ELMo embeddings.

## Multi-Input Strategy

This is a second attempt using multiple balanced data inputs on multiple models that combine into the final output.

In [2]:
import os
import re

from keras import backend as K
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm_notebook

Using TensorFlow backend.


## Load 14 Million Answered Questions from Stack Overflow

We load all answered questions from Stack Overflow. This data was converted from XML to JSON and then sampled using Spark on a single `r5.12xlarge` machine cluster with [code/stackoverflow/sample_json.spark.py](stackoverflow/sample_json.spark.py).

In [4]:
# Load only the post body and tag columns of the answered-questions Parquet dataset.
posts_df = pd.read_parquet(
    'data/stackoverflow/parquet/Questions.Answered.parquet',
    columns=['_Body', '_Tags'],
    # Predicate handed to the pyarrow engine, intended to skip rows that have no tags
    filters=[('_Tags','!=',None),],
    engine='pyarrow'
)
# Preview the first five rows
posts_df.head(5)

Unnamed: 0,_Body,_Tags
0,<p>I want to use a track-bar to change a form'...,<c#><floating-point><type-conversion><double><...
1,<p>I have an absolutely positioned <code>div</...,<html><css><css3><internet-explorer-7>
2,<p>Given a <code>DateTime</code> representing ...,<c#><.net><datetime>
3,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...
4,<p>Is there a standard way for a web server to...,<html><browser><timezone><user-agent><timezone...


In [6]:
# Keep only the first 1,000,000 rows for the rest of the notebook
posts_df = posts_df.head(1000000)
# Preview the first five rows of the reduced frame
posts_df.head(5)

Unnamed: 0,_Body,_Tags
0,<p>I want to use a track-bar to change a form'...,<c#><floating-point><type-conversion><double><...
1,<p>I have an absolutely positioned <code>div</...,<html><css><css3><internet-explorer-7>
2,<p>Given a <code>DateTime</code> representing ...,<c#><.net><datetime>
3,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...
4,<p>Is there a standard way for a web server to...,<html><browser><timezone><user-agent><timezone...


## Drop Unlabeled Posts

Note: these have already been filtered to remove untagged questions, so there are from 1-5 labels per post.

In [None]:
# Drop any post without tags. Take an explicit copy: a '_Tag_List' column is
# assigned to tag_posts in a later cell, and assigning into a frame derived
# from posts_df can raise pandas' SettingWithCopyWarning.
tag_posts = posts_df.dropna(axis=0, subset=['_Tags']).copy()
print('Posts w/ tags: {:,}'.format(len(tag_posts.index)))
tag_posts.head(5)

## Extract the Tags from their XML tags

In [14]:
# Each post's tags arrive as one string such as '<c#><.net><datetime>'.
# The pattern is a raw string because '\<' and '\>' are invalid escape
# sequences in a normal string literal (a warning on recent Pythons);
# '<' and '>' need no escaping in a regular expression. It is compiled
# once and reused for every row.
TAG_PATTERN = re.compile(r'<(.+?)>')
tag_posts['_Tag_List'] = tag_posts['_Tags'].apply(TAG_PATTERN.findall)

## Try Different Thresholds for Filtering Tags by Frequency

The higher the threshold, the fewer classes, the less sparse the data, the easier the learning task.

In [15]:
from collections import defaultdict

# Tally how many times each tag occurs across all posts
tag_counts = defaultdict(int)
for post_tags in tag_posts['_Tag_List']:
    for name in post_tags:
        tag_counts[name] = tag_counts[name] + 1

# Report how many distinct tags survive each candidate frequency cutoff
for threshold in (0, 10, 20, 50, 100, 1000, 5000):
    surviving = sum(1 for count in tag_counts.values() if count > threshold)
    print('There are {:,} tags with more than {:,} count'.format(surviving, threshold))

# Cutoff actually used below; record_count is the number of tags above it
MIN_TAGS = 5000

record_count = sum(1 for count in tag_counts.values() if count > MIN_TAGS)
record_count

There are 23,795 tags with more than 0 count
There are 9,706 tags with more than 10 count
There are 6,660 tags with more than 20 count
There are 3,901 tags with more than 50 count
There are 2,508 tags with more than 100 count
There are 416 tags with more than 1,000 count
There are 70 tags with more than 5,000 count


70

## Map from Tags to IDs

In [16]:
# Collect every tag whose occurrence count exceeds the MIN_TAGS cutoff
all_tags = {
    tag
    for tag_list in tag_posts['_Tag_List']
    for tag in tag_list
    if tag_counts[tag] > MIN_TAGS
}

print('Total unique tags with {:,} occurrences: {:,}'.format(MIN_TAGS, len(all_tags)))

Total unique tags with 5,000 occurrences: 70


## One Hot Encode Tag Lists

In [17]:
print(len(tag_posts.index))

# For every post, keep only the tag names that passed the frequency filter.
# Each entry of `labels` is therefore a (possibly empty) list of tag strings.
labels = [
    [tag for tag in tag_set if tag in all_tags]
    for tag_set in tag_posts['_Tag_List'].tolist()
]

from sklearn.preprocessing import MultiLabelBinarizer

# Turn the per-post tag lists into a sparse indicator matrix with one column per tag
mlb = MultiLabelBinarizer(sparse_output=True)
one_hot_labels = mlb.fit_transform(labels)
one_hot_labels.shape

1000000


(1000000, 70)

## Split the Rows Up Into Samples Per Label

Create a dataset defined by an *index* of the `DataFrame` for each and every label column with balanced 0/1 labels.

In [None]:
# Work-in-progress cell: builds, for each label column, a balanced set of positive and
# negative row indexes. The loop below currently stops after the first label (see `break`).

# Get a count of the number of examples/tags and the total positives for each label
length = one_hot_labels.shape[0]
width = one_hot_labels.shape[1]
total_positives = one_hot_labels.sum(axis=0)

# Build indexes of positive and negative values for each label
# NOTE(review): one_hot_labels was produced with MultiLabelBinarizer(sparse_output=True), so
# each `column` here is a row of the transposed sparse matrix rather than a plain sequence of
# 0/1 values; enumerate(column) presumably does not yield per-post scalars — confirm, or
# densify the matrix first.
positive_indexes = []
negative_indexes = []
for column in one_hot_labels.T:
    positive_indexes.append([i for i, val in enumerate(column) if val == 1])
    negative_indexes.append([i for i, val in enumerate(column) if val == 0])   

# NOTE(review): shuffle is imported here but never called in this cell
from sklearn.utils import shuffle

per_class_training_dfs = []
for i in range(0, width):
    # Get the training rows for the positive indexes and then an equal number from the negative indexes
    # NOTE(review): the negatives are the first pos_example_count entries, not a random sample
    pos_example_count = len(positive_indexes[i])
    sample_negative_indexes = negative_indexes[i][0:pos_example_count]
    
    # .loc selects by index label, so using these positions assumes tag_posts still has a
    # default 0..N-1 index — TODO confirm
    combined_index = sorted(positive_indexes[i] + sample_negative_indexes)
    combined_examples = tag_posts.loc[combined_index]

    bin_label_series = pd.Series(one_hot_labels.T[i][combined_index], index=combined_index)
    # Debug output; the break stops the loop after the first label, so nothing is ever
    # appended to per_class_training_dfs
    print(tag_posts.index)
    break
    #combined_examples['_Bin_Labels'] = bin_label_series
    
    # Now get the labels that correspond to the combined examples
    # combined_examples['_Bin_Labels'] one_hot_labels.T[i][0]
    #labels = one_hot_labels[combined_index]
    #combined_examples['_Bin_Labels'] = labels
    
    #print(combined_examples[['_Tag_List','_Bin_Labels']])

    #break
    
#     # Now reset indexes on both examples and labels
#     combined_examples.reset_index(inplace=True)
#     #print(combined_examples['_Tags'])
#     #print(labels)
#     print(positive_examples['_Tags'])
    
#     #per_class_bin_labels.append(labels)
        
    #per_label_training_dfs.append(combined_examples)
    #print(one_hot_labels.T[i][positive_indexes[i]])

#per_label_training_dfs[0]
# Display the label series built for the first label
bin_label_series

## Extract/Tokenize Non-Code Text from Posts

We leave posts' source code out for now because it will need a different embedding and thus multiple inputs.

In [None]:
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

MAX_LEN = 150
PAD_TOKEN = '__PAD__'
BATCH_SIZE = 32

def extract_text(x):
    """Extract the non-code text of a post as a fixed-length token list.

    Args:
        x: HTML body of a post (question or answer). A non-string value
            (e.g. a missing body) is treated as an empty post.

    Returns:
        A list of exactly MAX_LEN whitespace-delimited tokens. Longer posts
        are truncated; shorter posts are right-padded with PAD_TOKEN.
    """
    html = x if isinstance(x, str) else ''

    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # uses whichever parser happens to be installed, so the extracted text can
    # differ between machines. 'html.parser' ships with the standard library.
    doc = BeautifulSoup(html, 'html.parser')

    # Remove <code> elements so source code does not end up in the text tokens
    for code in doc.find_all('code'):
        code.extract()

    tokens = doc.text.split()[:MAX_LEN]
    return tokens + [PAD_TOKEN] * (MAX_LEN - len(tokens))

# Tokenize every post body; reset_index(drop=True) gives the result a fresh 0..N-1 index
post_text = tag_posts._Body.apply(extract_text).reset_index(drop=True)
# Preview the first five tokenized posts
post_text.head(5)

In [None]:
len(post_text.index), len(post_text.iloc[0]), len(labels), len(labels[0])

In [None]:
# Sanity check: there must be exactly one label row per tokenized post
assert len(post_text.index) == len(labels)
print('We are left with {:,} example posts'.format(len(post_text.index)))

## Make Record Count a Multiple of the Batch Size and Post Sequence Length

The Elmo embedding requires that the number of records be a multiple of the batch size times the number of tokens in the padded posts.

In [None]:
import math

# Filter label rows that don't have any positive labels.
# `labels` holds ragged lists of tag *names*, so it cannot be turned into a numeric
# matrix with np.array(); the 0/1 label matrix is the binarized `one_hot_labels`.
# It is kept sparse while filtering and only densified after truncation below.
label_mx = one_hot_labels.tocsr()
positives_per_row = np.asarray(label_mx.sum(axis=1)).ravel()
non_zero_index = np.flatnonzero(positives_per_row)

label_mx = label_mx[non_zero_index]

# Filter the posts to match (post_text carries a 0..N-1 index, so positions line up)
post_text = post_text[post_text.index.isin(non_zero_index)]
post_text = np.array(post_text.tolist())

assert(post_text.shape[0] == label_mx.shape[0])
print('Unfiltered Counts: {:,} {:,}'.format(post_text.shape[0], label_mx.shape[0]))

# training_count must be a multiple of the BATCH_SIZE times the MAX_LEN for the Elmo embedding layer
highest_factor = post_text.shape[0] // (BATCH_SIZE * MAX_LEN)
training_count = highest_factor * BATCH_SIZE * MAX_LEN
print('Highest Factor: {:,} Training Count: {:,}'.format(highest_factor, training_count))

# Truncate both arrays to the usable row count; the labels become a dense array here
label_mx = label_mx[0:training_count].toarray()
post_text = post_text[0:training_count]

assert(post_text.shape[0] == label_mx.shape[0])
print('Final Counts: {:,} {:,}'.format(post_text.shape[0], label_mx.shape[0]))

In [None]:
post_text

## Create an Elmo Embedding Layer using Tensorflow Hub

Note that this layer takes a padded two-dimensional array of strings.

In [None]:
# From https://www.depends-on-the-definition.com/named-entity-recognition-with-residual-lstm-and-elmo/

# Create a TensorFlow session and register it with the Keras backend
sess = tf.Session()
K.set_session(sess)

# Load the ELMo v2 module from TensorFlow Hub, requested as trainable
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Run the variable and table initializers in that session
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

def ElmoEmbedding(x):
    """Run a batch of padded token strings through the ELMo hub module.

    The input is cast to strings and squeezed, then passed to the module's
    "tokens" signature; the "elmo" entry of the returned dict is the result.
    The sequence lengths are a constant of BATCH_SIZE entries, each MAX_LEN,
    so inputs presumably must arrive in batches of exactly BATCH_SIZE rows —
    TODO confirm.
    """
    tokens = tf.squeeze(tf.cast(x, tf.string))
    sequence_lengths = tf.constant([MAX_LEN] * BATCH_SIZE)
    outputs = elmo_model(
        inputs={"tokens": tokens, "sequence_len": sequence_lengths},
        signature="tokens",
        as_dict=True,
    )
    return outputs["elmo"]

## Experimental Setup

We `train_test_split` rather than k-fold cross validate because it is too expensive.

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the examples held out for testing
TEST_SPLIT = 0.1

# Fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    post_text, label_mx, test_size=TEST_SPLIT, random_state=34
)

# Show the shapes of the four resulting arrays
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Create an LSTM Model to Classify Posts into Tags

We use the padded, tokenized posts as input: an ELMo embedding feeds a Long Short-Term Memory (LSTM) layer, followed by a Dense layer with one output neuron per tag in our tag list.

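We also define focal loss as a candidate loss function. It is used in applications like object detection because it down-weights easy, well-classified examples, so that training concentrates on the hard examples of rare classes. Note that the model below is currently compiled with binary cross-entropy.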

In [None]:
from keras.layers import Input, concatenate, Activation, Dense, LSTM, BatchNormalization, Embedding, Dropout, Lambda, Bidirectional
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy
from keras.models import Model
from keras.optimizers import Adam
from keras_metrics import precision, f1_score, false_negative, true_positive, false_positive, true_negative

# Text model: padded token strings -> ELMo embedding -> LSTM -> dense head
text_input = Input(shape=(MAX_LEN,), dtype=tf.string)

elmo_embedding = Lambda(ElmoEmbedding, output_shape=(MAX_LEN, 1024))(text_input)

text_lstm = LSTM(
    input_shape=(MAX_LEN, 1024,),
    units=512,
    recurrent_dropout=0.2,
    dropout=0.2)(elmo_embedding)

text_dense = Dense(200, activation='relu')(text_lstm)

# Normalize the hidden features *before* the output layer. Applying
# BatchNormalization after the sigmoid (and returning that as the model output)
# rescales the per-tag probabilities out of [0, 1], which breaks the
# binary_crossentropy loss and every metric/cutoff that treats the output as a
# probability. The sigmoid layer must be the last layer of the model.
text_batch = BatchNormalization()(text_dense)
text_output = Dense(record_count, activation='sigmoid')(text_batch)

text_model = Model(
    inputs=text_input,
    outputs=text_output
)

def recall_m(y_true, y_pred):
    """Recall over the batch: rounded true positives / rounded actual positives.

    K.epsilon() is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the batch: rounded true positives / rounded predicted positives.

    K.epsilon() is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m for the batch."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # K.epsilon() keeps the ratio finite when both precision and recall are zero
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

def focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):
    """Binary focal loss summed over every element of the batch.

    Args:
        y_true: 0/1 label tensor.
        y_pred: predicted probabilities, same shape as y_true.
        gamma: focusing exponent; larger values shrink the contribution of
            confidently correct predictions.
        alpha: weight of the positive term; negatives are weighted (1 - alpha).

    Returns:
        Scalar tensor holding the summed loss.
    """
    # Keep probabilities strictly inside (0, 1) so neither log() below can
    # return -inf and turn the loss into inf/NaN.
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1. - eps)

    # pt_1 holds the prediction where the label is 1 (and 1 elsewhere, which
    # contributes zero loss); pt_0 does the same for label 0 (0 elsewhere).
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    positive_term = K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
    negative_term = K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return -positive_term - negative_term

def abs_KL_div(y_true, y_pred):
    """Sum over the last axis of |(y_true - y_pred) * log(y_true / y_pred)|.

    Both inputs are clipped from below at K.epsilon() before the log and division.
    """
    floor = K.epsilon()
    p = K.clip(y_true, floor, None)
    q = K.clip(y_pred, floor, None)
    difference = p - q
    log_ratio = K.log(p / q)
    return K.sum(K.abs(difference * log_ratio), axis=-1)

from keras.optimizers import Adam
# Adam optimizer with a learning rate of 0.0005
adam = Adam(lr=5e-4)

# Train against binary cross-entropy and track the custom metrics defined in
# this cell plus mean absolute error ('accuracy' is not tracked).
text_model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=[precision_m, recall_m, f1_m, 'mae', abs_KL_div],
)

text_model.summary()

## Compute Sample Weights

Because we have skewed classes and multiple classes per example, we employ sample weights which weight the importance of each row according to the relative frequency of their labels.

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# 'balanced' per-row weights, computed separately for the train and test labels
train_sample_weights, test_sample_weights = (
    compute_sample_weight('balanced', split_labels)
    for split_labels in (y_train, y_test)
)

In [None]:
# Inverse-frequency weight for each label column: examples kept / occurrences of the tag.
# `sorted_all_tags` was never defined in this notebook; it is built here from all_tags.
# Sorted order presumably matches the MultiLabelBinarizer column order (its classes_
# are the sorted set of labels seen during fit) — confirm against mlb.classes_.
sorted_all_tags = sorted(all_tags)
class_weights = {
    i: label_mx.shape[0] / tag_counts[tag]
    for i, tag in enumerate(sorted_all_tags)
}

class_weights

In [None]:
from keras.callbacks import EarlyStopping

# Upper bound on training epochs; early stopping may end training sooner
EPOCHS = 4

history = text_model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    sample_weight=train_sample_weights,
    validation_data=(X_test, y_test),
    # One early-stopping callback each for the training loss and the validation loss
    callbacks=[
        EarlyStopping(monitor=quantity, patience=1, min_delta=0.0001)
        for quantity in ('loss', 'val_loss')
    ],
)

In [None]:
# Evaluate on the held-out set and pair each returned value with its metric name
accr = text_model.evaluate(X_test, y_test, sample_weight=test_sample_weights)
list(zip(accr, text_model.metrics_names))

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

print(history.history)

# Summarize history for the abs_KL_div metric, training vs. validation.
# The figure used to be labelled "accuracy" with a train/test legend while plotting
# val_loss against val_abs_KL_div; accuracy is not tracked by this model, so both
# curves now come from the same metric and the labels say what is plotted.
plt.plot(history.history['abs_KL_div'])
plt.plot(history.history['val_abs_KL_div'])
plt.title('model abs KL divergence')
plt.ylabel('abs KL divergence')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Summarize history for loss, training vs. validation
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from sklearn.metrics import hamming_loss, jaccard_score

import keras.backend as K
import tensorflow as tf

y_pred = text_model.predict(X_test)

# Sweep decision thresholds and report multi-label metrics for each one.
# predict() returns a NumPy array, so thresholding is a plain array comparison:
# no extra tf.Session is opened (the previous one was never closed and rebound
# the global `sess`), and no new graph ops are created per cutoff.
for cutoff in [0.0001, 0.001, 0.01, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8]:
    y_pred_bin = y_pred > cutoff
    print('Cutoff: {:,}'.format(cutoff))
    print('Hamming loss: {:,}'.format(
        hamming_loss(y_test, y_pred_bin)
    ))
    for j_type in ['micro', 'macro', 'weighted']:
        print('Jaccard {} score: {:,}'.format(
            j_type,
            jaccard_score(y_test, y_pred_bin, average=j_type)
        ))
    print('')

In [None]:
y_pred

In [None]:
# From https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
def entropy(labels, base=None):
    """Compute the Shannon entropy of the empirical distribution of labels.

    Args:
        labels: sequence (or array) of label values; equal values count as
            the same class.
        base: logarithm base. Defaults to e (result in nats); use 2 for bits.

    Returns:
        The entropy as a float, or 0 when there are fewer than two labels or
        only one distinct value.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0

    _, counts = np.unique(labels, return_counts=True)
    if len(counts) <= 1:
        return 0

    probs = counts / n_labels
    # The original referenced bare `e` and `log`, which were never imported;
    # NumPy (already imported by the notebook) provides both. Every entry of
    # probs is > 0 because np.unique only reports values that occur.
    log_base = np.e if base is None else base
    return float(-np.sum(probs * np.log(probs)) / np.log(log_base))

entropy(y_pred[:,0])

In [None]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix

y_pred = text_model.predict(X_test, batch_size=32, verbose=1)

# This is a multi-label problem: every tag gets its own yes/no decision, so each
# probability is thresholded independently. np.argmax would collapse each row to
# a single class index, which does not match the shape of the indicator matrix
# y_test and is rejected by both report functions below.
y_pred_bool = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred_bool))

print(multilabel_confusion_matrix(y_test, y_pred_bool))