# Creating a Baseline Tag Labeler

Here we will use [XGBoost](https://xgboost.readthedocs.io/en/latest/python/python_api.html) and [scikit-learn](https://scikit-learn.org/stable/) to create a baseline multi-class, multi-label classifier that will label our sample of [Stack Overflow](http://stackoverflow.com) posts (questions and their answers), two thirds of which lack labels. This will serve as a basis of comparison for the deep network we will train therafter. We will create separate [embeddings](https://keras.io/layers/embeddings/) of their language and code and use these as the signal for our model.

In [1]:
import pandas as pd
import numpy as np
import re

#### Load our sample of 100K questions/answers

In [2]:
posts_df = pd.read_json('data/stackoverflow/sample/CombinedDocs.100K.Questions.jsonl.gz', lines=True)
print('Original posts: {:,}'.format(len(posts_df.index)))

Original posts: 273,876


#### Drop posts without tags

In [3]:
tag_posts = posts_df.dropna(axis=0, subset=['_Tags'])
print('Posts w/ tags: {:,}'.format(len(tag_posts.index)))

Posts w/ tags: 98,278


#### Strip tag names from their XML tags

In [4]:
tags = tag_posts['_Tags'].apply(lambda x: re.findall('\<(.+?)\>', x))
len(tags)

98278

#### Determine which tags to predict

We choose to limit tags to those with at least 50 occurrences, which means 709 labels to predict.

In [5]:
from collections import defaultdict
# import seaborn as sns

tag_counts = defaultdict(int)

for row in tags:
    for tag in row:
        tag_counts[tag] += 1

for i in [0, 10, 20, 50, 100, 1000]:
    filtered_tags = list(filter(lambda x: x > i, tag_counts.values()))
    print('There are {:,} tags with more than {:,} count'.format(len(filtered_tags), i))
    # t = pd.Series(filtered_tags, name="Tag Count")
    # ax = sns.distplot(t)

MIN_TAGS = 50

There are 18,549 tags with more than 0 count
There are 2,730 tags with more than 10 count
There are 1,589 tags with more than 20 count
There are 709 tags with more than 50 count
There are 367 tags with more than 100 count
There are 33 tags with more than 1,000 count


#### Compute maps between tag ID and labels, and a list of labels with > 50 instances

These will be used when presenting results and their tag-wise performance below.

In [6]:
all_tags = set()
for row in tags:
    for tag in row:
        if tag_counts[tag] > MIN_TAGS:
            all_tags.add(tag)
print('Total unique tags with {:,} occurrences: {:,}'.format(MIN_TAGS, len(all_tags)))
sorted_all_tags = sorted(all_tags)

tag_to_id = {val:i for i, val in enumerate(sorted_all_tags)}
id_to_tag = {i:val for i, val in enumerate(sorted_all_tags)}

Total unique tags with 50 occurrences: 709


#### Create a matrix of tags 709 elements wide, one for each tag

In [7]:
labels = []
tag_list = tags.tolist()

# Loop through every post...
for i, tag_set in enumerate(tags.tolist()):
    # Then build a 709 element wide list for each tag present
    label_row = []
    for tag in sorted_all_tags:
        if tag in tag_list[i]:
            label_row.append(1)
        else:
            label_row.append(0)
    labels.append(label_row)
    
tag_labels = [id_to_tag[key_id] for key_id in sorted(id_to_tag.keys()) if tag_counts[id_to_tag[key_id]] > MIN_TAGS]

len(labels), len(labels[0])

(98278, 709)

#### Use `BeautifulSoup` to extract language and code from posts separately

Note that what we really need are character level embeddings for code... but we'll start with word embeddings.

Once we remove infrequent words we need to filter empty posts and create a new set of labels matching the indexes of the unfiltered posts.

In [8]:
from bs4 import BeautifulSoup

MIN_CODE = 20

def extract_code(x):
    doc = BeautifulSoup(x)
    return '\n'.join([d.text.lower() for d in doc.find_all('code')])
    
post_code_text = tag_posts._Body.apply(extract_code)
post_code_words = [x.split() for x in post_code_text.tolist()]

# Count the words for testing
code_word_counts = defaultdict(int)
for post in post_code_words:
    for word in post:
        code_word_counts[word] += 1

# Take words with > MIN_CODE (20) instances
post_code_words = [[y for y in x if code_word_counts[y] > MIN_CODE] for x in post_code_words]

# Create a new list of labels to match the new non-empty lists of words
code_post_ids = defaultdict(bool)
filtered_code_words = []
for i, post in enumerate(post_code_words):
    if len(post) == 0:
        pass
    else:
        code_post_ids[i] = True
        filtered_code_words.append(post)
        
del post_code_words
len(filtered_code_words)

72019

In [9]:
new_labels = []
for post_id in code_post_ids.keys():
    label_set = labels[post_id]
    if code_post_ids[post_id]:
        new_labels.append(label_set)

del labels
len(new_labels)

72019

In [10]:
# Validate the posts match the labels
assert(len(filtered_code_words) == len(new_labels))
print('We are left with {:,} example posts'.format(len(filtered_code_words)))

We are left with 72,019 example posts


#### Sample the data to speed development

In [11]:
import random
random.seed(33)

SAMPLE_SIZE = 10000
id_list = list(range(0, len(filtered_code_words)))
idx = random.sample(id_list, SAMPLE_SIZE)

# idx = np.random.choice(np.arange(len(matrix_posts)), SAMPLE_SIZE, replace=False)

sampled_posts = [x for i, x in enumerate(filtered_code_words) if i in idx]
sampled_labels = [x for i, x in enumerate(new_labels) if i in idx]

del filtered_code_words
del new_labels

len(sampled_posts), len(sampled_labels)

(10000, 10000)

#### REMINDER: When we add text words we must combine the two valid label lists and then create a new list of labels

In [12]:
# MIN_TEXT = 20

# def extract_text(x):
#     doc = BeautifulSoup(x)
#     codes = doc.find_all('code')
#     [code.extract() if code else None for code in codes]
#     return doc.text

# post_text = tag_posts._Body.apply(extract_text)
# post_text_words = [x.split() for x in post_text.tolist()]

# # Take words with > MIN_TEXT (20) instances
# post_text_words = [[y for y in x if tag_counts[y] > MIN_TEXT] for x in post_text_words]

# # Create a new list of labels to match the new non-empty lists of words
# text_post_ids = defaultdict(bool)
# text_post_id_list = []
# for i, post in enumerate(post_text_words):
#     if len(post) == 0:
#         pass
#     else:
#         text_post_ids[i] = True
#         text_post_id_list.append(i)

#### Encode the tags, replacing their string form with their respective IDs

In [13]:
# encoded_tags = []
# raw_tags = []
# for tagset in coded_tags:
#    encoded_tags.append([1 if id in tagset else 0 for id in id_to_tag.keys()])

# labels = np.array(encoded_tags)

# encoded_tags[0]

## Create a Baseline Gradient Boosted Decision Tree Model

It is useful to have a decision tree model to use as a baseline for comparison with our deep network model. XGBoost's implementation of gradient boosted decision trees is state of the art for this kind of application, but it can't do multi-class, multi-label classification. Therefore we use an [`xgboost.XGBClassifier`](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) with an [`sklearn.multiclass.OneVsRestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) on top of the XGBoost classifier to train one classifier model per label and then apply them to each label to compute the output for each.

We define `VOCAB_SIZE`, `MAX_LENGTH` and `TEST_SPLIT` to define the number of unique words as input into our embedding, the sequence length for each input, and the test/train split for our performance testing.

#### Encode the data using Gensim and Word2Vec

For the network, we'll create our own embeddings. For the baseline model we'll use Word2Vec.

In [14]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

from gensim.sklearn_api import W2VTransformer
from gensim.models import Word2Vec

import xgboost as xgb

VOCAB_SIZE = 2000
MAX_LENGTH = 40
EMBEDDING_SIZE = 32

TEST_SPLIT = 0.2

w2v_model = Word2Vec(
    sampled_posts,
    size=EMBEDDING_SIZE,
    min_count=1,
    window=10,
    workers=10,
    iter=10,
    seed=33
)

In [15]:
w2v_model.wv.most_similar(positive='exception')

[('thread', 0.8380829691886902),
 ('exception:', 0.8132432699203491),
 ('reason:', 0.8094411492347717),
 ('unhandled', 0.8072988986968994),
 ('thrown', 0.8006988763809204),
 ('uncaught', 0.7943539619445801),
 ('"main"', 0.787979245185852),
 ('timed', 0.787109375),
 ('asynctask', 0.7860561609268188),
 ('threw', 0.7687987089157104)]

In [16]:
encoded_posts = [[w2v_model.wv[word] for word in post] for post in sampled_posts]
len(encoded_posts)

10000

### Pad and limit the posts to MAX_LENGTH (20) words using the average of all words in the corpus

We will now compute a position-wise maximum and minimum, concatenate these values, and use them to pad any documents with less than 20 words. We will simultaneously truncate any documents with more than 20 words. If we were creating our own embeddings using keras we would use [`keras.preprocessing.pad_sequences`](https://keras.io/preprocessing/sequence/#pad_sequences), but using [`gensim.models.word2vec`](https://radimrehurek.com/gensim/models/word2vec.html) we pad them on our own.

See [Representation learning for very short texts using weighted word embedding aggregation](https://arxiv.org/pdf/1607.00570.pdf) referenced from [Stack Overflow](https://datascience.stackexchange.com/a/17348/59975).

In [17]:
from math import ceil

padded_posts = []
for post in encoded_posts:
    # Pad short posts with alternating min/max
    if len(post) < MAX_LENGTH:
        pointwise_min = np.minimum.reduce(post)
        pointwise_max = np.maximum.reduce(post)
        padding = [pointwise_max, pointwise_min]
        
        post += padding * ceil((MAX_LENGTH - len(post) / 2.0))
        
    # Shorten long posts or those odd number length posts we padded to 51
    if len(post) > MAX_LENGTH:
        post = post[:MAX_LENGTH]
      
    padded_posts.append(post)

# Verify their lengths
assert(min([len(post) for post in padded_posts]) == MAX_LENGTH)
assert(max([len(post) for post in padded_posts]) == MAX_LENGTH)
assert(min([len(label) for label in sampled_labels]) == 709)
assert(max([len(label) for label in sampled_labels]) == 709)

# Free up the RAM, since we copied the data
del encoded_posts
len(padded_posts), len(padded_posts[0])

(10000, 40)

In [18]:
len(sampled_labels), len(sampled_labels[0])

(10000, 709)

#### Convert the 3D feature array into a wider 2D array

The classifier requires 2D data, so we need to convert our 3D feature array into a wider 2D feature array. We will do this by iterating through the 50 padded elements of Word2Vec vectors for each post and appending them to a long list for each post.

Note that the type of `padded_posts` is `list(list(np.array))`, an artifact of the Word2Vec mapping.

#### Create one Row per Label Column

Training a `sklearn.multiclass.OneVsRestClassifier` with one `xgboost.XGBClassifier` per label exceeded 64GB of RAM and so we are remapping the data to have one instance of the row for each label column in a given row.

For example:

```python
# Input
rows, labels = [0.1, 0.3, 0.4, ...],[0,1,0,1]

# Output
rows_w_labels = [
    ([0.1, 0.3, 0.4, ...], 0),
    ([0.1, 0.3, 0.4, ...], 1),
    ([0.1, 0.3, 0.4, ...], 0),
    ([0.1, 0.3, 0.4, ...], 1)
]
```

In [19]:
import cupy as cp

row_length = MAX_LENGTH * EMBEDDING_SIZE

matrix_posts = []
flat_labels = []
print_shape = True
print(len(padded_posts), len(sampled_labels))
for i, (post, labels) in enumerate(zip(padded_posts, sampled_labels)):
    # Starting with an empty array and append the entire list of embedded words to it, 
    # expanding it's shape to (5000,)
    post = cp.array(post)
    if print_shape:
        print(post.shape)
    post_row = cp.concatenate(post, axis=0)
    if print_shape:
        print(post_row.shape)
    assert(post_row.shape == (row_length,))
    
    # Now add a downward dimension to the data, expanding its dimensions to (1,5000)
    post_row = cp.expand_dims(post_row, axis=0)
    if print_shape:
        print(post_row.shape)
    assert(post_row.shape == (1,row_length))
    
    if print_shape:
        print(len(sampled_labels), len(sampled_labels[0]))
    
    # Sample the labels to see which to emit. I should really do this by relative frequency.
    SAMPLE_SIZE = 50
    id_list = list(range(0, 709))
    idx = random.sample(id_list, SAMPLE_SIZE)
    for j, label in enumerate(sampled_labels[i]):
        if print_shape:
            print(i, j, label, type(label))
            print_shape = False
        
        if j in idx:
            matrix_posts.append(post_row)
            flat_labels.append(label)
        else:
            continue

# Memory conservation is critical
del padded_posts
len(matrix_posts), len(flat_labels)

10000 10000
(40, 32)
(1280,)
(1, 1280)
10000 709
0 0 0 <class 'int'>


(500000, 500000)

#### Sample the Data Once Again

In [20]:
# SAMPLE_SIZE = 1000
# id_list = list(range(0, len(matrix_posts)))
# idx = random.sample(id_list, SAMPLE_SIZE)
# sampled_posts = [post for i, post in enumerate(matrix_posts) if i in idx]
# sampled_labels = [label for i, label in enumerate(flat_labels) if i in idx]

# del matrix_posts
# del flat_labels
# len(sampled_posts), len(sampled_labels)

#### Convert from GPU `cupy.ndarray` to main memory `numpy.ndarray`

In [21]:
matrix_posts = cp.asnumpy(cp.concatenate(matrix_posts, axis=0))

### Train the Baseline Model

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    matrix_posts,
    flat_labels,
    test_size=TEST_SPLIT,
    random_state=33
)
del matrix_posts
del flat_labels

In [23]:
print(X_train.shape, X_test.shape, len(y_train), len(y_test))
print(X_train.dtype, X_test.dtype, type(y_train), type(y_test))
print(type(X_train))

(400000, 1280) (100000, 1280) 400000 100000
float32 float32 <class 'list'> <class 'list'>
<class 'numpy.ndarray'>


In [24]:
from scipy import sparse

X_train = sparse.csr_matrix(X_train)

params = {
    'booster': 'gbtree',
    'silent': 0,
}

clf = OneVsRestClassifier(
    xgb.XGBClassifier(
        learning_rate=0.2,
        n_estimators=100,
        objective='binary:logistic',
        nthread=1,
        tree_method='gpu_hist'
    ), 
    n_jobs=1,
)
%timeit clf.fit(X_train[:200000], y_train[:200000])

29.8 s ± 312 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(
#     n_estimators=100,
#     max_depth=3,
#     random_state=33,
#     n_jobs=12
# )
# clf.fit(X_train, y_train)

In [26]:
%%bash

echo 'Fitting done!' | ~/bin/twilio-sms 404-317-3620

Sending SMS to 404-317-3620 from 678-264-3702...done


In [27]:
# d_train = xgb.DMatrix(X_train, label=y_train)
# d_test =  xgb.DMatrix(X_test, label=y_test)

In [28]:
# from sklearn.model_selection import cross_val_score

# cross_val_score(clf, X_train, y_train, cv=2, scoring='accuracy')

KeyboardInterrupt: 

In [30]:
from sklearn.metrics import (
    roc_curve, precision_recall_curve, auc, make_scorer, recall_score, 
    accuracy_score, jaccard_score, precision_score, confusion_matrix
)

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=False, target_names=['yes', 'no'])#tag_labels)

  'precision', 'predicted', average, warn_for)


In [None]:
print(report)

              precision    recall  f1-score   support

         yes       1.00      1.00      1.00     99697
          no       0.00      0.00      0.00       303

    accuracy                           1.00    100000
   macro avg       0.50      0.50      0.50    100000
weighted avg       0.99      1.00      1.00    100000



In [None]:
accuracy_score(y_test, y_pred)

jaccard_score(y_test, y_pred)


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold

plt.style.use("ggplot")

xgb_params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}

In [None]:
from keras import Sequential
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True)
tokenizer.fit_on_texts(post_code)
sequences = tokenizer.texts_to_sequences(post_code)
X = pad_sequences(sequences, maxlen=MAX_LENGTH)
X.shape

model = Sequential()
model.add(Embedding(1000, 64, input_length=MAX_LENGTH))


In [None]:
model