### Classify Stack Overflow posts by representing each document as a weighted mean of its individual word embeddings, then using individual tag probabilities to assign tags to samples

In [1]:
import csv
import os
import re
import sys
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D
from keras.models import Model, Sequential

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# my stuff in the helpers/ directory
from helpers import (
    embeddings_helper,
    files_helper,
    metrics_helper,
    tags_helper,
    texts_helper)    

Using TensorFlow backend.


In [2]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same shuffle,
# train/validation split and sampled test cases. Drawing the seed at random
# (np.random.randint(1,1000)) gave a different split on every run; 89 is the
# value that was drawn in the recorded run of this notebook.
SEED = 89
SEED

89

In [3]:
# seed numpy's global RNG; np.random.shuffle and np.random.randint are used
# further down for the train/validation shuffle and for picking sample cases
np.random.seed(SEED)

In [4]:
# load the post texts and their labels (tags)
# NOTE(review): the exact element types are defined by files_helper, which is
# not visible here — confirm against the helper
texts, labels = files_helper.read_stackoverflow_sample_small()

In [5]:
# --- corpus / vocabulary settings ---
# vocabulary cap handed to the keras Tokenizer, the TfidfVectorizer and the
# embedding-matrix builder
MAX_NB_WORDS = 20000
# documents are truncated / padded to this many tokens
MAX_SEQUENCE_LENGTH = 1000
# cut-off passed to tags_helper.truncate_labels
LABELS_MIN_DOC_COUNT = 10
TOKENIZER_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'  # default keras filters

# --- model / training settings ---
EMBEDDING_DIM = 100
BATCH_SIZE = 32
NUM_EPOCHS = 10
# fraction of the shuffled samples held out for validation
VALIDATION_SPLIT = 0.2

In [6]:
# build the vocabulary from all texts; num_words and filters come from the
# configuration cell above
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
# each text becomes a list of integer word indices
sequences = tokenizer.texts_to_sequences(texts)
# word -> index mapping learned by the tokenizer
word_index = tokenizer.word_index
# reverse lookup; used below as inverse_word_index[idx] to get the word back
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
# debugging aid (kept disabled): decode the first document back into its words
# [inverse_word_index[idx] for idx in sequences[0]]

In [8]:
# Rebuild every document as a space-joined string made of the words behind
# its first MAX_SEQUENCE_LENGTH word indices (index -> word lookup through
# inverse_word_index).
tokenized_texts = [
    " ".join(inverse_word_index[word_id]
             for word_id in doc_sequence[:MAX_SEQUENCE_LENGTH])
    for doc_sequence in sequences
]

In [9]:
# reduce the label set using the LABELS_MIN_DOC_COUNT cut-off
# NOTE(review): presumably tags that occur in fewer documents than the cut-off
# are dropped — confirm in tags_helper.truncate_labels
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [10]:
# encode the per-document tag collections as a binary indicator matrix
# (one column per tag); lb is kept to map indicator rows back to tag names later
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [11]:
# tag position => fraction of docs having that tag
# snooping: computed from the labels of ALL documents, i.e. before the
# train/validation split that is made further down
tag_incidence_index = tags_helper.get_incidence_index(binary_labels)

In [12]:
# Fit a TF-IDF vectorizer solely to obtain one IDF weight per vocabulary term;
# those weights are handed to the GloVe reader to weight the embeddings.

# snooping: the vectorizer is fitted on every document, including the ones
# that later end up in the validation split
vect = TfidfVectorizer(max_features=MAX_NB_WORDS)
vect.fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_
# term -> IDF weight
idf_index = dict(zip(feature_names, idf))

embeddings_index = files_helper.read_glove_weighted(d=EMBEDDING_DIM,
                                                    weight_index=idf_index)

overall, 10787 out of 18699 embeddings were weighted. Total available embeddings: 400000


In [13]:
# assemble the weight matrix that is handed to the Embedding layer in the
# model cell below
# NOTE(review): presumably row i holds the vector of the word whose index is i
# — confirm in embeddings_helper.build_embedding_matrix
embedding_matrix = embeddings_helper.build_embedding_matrix(
    word_index, 
    embeddings_index, 
    MAX_NB_WORDS, 
    EMBEDDING_DIM)

In [14]:
# sanity check of the matrix dimensions before it is passed to the Embedding layer
embedding_matrix.shape

(194869, 100)

In [15]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries (keras
# pad_sequences with its default padding/truncating behaviour).
data = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)

# Shuffle samples and labels with the same permutation so rows stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels_1 = binary_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on a non-negative boundary. Slicing with -num_validation_samples is
# wrong when that number is 0 (tiny data set or VALIDATION_SPLIT == 0):
# data[:-0] is empty and data[-0:] is the whole array, i.e. train and
# validation would be swapped.
num_train_samples = data.shape[0] - num_validation_samples

X_train = data[:num_train_samples]
Y_train = labels_1[:num_train_samples]
X_val = data[num_train_samples:]
Y_val = labels_1[num_train_samples:]

data.shape,labels_1.shape

((32000, 1000), (32000, 1209))

In [16]:
%%time

# one sigmoid output unit per tag column of the label matrix
num_labels = labels_1.shape[1]

# Architecture: frozen embedding lookup initialised from embedding_matrix ->
# mean over the sequence axis -> two dropout-regularised hidden layers ->
# independent sigmoid per tag.
model = Sequential([
    Embedding(len(word_index)+1,
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              trainable=False),
    GlobalAvgPool1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_labels),
    Activation('sigmoid'),
])

# binary cross-entropy treats every tag as its own yes/no decision
model.compile(optimizer='adam',
              loss='binary_crossentropy')

# loss doesn't get better after 5 epochs
model.fit(X_train, Y_train,
          validation_data=(X_val,Y_val),
          epochs=NUM_EPOCHS,
          batch_size=BATCH_SIZE,
          verbose=1)

Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 47.6 s, sys: 3.95 s, total: 51.6 s
Wall time: 34.3 s


In [17]:
# Predict the tag probabilities of the whole training set with one batched
# call. The previous version called model.predict once per document (tens of
# thousands of single-row calls) and stacked the results; the outcome has the
# same layout — one row per training sample, one column per tag — but the
# batched call avoids the per-call overhead.
Y_train_pred = model.predict(X_train, batch_size=BATCH_SIZE)

In [18]:
# per-tag statistic derived from the model's predicted probabilities on the
# training set; passed to tags_helper.get_tag_assignment further down
# NOTE(review): the exact definition lives in tags_helper.get_probability_index
# (not visible here) — confirm
tag_probability_index = tags_helper.get_probability_index(Y_train_pred)

In [19]:
def get_scores(predicted_probabilities, predicted_indices,debug=False):
    """
    Pick the predicted probabilities of the tags that are switched on.

    predicted_probabilities (output of n sigmoid output units); any array-like
        with n elements, e.g. a flat (n,) vector or a single (1, n) row as
        returned by model.predict
    predicted_indices (according to some strategy, e.g. static threshold, etc);
        array-like 0/1 indicator with the same number of elements, non-zero
        entries mark the active tags
    debug: when True, print the positions of the active tags

    Returns a (1, k) array holding the probabilities of the k active tags,
    ordered by ascending tag position ((1, 0) when no tag is active).
    """

    # indices that are turned on (equal 1); np.asarray lets plain lists through
    active_indices = np.asarray(predicted_indices).ravel().nonzero()[0]

    if debug:
        print("active_indices: {0}".format(active_indices))

    # flatten first so that a (1, n) row is indexed by tag position rather
    # than by row number (which raised IndexError before)
    flat_probabilities = np.asarray(predicted_probabilities).ravel()

    return flat_probabilities[active_indices].reshape(1,-1)
    

In [40]:
# Qualitative check: for a handful of randomly drawn validation documents,
# print the true tags (with the probability the model gave each of them)
# next to the tags chosen by the assignment strategy.
num_test_cases = 10
STRATEGY= 'relative_difference_wrt_estimated_tag_probability'
THRESHOLD=0.0
PROB_INDEX = tag_probability_index
INCIDENCE_INDEX = tag_incidence_index
LIMIT=5

for sample_idx in np.random.randint(low=0, high=len(Y_val), size=num_test_cases):

    # ground truth as a (1, n_tags) indicator row and as a tuple of tag names
    true_indicator_row = Y_val[sample_idx].reshape(1,-1)
    true_tags = lb.inverse_transform(true_indicator_row)[0]

    # flat vector with one predicted probability per tag
    tag_probabilities = model.predict(X_val[sample_idx].reshape(1,-1)).ravel()

    # indicator of the tags selected by the configured strategy
    assigned_indicator = tags_helper.get_tag_assignment(
        STRATEGY,
        tag_probabilities,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
    assigned_tags = lb.inverse_transform(assigned_indicator)[0]

    assigned_scores = get_scores(tag_probabilities, assigned_indicator).ravel()
    true_scores = get_scores(tag_probabilities, true_indicator_row).ravel()

    # predicted tags are listed from most to least probable
    predicted_tags_and_scores = sorted(
        zip(assigned_tags, assigned_scores),
        key=lambda pair: pair[1],
        reverse=True)
    actual_tags_and_scores = list(zip(true_tags, true_scores))

    print(true_tags)
    print(actual_tags_and_scores)
    print(predicted_tags_and_scores)
    print('\n')


('database', 'sqlite', 'sqlite3')
[('database', 0.0093550459), ('sqlite', 0.0040667453), ('sqlite3', 0.00044535275)]
[('decimal', 0.00055970135), ('statistics', 0.00030605335), ('linq-to-entities', 0.00029505332), ('case', 0.00020693819), ('protocols', 0.00019685121)]


('android', 'client-server', 'server', 'synchronization')
[('android', 0.085020743), ('client-server', 0.00031716994), ('server', 0.00098405813), ('synchronization', 0.00020135252)]
[('gradle', 0.0016105613), ('jenkins', 0.0013448956), ('phonegap-plugins', 0.00054052169), ('vmware', 0.0004133372), ('cocoapods', 0.00027361105)]


('cocoa', 'osx', 'swift')
[('cocoa', 0.0008243788), ('osx', 0.0025663357), ('swift', 0.0044037555)]
[('java', 0.11521658), ('python', 0.063943155), ('sql', 0.025384475), ('sql-server', 0.013555542), ('algorithm', 0.0084099676)]


('python', 'twitter')
[('python', 0.055704534), ('twitter', 0.00079012767)]
[('iphone', 0.013477795), ('cordova', 0.0042445827), ('video', 0.0010379954), ('powershell-v

In [41]:
# Turn every validation row into a (1, sequence_length) row vector and stack
# them back into a single 2-D array.
X_test_list = [val_row.reshape(1,-1) for val_row in X_val]

X_test = np.vstack(X_test_list)

In [42]:
# Run the model once over the whole test matrix instead of once per sample
# (thousands of single-row predict calls), then cut the result back into
# per-sample (1, num_labels) rows so Y_pred_lst keeps its list-of-row-vectors
# layout for the np.vstack in the next cell.
Y_pred_lst = [row.reshape(1,-1)
              for row in model.predict(X_test, batch_size=BATCH_SIZE)]

In [43]:
# Convert each row of predicted probabilities into a tag assignment using the
# strategy and parameters configured in the sample-inspection cell, then stack
# the per-sample assignments into one matrix aligned with Y_val.
Y_pred_calculated_lst = []

for y_pred in np.vstack(Y_pred_lst):
    assignment = tags_helper.get_tag_assignment(
        STRATEGY,
        y_pred,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
    Y_pred_calculated_lst.append(assignment)

Y_pred = np.vstack(Y_pred_calculated_lst)

Y_val.shape,Y_pred.shape

((6400, 1209), (6400, 1209))

In [44]:
# Score the assigned tags against the validation labels.
score_fn = metrics_helper.calculate_multilabel_metrics

# no metric argument: the helper's default metric is reported as F1
micro_f1 = score_fn(Y_val, Y_pred)
print('micro-averaged F1 score (validation set) is {0}'.format(micro_f1))

micro_prec = score_fn(Y_val, Y_pred, metric='precision')
print('micro-averaged precision score (validation set) is {0}'.format(micro_prec))

micro_rec = score_fn(Y_val, Y_pred, metric='recall')
print('micro-averaged recall score (validation set) is {0}'.format(micro_rec))

micro-averaged F1 score (validation set) is 0.03403197462603403
micro-averaged precision score (validation set) is 0.0248125
micro-averaged recall score (validation set) is 0.054153594325467194
