### classify stackoverflow posts by considering each document a simple mean of its individual word embeddings, then using individual tag probabilities to assign tags to samples

In [1]:
import csv
import os
import re
import sys
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D
from keras.models import Model, Sequential

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import *

# my stuff in the helpers/ directory
from helpers import files_helper, texts_helper, metrics_helper, tags_helper

Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

119

In [3]:
np.random.seed(SEED)

In [4]:
texts, labels = files_helper.read_stackoverflow_sample_small()

In [5]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM=100
LABELS_MIN_DOC_COUNT = int(10)
BATCH_SIZE=1
NUM_EPOCHS=10
TOKENIZER_FILTERS='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
TAG_PROB_THRESHOLD=0.05

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
[inverse_word_index[idx] for idx in sequences[0]]

['apply',
 'onclick',
 'event',
 'to',
 'an',
 'option',
 'i',
 'am',
 'using',
 'zend',
 'form',
 'to',
 'create',
 'a',
 'form',
 'i',
 'am',
 'also',
 'using',
 'mootools',
 'for',
 'javascript',
 'this',
 'gt',
 'radio',
 'alone',
 'array',
 'label',
 'gt',
 'are',
 'you',
 'going',
 'to',
 'be',
 'taking',
 'part',
 'with',
 'anyone',
 'else',
 'required',
 'gt',
 'true',
 'onclick',
 'gt',
 'gt',
 'array',
 'yes',
 'gt',
 'yes',
 'no',
 'gt',
 'no',
 'at',
 'the',
 'moment',
 'the',
 'onclick',
 'event',
 'works',
 'if',
 'any',
 'option',
 'is',
 'selected',
 'how',
 'do',
 'i',
 'get',
 'it',
 'to',
 'work',
 'for',
 'just',
 'yes',
 'being',
 'selected']

In [8]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [9]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [10]:
# tag position => fraction of docs having that tag
tag_probabilities_index = tags_helper.get_probabilities_index(binary_labels)

In [11]:
# word => embedding
embeddings_index = files_helper.read_glove(d=EMBEDDING_DIM)

In [12]:
embedding_matrix = np.zeros((len(word_index)+1,EMBEDDING_DIM))

for word,i in word_index.items():
    
    if i >= MAX_NB_WORDS:
        continue
    
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
embedding_matrix.shape

(178778, 100)

In [14]:
# unpadded sequences. no call to pad_sequences
data = sequences
num_rows = len(data)

indices = np.arange(num_rows)

data = [ np.array(data[i]) for i in indices]

indices = np.arange(num_rows)

np.random.shuffle(indices)

labels_1 = binary_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * num_rows)

X_train = data[:-num_validation_samples]
Y_train = labels_1[:-num_validation_samples]
X_val = data[-num_validation_samples:]
Y_val = labels_1[-num_validation_samples:]

len(X_train),len(X_val),labels_1.shape

(25600, 6400, (32000, 1209))

In [18]:
# DEBUG: lengths of the first 10 documents
seq_lengths_train = [len(seq) for seq in data]
seq_lengths_train[:10]

[76, 209, 103, 92, 560, 143, 171, 148, 1048, 98]

In [27]:
%%time

num_labels = labels_1.shape[1]

model = Sequential()

model.add(Embedding(len(word_index)+1,
                           EMBEDDING_DIM,
                           weights=[embedding_matrix],
                           trainable = False))
model.add(GlobalAvgPool1D())

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(num_labels))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam')

for epoch_num in range(10):

    # training in single-example batches
    idxs = np.array(list(range(len(X_train))))
    np.random.shuffle(idxs)
       
    train_losses = []
    val_losses = []    
        
    for i in tqdm(idxs):

        train_x = X_train[i].reshape(1,-1)
        train_y = Y_train[i].reshape(1,-1)
        
        model.train_on_batch(train_x,train_y)
        train_losses.append(model.test_on_batch(train_x,train_y))
        
    # average of validation scores over whole validation set   
    idxs = np.array(list(range(len(X_val))))
    np.random.shuffle(idxs)

        
    for i in tqdm(idxs):   
        val_x = X_val[i].reshape(1,-1)
        val_y = Y_val[i].reshape(1,-1)
        
        
        val_losses.append(model.test_on_batch(val_x,val_y))
    
    train_loss = np.mean(np.array(train_losses))
    val_loss = np.mean(np.array(val_losses))
    
    tqdm.write("after epoch {0}: train_loss: {1}, val_loss: {2}".format(
        epoch_num,
        train_loss,
        val_loss))
            

100%|██████████| 25600/25600 [01:08<00:00, 372.24it/s]
100%|██████████| 6400/6400 [00:06<00:00, 812.64it/s] 
  0%|          | 25/25600 [00:00<01:44, 243.62it/s]

after epoch 0: train_loss: 0.013153879903256893, val_loss: 0.012133252806961536


100%|██████████| 25600/25600 [01:12<00:00, 355.35it/s]
100%|██████████| 6400/6400 [00:05<00:00, 1212.36it/s]
  0%|          | 33/25600 [00:00<01:18, 325.80it/s]

after epoch 1: train_loss: 0.012076538987457752, val_loss: 0.01215445064008236


100%|██████████| 25600/25600 [01:11<00:00, 356.75it/s]
100%|██████████| 6400/6400 [00:05<00:00, 1276.98it/s]
  0%|          | 38/25600 [00:00<01:08, 372.38it/s]

after epoch 2: train_loss: 0.011958181858062744, val_loss: 0.012971908785402775


100%|██████████| 25600/25600 [01:12<00:00, 354.48it/s]
100%|██████████| 6400/6400 [00:05<00:00, 1197.64it/s]
  0%|          | 34/25600 [00:00<01:16, 333.48it/s]

after epoch 3: train_loss: 0.01166074350476265, val_loss: 0.011564414948225021


 22%|██▏       | 5624/25600 [00:16<00:57, 346.64it/s]

KeyboardInterrupt: 

In [28]:
def get_scores(predicted_probabilities, predicted_indices):
    """
    predicted_probabilities (output of n sigmoid output units)
    predicted_indices (according to some strategy, e.g. static threshold, etc)
    """
    
    # indices that are turned on (equal 1)
    active_indices = predicted_indices.ravel().nonzero()[0]
        
    scores_for_active_indices = predicted_probabilities[active_indices]
    
    scores_for_active_indices_as_matrix = scores_for_active_indices.reshape(1,-1)
    
    return scores_for_active_indices_as_matrix
    

In [29]:
# sample result for a couple of test cases
num_test_cases = 10

for i in np.random.randint(low=0, high=len(Y_val), size=num_test_cases):

    actual_label_indices = Y_val[i].reshape(1,-1)
    actual_labels = lb.inverse_transform(actual_label_indices)
    actual_labels_tpl = actual_labels[0]
    
    predicted_tag_probabilities = model.predict(X_val[i].reshape(1,-1)).ravel()
              
    predicted_label_indices = tags_helper.get_predicted_indices_by_threshold(
        predicted_tag_probabilities,
        TAG_PROB_THRESHOLD)
       
#     predicted_label_indices = get_predicted_indices_by_tag_doc_fraction(
#         tag_probabilities_index,
#         predicted_tag_probabilities)
        
#     print(predicted_label_indices.nonzero()[0])
    
    predicted_label_scores_mat = get_scores(predicted_tag_probabilities,
                                       predicted_label_indices)
    
    predicted_label_scores_arr = predicted_label_scores_mat.ravel()
    
    predicted_labels = lb.inverse_transform(predicted_label_indices)

    predicted_labels_tpl = predicted_labels[0]
    
#     print(predicted_label_scores.ravel())
    
    tags_and_scores = sorted(list(zip(predicted_labels_tpl,predicted_label_scores_arr)), key=lambda tpl: tpl[1], reverse=True)
  

    print(actual_labels_tpl)
    print(tags_and_scores)
    print('\n')


('jar', 'java', 'netbeans')
[('javascript', 0.09530472), ('java', 0.091640197), ('php', 0.087819368), ('c#', 0.079533748), ('android', 0.077264994), ('jquery', 0.063962288), ('python', 0.062537163), ('html', 0.051446669)]


('dataframe', 'r')
[('javascript', 0.093118869), ('java', 0.09103056), ('php', 0.085039146), ('c#', 0.078287423), ('android', 0.075730354), ('jquery', 0.063308433), ('python', 0.06063414)]


('animation', 'ios', 'iphone', 'swift3')
[('javascript', 0.090115122), ('java', 0.087930478), ('php', 0.081183553), ('c#', 0.074570678), ('android', 0.073187307), ('jquery', 0.060236402), ('python', 0.058433939)]


()
[('javascript', 0.091660246), ('java', 0.089986801), ('php', 0.082986042), ('c#', 0.076710582), ('android', 0.074333116), ('jquery', 0.062237859), ('python', 0.058679678)]


('go',)
[('javascript', 0.092242248), ('java', 0.09046711), ('php', 0.085395887), ('c#', 0.078096718), ('android', 0.075273827), ('jquery', 0.062632553), ('python', 0.059128549)]


('c#', 'visu

           22%|██▏       | 5624/25600 [00:30<01:46, 187.29it/s]