### classify stackoverflow posts by considering each document a simple mean of its individual word embeddings (glove trained on stackoverflow corpus), then using individual tag probabilities to assign tags to samples

In [1]:
import csv
import os
import re
import sys
import numpy as np

import cProfile

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D
from keras.models import Model, Sequential

from nltk.tokenize import StanfordTokenizer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# my stuff in the helpers/ directory
from helpers import files_helper, texts_helper, metrics_helper, tags_helper

Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

179

In [3]:
np.random.seed(SEED)

In [4]:
texts, labels = files_helper.read_stackoverflow_sample_small_stanford_tokenized()

In [13]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
LABELS_MIN_DOC_COUNT = int(10)
BATCH_SIZE=32
EMBEDDING_DIM=50
NUM_EPOCHS=10
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
[inverse_word_index[idx] for idx in sequences[1]]

['replacing',
 'asp',
 'datagrid',
 'control',
 'to',
 'custom',
 'jquery',
 'control',
 '(',
 'telerik',
 ',',
 'or',
 'other',
 'library',
 ')',
 '.',
 'we',
 'have',
 'a',
 'traditional',
 'webform',
 'aspx',
 'application',
 '.',
 'we',
 'are',
 'using',
 'so',
 'many',
 'asp',
 'server',
 'controls',
 'in',
 'our',
 'application',
 'which',
 'in',
 'ui',
 'look',
 'n',
 'feel',
 '.',
 'i',
 'have',
 'been',
 'looking',
 'into',
 'telerik',
 'and',
 'other',
 'asp',
 'custom',
 'control',
 'library',
 '.',
 'we',
 'are',
 'also',
 'looking',
 'for',
 'open',
 'source',
 'library',
 'which',
 'doesnt',
 'come',
 'with',
 'license',
 '.',
 'i',
 'want',
 'to',
 'ask',
 'how',
 'can',
 'we',
 'start',
 'replacing',
 'all',
 'the',
 'asp',
 '.',
 'net',
 'control',
 'to',
 'any',
 'good',
 'ui',
 'custom',
 'control',
 'library',
 '.',
 'as',
 'we',
 'want',
 'to',
 'change',
 'as',
 'much',
 'less',
 'our',
 'code',
 '.',
 'i',
 'need',
 'pointers',
 'to',
 'open',
 'source',
 'asp',


In [8]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [9]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [10]:
# tag position => fraction of docs having that tag
# snooping
tag_incidence_index = tags_helper.get_incidence_index(binary_labels)

In [11]:
# word => embedding
embeddings_index = files_helper.read_glove_stackoverflow()

In [14]:
embedding_matrix = np.zeros((len(word_index)+1,EMBEDDING_DIM))

for word,i in word_index.items():
    
    if i >= MAX_NB_WORDS:
        continue
    
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
embedding_matrix.shape

(221514, 50)

In [16]:
data = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)


indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels_1 = binary_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

X_train = data[:-num_validation_samples]
Y_train = labels_1[:-num_validation_samples]
X_val = data[-num_validation_samples:]
Y_val = labels_1[-num_validation_samples:]

data.shape,labels_1.shape

((32000, 1000), (32000, 1209))

In [17]:
%%time

num_labels = labels_1.shape[1]

model = Sequential()

model.add(Embedding(len(word_index)+1,
                           EMBEDDING_DIM,
                           weights=[embedding_matrix],
                           trainable = True))

model.add(GlobalAvgPool1D())

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(num_labels))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam')

# loss doesn't get better after 5 epochs
model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    verbose=1,
                    validation_data=(X_val,Y_val))

Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 4min 3s, sys: 42.3 s, total: 4min 46s
Wall time: 4min 40s


In [18]:
Y_train_pred = []

for x_train in X_train:
    y_train_pred = model.predict(x_train.reshape(1,-1))
    Y_train_pred.append(y_train_pred)

Y_train_pred = np.vstack(Y_train_pred)

In [19]:
tag_probability_index = tags_helper.get_probability_index(Y_train_pred)

In [20]:
def get_scores(predicted_probabilities, predicted_indices,debug=False):
    """
    predicted_probabilities (output of n sigmoid output units)
    predicted_indices (according to some strategy, e.g. static threshold, etc)
    """
    
    # indices that are turned on (equal 1)
    active_indices = predicted_indices.ravel().nonzero()[0]
    
    if debug:
        print("active_indices: {0}".format(active_indices))
    
    scores_for_active_indices = predicted_probabilities[active_indices]
    
    scores_for_active_indices_as_matrix = scores_for_active_indices.reshape(1,-1)
    
    return scores_for_active_indices_as_matrix
    

In [21]:
lb.transform([set(['c++'])]).ravel().nonzero()[0]

array([142])

In [22]:
i=555

tag_incidence_index[535] # java
tag_incidence_index[1059] # twitter-bootstrap
tag_incidence_index[692] # mysql
tag_incidence_index[i],tag_probability_index[i] # c++

(0.00050000000000000001, 0.00043709378)

In [23]:
# sample result for a couple of test cases
num_test_cases = 10
STRATEGY= 'raw_probability'
THRESHOLD=0.0
PROB_INDEX = tag_probability_index
INCIDENCE_INDEX = tag_incidence_index
LIMIT=5

for i in np.random.randint(low=0, high=len(Y_val), size=num_test_cases):

    actual_label_indices = Y_val[i].reshape(1,-1)
    
#     print(actual_label_indices.ravel().nonzero()[0])
    
    actual_labels = lb.inverse_transform(actual_label_indices)
    actual_labels_tpl = actual_labels[0]
    
    predicted_tag_probabilities = model.predict(X_val[i].reshape(1,-1)).ravel()
              
    predicted_label_indices = tags_helper.get_tag_assignment(
        STRATEGY,
        predicted_tag_probabilities,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
         
    predicted_label_scores_mat = get_scores(
        predicted_tag_probabilities,
        predicted_label_indices)
    
    actual_label_scores_mat = get_scores(
        predicted_tag_probabilities,
        actual_label_indices)
    
    predicted_label_scores_arr = predicted_label_scores_mat.ravel()
    predicted_labels = lb.inverse_transform(predicted_label_indices)
    predicted_labels_tpl = predicted_labels[0]
      
    predicted_tags_and_scores = sorted(list(zip(predicted_labels_tpl,predicted_label_scores_arr)), key=lambda tpl: tpl[1], reverse=True)
    
    actual_label_scores_arr = actual_label_scores_mat.ravel()
    actual_tags_and_scores = list(zip(actual_labels_tpl,actual_label_scores_arr))

#     print(actual_labels)    
    print(actual_labels_tpl)
    print(actual_tags_and_scores)
    print(predicted_tags_and_scores)
    print('\n')


('.net', 'c#', 'reflection')
[('.net', 0.018170882), ('c#', 0.080765337), ('reflection', 0.00099084817)]
[('javascript', 0.097420081), ('java', 0.096437894), ('android', 0.083209895), ('c#', 0.080765337), ('python', 0.048720963)]


('heroku', 'redis', 'ruby-on-rails')
[('heroku', 0.001593275), ('redis', 0.00076053332), ('ruby-on-rails', 0.021641981)]
[('javascript', 0.12611869), ('php', 0.078404635), ('c#', 0.061392266), ('python', 0.058223628), ('java', 0.057592258)]


('postgresql',)
[('postgresql', 0.0042420845)]
[('javascript', 0.12511379), ('php', 0.075365968), ('c#', 0.061400376), ('java', 0.058919415), ('python', 0.056080882)]


('sql-server-2008-r2', 'tsql')
[('sql-server-2008-r2', 0.00079613377), ('tsql', 0.0025349618)]
[('javascript', 0.10653213), ('java', 0.083808847), ('c#', 0.075967059), ('android', 0.070427403), ('php', 0.05675137)]


('android', 'image', 'xml')
[('android', 0.062301196), ('image', 0.0042317118), ('xml', 0.0089169191)]
[('javascript', 0.11036478), ('java'

In [24]:
X_test_list = []

for validation_features in X_val:
    X_test_list.append(validation_features.reshape(1,-1))

X_test = np.vstack(X_test_list)

In [25]:
Y_pred_lst = [model.predict(x_test.reshape(1,-1)) for x_test in X_test]

In [26]:
Y_pred_calculated_lst = [tags_helper.get_tag_assignment(
    STRATEGY,
    y_pred,
    probability_threshold=THRESHOLD,
    tag_probability_index=PROB_INDEX,
    tag_incidence_index=INCIDENCE_INDEX,
    limit=LIMIT) for y_pred in np.vstack(Y_pred_lst)]

Y_pred = np.vstack(Y_pred_calculated_lst)

Y_val.shape,Y_pred.shape

((6400, 1209), (6400, 1209))

In [27]:
micro_f1 = metrics_helper.calculate_multilabel_metrics(Y_val,Y_pred)
print('micro-averaged F1 score (validation set) is {0}'.format(micro_f1))

micro_prec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='precision')
print('micro-averaged precision score (validation set) is {0}'.format(micro_prec))

micro_rec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='recall')
print('micro-averaged recall score (validation set) is {0}'.format(micro_rec))

micro-averaged F1 score (validation set) is 0.13895420924366153
micro-averaged precision score (validation set) is 0.1015625
micro-averaged recall score (validation set) is 0.21992150493977533
