### Classify Stack Overflow posts with convolutional networks using IDF-weighted word embeddings, then use the individual tag probabilities to assign tags to samples

In [17]:
import csv
import os
import re
import sys
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# my stuff in the helpers/ directory
from helpers import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper

In [2]:
# Draw a fresh seed on every run and echo it as the cell output, so the value
# that was used is recorded next to the results. To reuse the seed of an
# earlier run, replace the random draw with the value that run displayed.
SEED=np.random.randint(1,1000)
SEED

59

In [3]:
# Seed NumPy's global RNG; the shuffle before the train/validation split and the
# random choice of sample cases further down both draw from it.
np.random.seed(SEED)

In [4]:
# Load the corpus through the project helper: `texts` feeds the Keras tokenizer
# and `labels` feeds the tag pre-processing in the following cells.
# NOTE(review): the helper name suggests a small Stack Overflow sample that was
# already tokenized with the Stanford tokenizer -- confirm in files_helper.
texts, labels = files_helper.read_stackoverflow_sample_small_stanford_tokenized()

In [5]:
# --- text / vocabulary -------------------------------------------------------
MAX_NB_WORDS = 200000         # `num_words` cap for the tokenizer, `max_features` for TF-IDF
MAX_SEQUENCE_LENGTH = 1000    # documents are truncated / padded to this many tokens
TOKENIZER_FILTERS = ''        # I will perform tokenization myself
EMBEDDING_DIM = 100           # width of the word vectors

# --- labels ------------------------------------------------------------------
LABELS_MIN_DOC_COUNT = 10     # threshold handed to tags_helper.truncate_labels

# --- training ----------------------------------------------------------------
VALIDATION_SPLIT = 0.2        # fraction of the documents held out for validation
BATCH_SIZE = 32
NUM_EPOCHS = 10

In [6]:
# Fit the Keras tokenizer on the whole corpus. TOKENIZER_FILTERS is empty, so no
# characters are filtered out of the (already tokenized) texts.
tokenizer = Tokenizer(filters=TOKENIZER_FILTERS, num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# word -> integer id, plus the reverse lookup built by the project helper
word_index = tokenizer.word_index
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

# every document as a list of word ids
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# Pre-process the raw tag lists with the project helper before binarizing them.
# NOTE(review): presumably this drops tags that occur in fewer than
# LABELS_MIN_DOC_COUNT documents -- confirm in tags_helper.truncate_labels.
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [8]:
# Encode each document's tag collection as a 0/1 indicator row. `lb` is kept
# around because later cells use `lb.inverse_transform` to map indicator rows
# back to tag names.
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [9]:
# tag position => fraction of docs having that tag
# snooping: `binary_labels` still contains every document at this point (the
# train/validation split happens further down), so this index also reflects
# documents that end up in the validation set.
tag_incidence_index = tags_helper.get_incidence_index(binary_labels)

In [10]:
# Rebuild a space-joined text for every document from its word ids, keeping at
# most the first MAX_SEQUENCE_LENGTH tokens of each document.
tokenized_texts = [
    " ".join(inverse_word_index[word_id] for word_id in seq[:MAX_SEQUENCE_LENGTH])
    for seq in sequences
]

In [11]:
# Extract IDF weights that are used to weight the word embeddings.
#
# snooping: the vectorizer is fitted on every document, including those that
# later end up in the validation split.
# NOTE(review): TfidfVectorizer is used with its default analyzer here, so its
# vocabulary may not line up one-to-one with the Keras tokenizer's word_index --
# confirm this is intended.
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the replacement when it exists and fall back to
# the old method on older installs.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

idf = vect.idf_

# term -> IDF weight (feature names and idf_ are aligned position by position)
idf_index = dict(zip(feature_names, idf))

embeddings_index = files_helper.read_glove_stackoverflow_weighted(
    EMBEDDING_DIM,
    weight_index=idf_index)

overall, 102180 out of 163036 embeddings were weighted. Total available embeddings: 799210


In [12]:
# Assemble the weight matrix that is handed to the Embedding layer below via
# `weights=[embedding_matrix]`.
# NOTE(review): presumably row i holds the vector of the word whose id is i, and
# words without a pre-trained vector get a default row -- confirm in
# embeddings_helper.build_embedding_matrix.
embedding_matrix = embeddings_helper.build_embedding_matrix(
    word_index, 
    embeddings_index, 
    MAX_NB_WORDS, 
    EMBEDDING_DIM)

In [13]:
# Sanity check: display the (rows, columns) shape of the embedding matrix.
embedding_matrix.shape

(221514, 100)

In [14]:
# Pad / truncate every id sequence to MAX_SEQUENCE_LENGTH so that all documents
# stack into a single 2-D array.
data = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)

# Shuffle the documents and their label rows with the same permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels_1 = binary_labels[indices]

# Hold out the last VALIDATION_SPLIT fraction of the shuffled documents.
# Slice at an explicit boundary instead of by a negative count: with zero
# validation samples `[:-0]` would leave the training set empty and `[-0:]`
# would put every document into the validation set.
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - num_validation_samples

X_train, X_val = data[:split_at], data[split_at:]
Y_train, Y_val = labels_1[:split_at], labels_1[split_at:]

data.shape,labels_1.shape

((32000, 1000), (32000, 1209))

In [19]:
%%time

# one sigmoid output unit per tag column
num_labels = labels_1.shape[1]

# Frozen embedding lookup, initialised from the pre-computed embedding matrix.
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)

# Three convolution stages (the first two followed by max-pooling), a global
# max-pool, one hidden dense layer and the per-tag sigmoid outputs.
model = Sequential([
    embedding_layer,
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(num_labels, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

# loss doesn't get better after 5 epochs
model.fit(X_train, Y_train,
          validation_data=(X_val, Y_val),
          epochs=NUM_EPOCHS,
          batch_size=BATCH_SIZE,
          verbose=1)

Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 3min 52s, sys: 41.1 s, total: 4min 33s
Wall time: 5min 5s


In [20]:
# Predicted tag probabilities for every training document, one row per document.
# A single batched call replaces the former one-`predict`-call-per-sample loop;
# the resulting array has the same (num_samples, num_labels) layout as stacking
# the per-sample rows (values agree up to floating-point rounding).
Y_train_pred = model.predict(X_train, batch_size=BATCH_SIZE)

In [21]:
# Build a per-tag index from the model's predictions on the training documents;
# it is passed to tags_helper.get_tag_assignment as `tag_probability_index`.
# NOTE(review): presumably tag position => average predicted probability --
# confirm in tags_helper.get_probability_index.
tag_probability_index = tags_helper.get_probability_index(Y_train_pred)

In [22]:
def get_scores(predicted_probabilities, predicted_indices,debug=False):
    """
    Pick out the probabilities of the tags that are switched on in an assignment.

    predicted_probabilities: output of n sigmoid output units for ONE sample,
        given as a flat sequence of length n or as a single (1, n) row
    predicted_indices: 0/1 tag indicators for the same n tags (according to
        some strategy, e.g. static threshold, etc), flat or as a (1, n) row
    debug: when True, print the positions of the active tags

    Returns a NumPy array of shape (1, k) with the probabilities of the k
    active tags, ordered by ascending tag position.
    """

    # Flatten both inputs up front: indexing a (1, n) row directly with tag
    # positions would address rows instead of tags, and plain lists have no
    # ravel() method.
    probabilities = np.asarray(predicted_probabilities).ravel()
    indicators = np.asarray(predicted_indices).ravel()

    # positions that are turned on (non-zero)
    active_indices = indicators.nonzero()[0]

    if debug:
        print("active_indices: {0}".format(active_indices))

    # keep the (1, k) matrix layout callers already rely on
    return probabilities[active_indices].reshape(1,-1)
    

In [68]:
# sample result for a couple of test cases
num_test_cases = 10
STRATEGY= 'raw_probability'
THRESHOLD=0.2
PROB_INDEX = tag_probability_index
INCIDENCE_INDEX = tag_incidence_index
LIMIT=5

# random validation rows to inspect (drawn with replacement)
sample_positions = np.random.randint(low=0, high=len(Y_val), size=num_test_cases)

for i in sample_positions:

    # ground truth: indicator row -> tuple of tag names
    actual_label_indices = Y_val[i].reshape(1,-1)
    actual_labels_tpl = lb.inverse_transform(actual_label_indices)[0]

    # model output: one probability per tag for this document
    predicted_tag_probabilities = model.predict(X_val[i].reshape(1,-1)).ravel()

    # turn the probabilities into a 0/1 tag assignment with the chosen strategy
    predicted_label_indices = tags_helper.get_tag_assignment(
        STRATEGY,
        predicted_tag_probabilities,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
    predicted_labels_tpl = lb.inverse_transform(predicted_label_indices)[0]

    # probabilities the model gave to the assigned tags and to the true tags
    predicted_scores = get_scores(
        predicted_tag_probabilities,
        predicted_label_indices).ravel()
    actual_scores = get_scores(
        predicted_tag_probabilities,
        actual_label_indices).ravel()

    actual_tags_and_scores = list(zip(actual_labels_tpl, actual_scores))

    # assigned tags, most probable first
    predicted_tags_and_scores = sorted(
        zip(predicted_labels_tpl, predicted_scores),
        key=lambda tpl: tpl[1],
        reverse=True)

    print(actual_labels_tpl)
    print(actual_tags_and_scores)
    print(predicted_tags_and_scores)
    print('\n')


('asp.net', 'jquery')
[('asp.net', 0.86500221), ('jquery', 0.58499646)]
[('asp.net', 0.86500221), ('jquery', 0.58499646), ('asp.net-mvc', 0.28049606), ('javascript', 0.24019884), ('ajax', 0.21513642)]


('ios', 'objective-c', 'uiscrollview')
[('ios', 0.60343879), ('objective-c', 0.15165217), ('uiscrollview', 0.070708558)]
[('ios', 0.60343879)]


('language-agnostic',)
[('language-agnostic', 0.060039759)]
[('c#', 0.59760648), ('.net', 0.31434503), ('unit-testing', 0.30677679), ('wcf', 0.29693729)]


()
[]
[('android', 0.87314147), ('xamarin.android', 0.35488662), ('xamarin', 0.34827629), ('android-studio', 0.3112618)]


('.net', 'asp.net-membership')
[('.net', 0.05973123), ('asp.net-membership', 0.0037366163)]
[('c#', 0.47495869), ('asp.net', 0.44962323)]


('ruby-on-rails',)
[('ruby-on-rails', 0.61187947)]
[('ruby', 0.62753755), ('ruby-on-rails', 0.61187947), ('erb', 0.28728431)]


('c', 'c++', 'visual-studio-2010')
[('c', 0.070304424), ('c++', 0.14026287), ('visual-studio-2010', 0.007

In [69]:
# X_val is already a 2-D array with one row per validation document, so
# reshaping each row and stacking them again only rebuilt the same array.
# Take a copy directly; X_test stays independent of X_val as before.
X_test = np.array(X_val)

In [70]:
# One batched predict call instead of one call per validation sample. The
# result is split back into a list of (1, num_labels) rows, which is the shape
# the per-sample calls produced and what the next cell stacks with np.vstack.
Y_pred_lst = [row.reshape(1,-1) for row in model.predict(X_test, batch_size=BATCH_SIZE)]

In [71]:
# Apply the tag-assignment strategy to every row of predicted probabilities and
# stack the resulting assignments into one matrix aligned with Y_val.
Y_pred_calculated_lst = []

for y_pred in np.vstack(Y_pred_lst):
    tag_assignment = tags_helper.get_tag_assignment(
        STRATEGY,
        y_pred,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
    Y_pred_calculated_lst.append(tag_assignment)

Y_pred = np.vstack(Y_pred_calculated_lst)

# both shapes must agree for the metrics computed next
Y_val.shape,Y_pred.shape

((6400, 1209), (6400, 1209))

In [72]:
# Score the tag assignments against the true labels of the validation set.
# The first call relies on the helper's default metric; its result is reported
# as F1 below.
micro_f1 = metrics_helper.calculate_multilabel_metrics(Y_val,Y_pred)
micro_prec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='precision')
micro_rec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='recall')

# (message template, value) pairs, printed in the order F1 / precision / recall
score_report = (
    ('micro-averaged F1 score (validation set) is {0}', micro_f1),
    ('micro-averaged precision score (validation set) is {0}', micro_prec),
    ('micro-averaged recall score (validation set) is {0}', micro_rec),
)

for message_template, score in score_report:
    print(message_template.format(score))

micro-averaged F1 score (validation set) is 0.4587737160218494
micro-averaged precision score (validation set) is 0.4394971513500124
micro-averaged recall score (validation set) is 0.47981880873504157
