### classify stackoverflow posts using tf-idf features, using neural nets

- note: use the same tokenizer rules as Keras

In [1]:
import csv
import os
import re
import sys
import numpy as np
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D
from keras.models import Model, Sequential

from sklearn import preprocessing, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline

# my stuff in the helpers/ directory
from helpers import files_helper, texts_helper, metrics_helper, tags_helper

Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

694

In [3]:
np.random.seed(seed=SEED)

In [4]:
texts, labels = files_helper.read_stackoverflow_sample_small()     

In [5]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM=100
LABELS_MIN_DOC_COUNT = int(10)
BATCH_SIZE=32
NUM_EPOCHS=10
TOKENIZER_FILTERS='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [8]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [9]:
# tag position => fraction of docs having that tag
# snooping
tag_incidence_index = tags_helper.get_incidence_index(binary_labels)

In [10]:
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [11]:
data = tokenized_texts
indices = np.arange(len(data))
np.random.shuffle(indices)

labels_1 = binary_labels[indices]

data = [data[i] for i in indices]
num_validation_samples = int(VALIDATION_SPLIT * len(data))

data_train = data[:-num_validation_samples]
data_val = data[-num_validation_samples:]

In [12]:
# TfidfVectorizer = CountVectorizer + TFidfTransformer

# does this configure snooping?
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(data)
X_train = vect.transform(data_train)
X_val = vect.transform(data_val)

Y_train = labels_1[:-num_validation_samples]
Y_val = labels_1[-num_validation_samples:]

X_train = X_train.todense()
X_val = X_val.todense()

X_train.shape,X_val.shape,Y_train.shape,Y_val.shape

((25600, 19843), (6400, 19843), (25600, 1209), (6400, 1209))

In [27]:
model = Sequential()

model.add(Dense(64,activation='relu',input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(Y_train.shape[1], activation='sigmoid'))

model.compile(loss ='binary_crossentropy',
             optimizer='adam')

In [28]:
%%time

model.fit(X_train,Y_train, validation_data=(X_val, Y_val),
         epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min 46s, sys: 15.7 s, total: 2min 2s
Wall time: 1min 50s


<keras.callbacks.History at 0x7f20846533c8>

In [29]:
Y_train_pred = []

for x_train in X_train:
    y_train_pred = model.predict(x_train.reshape(1,-1))
    Y_train_pred.append(y_train_pred)

Y_train_pred = np.vstack(Y_train_pred)

In [30]:
tag_probability_index = tags_helper.get_probability_index(Y_train_pred)

In [31]:
def get_scores(predicted_probabilities, predicted_indices,debug=False):
    """
    predicted_probabilities (output of n sigmoid output units)
    predicted_indices (according to some strategy, e.g. static threshold, etc)
    """
    
    # indices that are turned on (equal 1)
    active_indices = predicted_indices.ravel().nonzero()[0]
    
    if debug:
        print("active_indices: {0}".format(active_indices))
    
    scores_for_active_indices = predicted_probabilities[active_indices]
    
    scores_for_active_indices_as_matrix = scores_for_active_indices.reshape(1,-1)
    
    return scores_for_active_indices_as_matrix

In [32]:
# sample result for a couple of test cases
num_test_cases = 10
STRATEGY='raw_probability'
THRESHOLD=0.0
PROB_INDEX = tag_probability_index
INCIDENCE_INDEX = tag_incidence_index
LIMIT=5

for i in np.random.randint(low=0, high=len(Y_val), size=num_test_cases):

    actual_label_indices = Y_val[i].reshape(1,-1)
    
#     print(actual_label_indices.ravel().nonzero()[0])
    
    actual_labels = lb.inverse_transform(actual_label_indices)
    actual_labels_tpl = actual_labels[0]
    
    predicted_tag_probabilities = model.predict(X_val[i].reshape(1,-1)).ravel()
              
    predicted_label_indices = tags_helper.get_tag_assignment(
        STRATEGY,
        predicted_tag_probabilities,
        probability_threshold=THRESHOLD,
        tag_probability_index=PROB_INDEX,
        tag_incidence_index=INCIDENCE_INDEX,
        limit=LIMIT)
         
    predicted_label_scores_mat = get_scores(
        predicted_tag_probabilities,
        predicted_label_indices)
    
    actual_label_scores_mat = get_scores(
        predicted_tag_probabilities,
        actual_label_indices)
    
    predicted_label_scores_arr = predicted_label_scores_mat.ravel()
    predicted_labels = lb.inverse_transform(predicted_label_indices)
    predicted_labels_tpl = predicted_labels[0]
      
    predicted_tags_and_scores = sorted(list(zip(predicted_labels_tpl,predicted_label_scores_arr)), key=lambda tpl: tpl[1], reverse=True)
    
    actual_label_scores_arr = actual_label_scores_mat.ravel()
    actual_tags_and_scores = list(zip(actual_labels_tpl,actual_label_scores_arr))

#     print(actual_labels)    
    print(actual_labels_tpl)
    print(actual_tags_and_scores)
    print(predicted_tags_and_scores)
    print('\n')


('mysql', 'sql')
[('mysql', 0.045405958), ('sql', 0.030455803)]
[('c#', 0.1007242), ('php', 0.09106537), ('javascript', 0.081930041), ('mysql', 0.045405958), ('java', 0.032797832)]


('jmeter', 'sockets', 'tcp')
[('jmeter', 0.0010490319), ('sockets', 0.00269819), ('tcp', 0.0015959966)]
[('java', 0.10699419), ('c#', 0.10270346), ('python', 0.069945998), ('android', 0.052568179), ('c++', 0.03743761)]


('html', 'python', 'python-3.x', 'utf-8')
[('html', 0.012556964), ('python', 0.046015762), ('python-3.x', 0.0018797769), ('utf-8', 0.00074715132)]
[('c#', 0.10835931), ('php', 0.063400105), ('java', 0.052590553), ('python', 0.046015762), ('javascript', 0.042629264)]


('sharepoint-2010',)
[('sharepoint-2010', 0.00046313854)]
[('c#', 0.11171436), ('java', 0.071123652), ('python', 0.058841936), ('php', 0.043891869), ('.net', 0.030625902)]


('html', 'javascript')
[('html', 0.26131648), ('javascript', 0.41209)]
[('javascript', 0.41209), ('jquery', 0.40793902), ('html', 0.26131648), ('css', 0.

In [33]:
X_test_list = []

for validation_features in X_val:
    X_test_list.append(validation_features.reshape(1,-1))

X_test = np.vstack(X_test_list)    

In [34]:
Y_pred_lst = [model.predict(x_test.reshape(1,-1)) for x_test in X_test]

In [35]:
Y_pred_calculated_lst = [tags_helper.get_tag_assignment(
    STRATEGY,
    y_pred,
    probability_threshold=THRESHOLD,
    tag_probability_index=PROB_INDEX,
    tag_incidence_index=INCIDENCE_INDEX,
    limit=LIMIT) for y_pred in np.vstack(Y_pred_lst)]

Y_pred = np.vstack(Y_pred_calculated_lst)

Y_val.shape,Y_pred.shape

((6400, 1209), (6400, 1209))

In [36]:
micro_f1 = metrics_helper.calculate_multilabel_metrics(Y_val,Y_pred)
print('micro-averaged F1 score (validation set) is {0}'.format(micro_f1))

micro_prec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='precision')
print('micro-averaged precision score (validation set) is {0}'.format(micro_prec))

micro_rec = metrics_helper.calculate_multilabel_metrics(Y_val, Y_pred, metric='recall')
print('micro-averaged recall score (validation set) is {0}'.format(micro_rec))

micro-averaged F1 score (validation set) is 0.18318453336758944
micro-averaged precision score (validation set) is 0.1336875
micro-averaged recall score (validation set) is 0.2908818929761338
