In [1]:
import pandas
import os
import sys
import numpy as np
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


Using TensorFlow backend.


In [2]:
from bs4 import BeautifulSoup
import re

def cleaner(str):
    """Strip markup and noisy punctuation from a raw text field.

    Parameters
    ----------
    str : text
        Raw, possibly HTML, content. The parameter keeps its original name for
        backward compatibility even though it shadows the builtin inside this
        function.

    Returns
    -------
    The visible text of the input in which literal backslash-n pairs,
    parentheses, digits and the characters ``. { } ^ ; = /`` have each been
    replaced by a space, and any remaining backslashes have been removed.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results vary by machine.
    text = BeautifulSoup(str, "html.parser").get_text()
    # Literal backslash-n pairs become spaces first; only then are the
    # remaining backslashes dropped, otherwise the pairs would lose their
    # marker and leave a stray "n" behind.
    text = text.replace('\\n', ' ').replace("\\", "")
    # A single raw-string character class covers the parentheses, the digits
    # and the punctuation that were previously handled by separate steps.
    return re.sub(r"[()0-9.{}^;=/]", " ", text)

# Length passed to pad_sequences and used as the model's input shape.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Tokenizer and used to size the embedding matrix.
MAX_NUM_WORDS = 20000
# Width of each word vector; the GloVe file read by this notebook is the 100d one.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2


In [3]:
# Parent of the current working directory; the GloVe folder is looked up beneath it.
BASE_DIR = '../'
# Folder from which glove.6B.100d.txt is read when indexing the word vectors.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')


In [8]:
print('Indexing word vectors.')

# Map every token in the GloVe file to its float32 vector. Each line of the
# file is a token followed by its whitespace-separated coefficients.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
embeddings_index = {}
with open(glove_path) as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


In [9]:
# Build two parallel lists from the CSV: the cleaned text of every usable row
# and the integer class id derived from its concept code.
print('Processing text dataset')

df = pandas.read_csv('test2.csv' , low_memory=False)
columns = df.to_dict()

# Concept codes that have a dedicated class id; any other code is lumped into
# the catch-all class 3.
known_labels = {"P": 0, "C": 1, "M": 2}

docs = []
labels = []
for row_id in columns['conceptCode/0']:
    # Skip rows where the label or either text field is missing.
    if (pandas.isna(columns['conceptCode/0'][row_id])
            or pandas.isna(columns['content/0/solutionContent'][row_id])
            or pandas.isna(columns['content/0/questionContent'][row_id])):
        continue

    code = cleaner(columns['conceptCode/0'][row_id]).rstrip()
    labels.append(known_labels.get(code, 3))

    # TODO: smarter string parsing. For now the document is the cleaned
    # solution text followed by the cleaned question text.
    solution_text = cleaner(columns['content/0/solutionContent'][row_id])
    question_text = cleaner(columns['content/0/questionContent'][row_id])
    docs.append(solution_text + " " + question_text)


Processing text dataset


In [17]:
# Class balance: one count per label id, in the order 0 (P), 1 (C), 2 (M), 3 (other).
for class_id in (0, 1, 2, 3):
    print(labels.count(class_id))

12959
15540
14284
12498


In [18]:
# Turn the cleaned documents into fixed-length integer sequences and the
# integer class ids into one-hot rows.
NUM_CLASSES = 4  # P, C, M and the catch-all "other" class

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Fix the width explicitly: left to infer it, to_categorical emits fewer than
# four columns whenever the highest class id is absent from the data, and the
# 4-unit softmax output would then not match the targets.
labels = to_categorical(np.asarray(labels), num_classes=NUM_CLASSES)
# Format the shapes into the message so the line prints as plain text instead
# of as a tuple under Python 2's print statement.
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))


Found 42101 unique tokens.
('Shape of data tensor:', (55281, 1000))
('Shape of label tensor:', (55281, 4))


In [19]:
# Shuffle samples and labels with the same permutation, then hold out the
# last VALIDATION_SPLIT fraction of them for validation.
SPLIT_SEED = 42  # fixed so the train/validation partition is repeatable

indices = np.arange(data.shape[0])
# A dedicated RandomState keeps the shuffle reproducible without touching
# NumPy's global random state.
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at a non-negative boundary: the negative-index form `[:-n]` produces
# an empty training set (and `[-n:]` the whole data set) when n is 0.
num_train_samples = data.shape[0] - num_validation_samples
x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]


In [20]:
print('Preparing embedding matrix.')

# Row i of the matrix is the initial vector for the word whose tokenizer
# index is i. Rows that are never assigned below stay all-zero.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        # Index falls outside the matrix; leave the word out.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Words without a GloVe vector start from uniform random values in
        # [0, 1). The draw is sized from EMBEDDING_DIM rather than a literal
        # 100 so the cell keeps working if the embedding width is changed.
        embedding_matrix[i] = np.random.random_sample(EMBEDDING_DIM)

Preparing embedding matrix.


In [21]:
# Embedding layer initialised from the prepared matrix. It is left trainable,
# so the vectors are updated together with the rest of the network.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)


In [22]:
from keras.layers import LSTM


In [23]:
print('Training model.')

# Build the classifier: three stacked LSTM layers over the embedded sequence,
# then a dense layer and a 4-way softmax over the label classes.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# The first two LSTMs return the full sequence so that the next LSTM still
# receives a timestep axis; the last one returns only its final output.
x = LSTM(128, dropout=0.2, return_sequences=True)(embedded_sequences)
x = LSTM(128, dropout=0.2, return_sequences=True)(x)
x = LSTM(128, dropout=0.2, return_sequences=False)(x)
x = Dense(128, activation='relu')(x)
preds = Dense(4, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
                optimizer='rmsprop',
                metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


Training model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         2000100   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000, 128)         117248    
_________________________________________________________________
lstm_2 (LSTM)                (None, 1000, 128)         131584    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 516  

In [24]:
# Train for 5 epochs in mini-batches of 128, passing the held-out split as
# validation data.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(x_val, y_val),
)


Train on 44225 samples, validate on 11056 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2993c9da50>

In [25]:
# Persist the trained network and the fitted tokenizer to disk.
model.save('topic104.h5')
# Use a context manager so the pickle file is flushed and closed right away
# instead of whenever the anonymous file object gets garbage-collected.
with open('Topictokenizer4.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)


In [None]:
# Classify a single query typed by the user and print the class probabilities.

# `raw_input` only exists on Python 2; fall back to `input` on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Dedicated names are used throughout this cell. Binding the text to `str`
# would shadow the builtin for the rest of the kernel session, and reusing
# `docs` / `sequences` / `data` would overwrite the training-time values.
query = read_line('Enter Your query to classify\n')

query_docs = [cleaner(query)]
query_sequences = tokenizer.texts_to_sequences(query_docs)
query_data = pad_sequences(query_sequences, maxlen=MAX_SEQUENCE_LENGTH)
np.set_printoptions(suppress=True)
output = model.predict(query_data, verbose=0)

# One probability per class, in label order 0 (P), 1 (C), 2 (M), 3 (other).
print(output.flatten())

