In [1]:


import pandas
import os
import sys
import numpy as np
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


Using TensorFlow backend.


In [2]:
from bs4 import BeautifulSoup
import re

def cleaner(str):
    soup = BeautifulSoup(str)
    str1 = soup.get_text()
    str1 = str1.replace('\\n', ' ')
    str2 = str1.replace("\\","") # str2 = str1.replace("\\"," ") # remove non-ascii???
    str3 = str2.replace("("," ")
    str4 = str3.replace(")"," ")
    str5 = re.sub("[0-9]|\.|{|}|\^|;|=|/|'" , " ", str4)
    return str5

MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2


In [3]:

# In[3]:


BASE_DIR = '../'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')


# In[8]:


print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


# In[9]:


# second, prepare text samples and their labels
print('Processing text dataset')


df = pandas.read_csv('test2.csv' , low_memory=False)
d = df.to_dict()

docs = []
labels = []
for key2 in d['conceptCode/0']:
	if pandas.isna(d['conceptCode/0'][key2]) or pandas.isna(d['content/0/solutionContent'][key2]) or pandas.isna(d['content/0/questionContent'][key2]) :
		pass
	else:
		labl = cleaner(d['conceptCode/0'][key2]).rstrip()
		if (labl == "P"):
			labels.append(0)
		elif (labl == "C"):
			labels.append(1)
		elif (labl == "M") :
			labels.append(2)
		else : # Current others label ;; Like miscellaneous
			labels.append(3)

		# Need some good string parsing here
		strin = cleaner(d['content/0/solutionContent'][key2]) + " " + cleaner(d['content/0/questionContent'][key2])
		docs.append( strin )


# In[17]:


print(labels.count(0))
print(labels.count(1))
print(labels.count(2))
print(labels.count(3))


# In[18]:


tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


# In[19]:


indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]


Indexing word vectors.
Found 400000 word vectors.
Processing text dataset
12959
15540
14284
12498
Found 42101 unique tokens.
('Shape of data tensor:', (55281, 1000))
('Shape of label tensor:', (55281, 4))


In [8]:
# In[20]:


print('Preparing embedding matrix.')

# prepare embedding matrix
i1 = 0
j = 0

num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        i1 = i1 + 1
    else:
        j = j + 1
        if j % 100 == 0:
            print(repr(word))
        embedding_matrix[i] = np.random.random_sample((100))


Preparing embedding matrix.
u'searle\u2019s'
u'\xa0one'
u'\u21d2xge'
u'are\xa0'
u'\xd7pleft'
u'dyright'
u'hclmathrel'
u'\xa0friday\xa0'
u'professionalcityprofessionabhubaneshwarpharmacistbhyderbadprofessorcmumbaiartistdbangaloreengineereahmedabadlawyerfchennaidoctorgjaipurcounsellor'
u'tonoplast'
u'equation\xa0of'
u'hchoxrightarrow'
u'food\u2019'
u'\u2013oh'
u'solutionright'
u'ixy'
u'rqp'
u'bsqrt'
u'\xa0\xa0force'
u'negativethinmathspace'
u'following\xa0'
u'tan\xa0'
u'e\xd7'
u'friction\xa0'
u'product\xa0'
u'\u201ct\u201d'
u'dtimes'
u'rtleft'
u'there\u2019'
u's\u2019p'
u'millicurie'
u'pteridophyta'
u'saraswativijayam'
u"q'"
u'brownright'
u'oxright'
u'abcabc\u2026'
u'y\xe2\xb1'
u"mv'"
u'x\xa0'
u"avogadro's"
u'rrightarrow'
u'ikfm'
u'xln'
u'epropto'
u'ii\xa0'
u'\xa0iv'
u'isspace'
u'officerfqclerkgpresearch'
u'beena\u2019s'
u'\xa0\xa0s'
u'abqr'
u'sulphates'
u'balpha'
u'eadcb'
u'\xe2\xb1left'
u'iright'
u'\xa0hangs'
u'ktimesfrac'
u'statements\xa0is'
u'voso'
u'borazine'
u'pwidehat'
u'ocy'
u'ms

In [5]:
print(j)

6799


In [7]:
print(i1)

13201
