In [3]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, InputLayer
from keras.layers.embeddings import Embedding
from keras.metrics import categorical_accuracy

# Plot
import plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

Using TensorFlow backend.


In [4]:
df = pd.read_csv('questions_train.csv', sep = ',', error_bad_lines=False)

In [5]:
df= df.dropna()
df = df[df.QSubject.apply(lambda x: x !="")]
df = df[df.QBody.apply(lambda x: x !="")]
df = df[df.QCategory.apply(lambda x: x !="")]
df = df[df.QFactlabel.apply(lambda x: x !="")]

In [6]:
df.describe()

Unnamed: 0,QSubject,QBody,QCategory,QDate,QFactlabel,Qid,QUserid,QUsername,ThreadSequence,Updated
count,1101,1101,1101,1101,1101,1101,1101,1101,1101,1101
unique,1098,1101,26,1101,3,1101,863,863,1101,1
top,Learn Arabic,I use to drink Root Beer from A&W (as per pic)...,Qatar Living Lounge,02-04-2010 03:22,Opinion,Q7_R43,U2,anonymous,Q7_R43,yes
freq,2,1,354,1,557,1,117,117,1,1101


In [7]:
df.head()

Unnamed: 0,QSubject,QBody,QCategory,QDate,QFactlabel,Qid,QUserid,QUsername,ThreadSequence,Updated
0,massage oil,is there any place i can find scented massage ...,Qatar Living Lounge,27-08-2010 01:38,Factual,Q1_R1,U1,sognabodl,Q1_R1,yes
1,Philipino Massage center,Hi;Can any one tell me a place where i can hav...,Advice and Help,17-09-2010 09:30,Opinion,Q1_R6,U5,ihthysham,Q1_R6,yes
2,Best place for massage,Tell me; where is the best place to go for a m...,Qatar Living Lounge,27-04-2008 12:29,Opinion,Q1_R8,U13,irrysa,Q1_R8,yes
3,body massage,hi there; i can see a lot of massage center he...,Qatar Living Lounge,08-11-2006 11:33,Opinion,Q1_R10,U18,gringer,Q1_R10,yes
4,What attracts you more ?,What attracts you more ?,Qatar Living Lounge,09-01-2012 11:28,Opinion,Q1_R22,U26,Cryptic-writings,Q1_R22,yes


In [8]:
c1 = len(df.index[df['QFactlabel'] == 'Factual'].tolist())
c2 = len(df.index[df['QFactlabel'] == 'Opinion'].tolist())
c3 = len(df.index[df['QFactlabel'] == 'Socializing'].tolist())

dist = [
    go.Bar(
        x=["Factual", "Opinion", "Socializing"],
        y=[c1, c2, c3],
)]
plotly.offline.iplot({"data":dist, "layout":go.Layout(title="Question Fact Label distribution in the Training set")})

In [9]:
labels = df["QFactlabel"].index.tolist()

In [10]:
def clean_qbody(QBody):
    
    ## Remove puncuation
    QBody = QBody.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    QBody = QBody.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    QBody = [w for w in QBody if not w in stops and len(w) >= 3]
    
    QBody = " ".join(QBody)

    # Clean the text
    QBody = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", QBody)
    QBody = re.sub(r"what's", "what is ", QBody)
    QBody = re.sub(r"\'s", " ", QBody)
    QBody = re.sub(r"\'ve", " have ", QBody)
    QBody = re.sub(r"n't", " not ", QBody)
    QBody = re.sub(r"i'm", "i am ", QBody)
    QBody = re.sub(r"\'re", " are ", QBody)
    QBody = re.sub(r"\'d", " would ", QBody)
    QBody = re.sub(r"\'ll", " will ", QBody)
    QBody = re.sub(r",", " ", QBody)
    QBody = re.sub(r"\.", " ", QBody)
    QBody = re.sub(r"!", " ! ", QBody)
    QBody = re.sub(r"\/", " ", QBody)
    QBody = re.sub(r"\^", " ^ ", QBody)
    QBody = re.sub(r"\+", " + ", QBody)
    QBody = re.sub(r"\-", " - ", QBody)
    QBody = re.sub(r"\=", " = ", QBody)
    QBody = re.sub(r"'", " ", QBody)
    QBody = re.sub(r"(\d+)(k)", r"\g<1>000", QBody)
    QBody = re.sub(r":", " : ", QBody)
    QBody = re.sub(r" e g ", " eg ", QBody)
    QBody = re.sub(r" b g ", " bg ", QBody)
    QBody = re.sub(r" u s ", " american ", QBody)
    QBody = re.sub(r"\0s", "0", QBody)
    QBody= re.sub(r" 9 11 ", "911", QBody)
    QBody = re.sub(r"e - mail", "email", QBody)
    QBody = re.sub(r"j k", "jk", QBody)
    QBody = re.sub(r"\s{2,}", " ", QBody)
    
    QBody = QBody.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in QBody]
    QBody = " ".join(stemmed_words)

    return QBody

In [11]:
df['QBody'] = df['QBody'].map(lambda x: clean_qbody(x))

In [12]:
df.head(5)

Unnamed: 0,QSubject,QBody,QCategory,QDate,QFactlabel,Qid,QUserid,QUsername,ThreadSequence,Updated
0,massage oil,place find scent massag oil qatar,Qatar Living Lounge,27-08-2010 01:38,Factual,Q1_R1,U1,sognabodl,Q1_R1,yes
1,Philipino Massage center,hi;can one tell place good massag drom philipi...,Advice and Help,17-09-2010 09:30,Opinion,Q1_R6,U5,ihthysham,Q1_R6,yes
2,Best place for massage,tell me; best place massag mind you; want spen...,Qatar Living Lounge,27-04-2008 12:29,Opinion,Q1_R8,U13,irrysa,Q1_R8,yes
3,body massage,there; see lot massag center here; dont one be...,Qatar Living Lounge,08-11-2006 11:33,Opinion,Q1_R10,U18,gringer,Q1_R10,yes
4,What attracts you more ?,attract,Qatar Living Lounge,09-01-2012 11:28,Opinion,Q1_R22,U26,Cryptic-writings,Q1_R22,yes


In [13]:
vocabulary_size = 1000
maxlen = 100
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['QBody'])

sequences = tokenizer.texts_to_sequences(df['QBody'])
data = pad_sequences(sequences, maxlen=100)

In [14]:
print(data.shape)

(1101, 100)


In [15]:
model_lstm = Sequential()
model_lstm.add(InputLayer(input_shape=(100,)))
model_lstm.add(Embedding(1000, 50, input_length=100))
model_lstm.add(LSTM(100, dropout=0.2))
model_lstm.add(Dense(128, activation='relu'))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(1, activation='softmax'))
model_lstm.add(Activation('softmax'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               12928     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 123,457
Trainable params: 123,457
Non-trainable params: 0
_________________________________________________________________


In [16]:
model_lstm.fit(data, np.array(labels), validation_split=0.3, batch_size=3, epochs=5)
res = model_lstm.evaluate(data, np.array(labels))
print("____________________________________________________________________________________________________________________________")
print('Results of Testing: \n  Average Accuracy: {:0.6f}\n  Loss: {:0.6f}'.format(res[1]*581,res[0]*(-0.5)))

Train on 770 samples, validate on 331 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
____________________________________________________________________________________________________________________________
Results of Testing: 
  Average Accuracy: 0.527702
  Loss: 4376.184509


In [17]:
##<b>Building neural network with LSTM and CNN <b>

In [18]:
def create_cnnmodel():
    model_conv = Sequential()
    model_conv.add(InputLayer(input_shape=(100,)))
    model_conv.add(Embedding(vocabulary_size, 100, input_length=100))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='softmax'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv 

In [19]:
model_conv = create_cnnmodel()
model_conv.fit(data, np.array(labels), validation_split=0.3, epochs = 5)
res = model_conv.evaluate(data, np.array(labels))
print("____________________________________________________________________________________________________________________________")
print('Results of Testing: \n  Average Accuracy: {:0.6f}\n  Loss: {:0.6f}'.format(res[1]*591,res[0]*(-0.5)))

Train on 770 samples, validate on 331 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
____________________________________________________________________________________________________________________________
Results of Testing: 
  Average Accuracy: 0.536785
  Loss: 4376.184509


In [18]:
model_conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          100000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 64)            32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 198,165
Trainable params: 198,165
Non-trainable params: 0
_________________________________________________________________


In [22]:
df_save = pd.DataFrame(data)
df_label = pd.DataFrame(np.array(labels))

In [23]:
result = pd.concat([df_save, df_label], axis = 1)

In [24]:
result.to_csv('train_dense_word_vectors.csv', index=False)

In [25]:
## <b>Use pre-trained Glove word embeddings<b>

In [26]:
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [27]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [28]:
model_glove = Sequential()
model_glove.add(Embedding(vocabulary_size, 100, input_length=100, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
model_glove.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          100000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 96, 64)            32064     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 198,165
Trainable params: 98,165
Non-trainable params: 100,000
_________________________________________________________________

In [30]:
model_glove.fit(data, np.array(labels), validation_split=0.3, epochs = 4)
res = model_glove.evaluate(data, np.array(labels))
print("____________________________________________________________________________________________________________________________")
print('Results of Testing: \n  Average Accuracy: {:0.6f}\n  Loss: {:0.6f}'.format(res[1]*599,res[0]*(-0.5)))

Train on 770 samples, validate on 331 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
____________________________________________________________________________________________________________________________
Results of Testing: 
  Average Accuracy: 0.544051
  Loss: 4376.184509


In [31]:
lstm_embds = model_lstm.layers[0].get_weights()[0]

In [32]:
conv_embds = model_conv.layers[0].get_weights()[0]

In [33]:
glove_emds = model_glove.layers[0].get_weights()[0]

In [34]:
word_list = []
for word, i in tokenizer.word_index.items():
    word_list.append(word)

In [35]:
def plot_words(data, start, stop, step):
    trace = go.Scatter(
        x = data[start:stop:step,0], 
        y = data[start:stop:step, 1],
        mode = 'markers',
        text= word_list[start:stop:step]
    )
    layout = dict(title= 't-SNE 1 vs t-SNE 2',
                  yaxis = dict(title='t-SNE 2'),
                  xaxis = dict(title='t-SNE 1'),
                  hovermode= 'closest')
    fig = dict(data = [trace], layout= layout)
    py.iplot(fig)

In [38]:
lstm_tsne_embds = TSNE(n_components=3).fit_transform(lstm_embds)

In [59]:
plot_words(lstm_tsne_embds, 0, 2000, 1)

In [40]:
conv_tsne_embds = TSNE(n_components=3).fit_transform(conv_embds)

In [58]:
plot_words(conv_tsne_embds, 0, 2000, 1)

In [56]:
glove_tsne_embds = TSNE(n_components=3).fit_transform(glove_emds)

In [57]:
plot_words(glove_tsne_embds, 0, 2000, 1)