In [1]:
"""
Date : 14/06/2018
Time : 10:35
Version : 1.1
Dataset : Deutsch lernen
          Glove 6B(https://nlp.stanford.edu/projects/glove/)
Description : Performing Text Classification using Keras for different deep neural architectures.

Project Outline: 
1) Pre-processing of the data.
2) Build neural network with CNN
3) Build neural network with LSTM
4) Build neural network with LSTM and CNN
5) Use pre-trained GloVe word embeddings 
6) Use pre-trained Word2Vec word embeddings -- Pending
7) Visualization of the embeddings -- Pending
"""

'\nDate : 14/06/2018\nTime : 10:35\nVersion : 1.1\nDataset : IMDB dataset(https://www.kaggle.com/c/word2vec-nlp-tutorial/data)\n          Glove 6B(https://nlp.stanford.edu/projects/glove/)\nDescription : Performing Text Classification using Keras for different deep neural architectures.\n\nProject Outline: \n1) Pre-processing of the data.\n2) Build neural network with CNN\n3) Build neural network with LSTM\n4) Build neural network with LSTM and CNN\n5) Use pre-trained GloVe word embeddings\n6) Use pre-trained Word2Vec word embeddings\n'

In [43]:
# Installing dependencies
"""
! pip install numpy
! pip install pandas
! pip install nltk
! pip install keras
! pip install matplotlib
! pip install sklearn
! pip install plotly
! pip install gensim
"""

'\n! pip install numpy\n! pip install pandas\n! pip install nltk\n! pip install keras\n! pip install matplotlib\n! pip install sklearn\n! pip install plotly\n! pip install gensim\n'

In [91]:
# Importing libraries
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input, Merge
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical
from keras.models import Model,load_model

from gensim.models import Word2Vec
import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /home/vageesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Pre-processing of data

In [4]:
# Loading dataset
df = pd.read_hdf("dw.hdf5","text_df")[['text','y']]
# Dropping null values
df.dropna(inplace=True)
# Converting class labels to int dtype
df['y'] = df['y'].astype(int)
print("Shape of the dataset:",df.shape)

Shape of the dataset: (7814, 2)


In [5]:
# Cleaning the text data
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("german"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)  
    text = text.split()
    stemmer = SnowballStemmer('german')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [6]:
df['text'] = df['text'].map(lambda x: clean_text(x))
df.head()

Unnamed: 0,text,y
0,us-pr sident barack obama 250 zus tzlich solda...,1
1,us-pr sident barack obama montag seit kanzleri...,1
2,rechtspopulist fp bundespr sidentenwahl sterre...,1
3,deutschland zahl fl chtling maghreb-staat alge...,1
4,beid abgeschlag pr sidentschaftskandidat us-re...,1


In [7]:
# Because of the computational expenses, I am using the top 20000 unique words. 
# At first, the text is tokenized and then convert those into sequences. 
# I have kept 50 words to limit the number of words in each comment.
vocabulary_size = 20000

# Initializing Tokenizer from keras
tokenizer = Tokenizer(num_words= vocabulary_size)
# Fitting text on the tokenizer
tokenizer.fit_on_texts(df['text'])
# Converting text to sequence
sequences = tokenizer.texts_to_sequences(df['text'])

# Finding unique tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Padding sequences to the length of MAX_SEQUENCE_LENGTH
data = pad_sequences(sequences, maxlen=50)
labels = df['y']

Found 49565 unique tokens.


# Splitting the training and testing data

In [11]:
# Getting the labels and features data
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (7814, 50)
Shape of label tensor: (7814, 2)


In [82]:
# Getting the splitting index for training and testing data
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
validation_samples = int(SPLIT_RATIO * data.shape[0])

In [83]:
# Getting the testing and training dataset
X_train = data[:-validation_samples]
y_train = labels[:-validation_samples]
X_test = data[-validation_samples:]
y_test = labels[-validation_samples:]

# Text Classification using Using Convolutional neural network with multiple filter sizes(Accuracy = 98.16%)
# (https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)

The network starts with an embedding layer. The layer lets the system expand each token to a more massive vector, allowing the network to represent a word in a meaningful way. The layer takes 20000 as the first argument, which is the size of our vocabulary, and 100 as the second input parameter, which is the dimension of the embeddings. The third parameter is the input_length of 50, which is the length of each text sequence.

In [95]:
# defining the globals
max_input_length = 50
vocabulary_size = 20000
embedding_dim = 100

In [96]:
filter_sizes = (2,4,5,8)
dropout_prob = [0.4,0.5]

In [97]:
# Setting the Convolution layer
graph_in = Input(shape=(max_input_length, embedding_dim))
convs = []
avgs = []

for fsz in filter_sizes:
    conv = Conv1D(nb_filter=32,filter_length=fsz,border_mode='valid',activation='relu',subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length = max_input_length - fsz + 1)(conv)
    flattenMax = Flatten()(pool)
    convs.append(flattenMax)

if len(filter_sizes)>1:
    out = Merge(mode='concat')(convs)
else:
    out = convs[0]

graph = Model(input=graph_in, output=out, name="graphModel")

In [99]:
# Un-comment the below mentioned code to train your model

"""# Configuring the neural network
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=vocabulary_size, output_dim = embedding_dim,input_length = max_input_length,trainable=True))
model_cnn.add(Dropout(dropout_prob[0]))
model_cnn.add(graph)
model_cnn.add(Dense(128))
model_cnn.add(Dropout(dropout_prob[1]))
model_cnn.add(Activation('relu'))
model_cnn.add(Dense(2))
model_cnn.add(Activation('softmax'))
model_cnn.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
print("model fitting - CNN network")
model_cnn.summary()
# Training the model
model_cnn.fit(X_train,y_train,validation_data=(X_test, y_test),epochs=3)
# Saving the model
model_cnn.save("simple_cnn.h5")"""

model fitting - CNN network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 50, 100)           2000000   
_________________________________________________________________
dropout_23 (Dropout)         (None, 50, 100)           0         
_________________________________________________________________
graphModel (Model)           (None, 128)               60928     
_________________________________________________________________
dense_21 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_24 (Dropout)         (None, 128)               0         
_________________________________________________________________
activation_3 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 2)          

In [100]:
# Loading the pre-trained model
model_cnn = load_model("simple_cnn.h5")

In [101]:
# Checking the accuracy
accuracy_model_cnn = model_cnn.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (accuracy_model_cnn[1]*100))

Accuracy: 98.16%


In [102]:
# Making predictions in terms of probabilities for each class
prediction_model_cnn = model_cnn.predict(X_test,batch_size=10,verbose=0)

In [103]:
prediction_model_cnn

array([[1.3606980e-07, 9.9999988e-01],
       [5.2298433e-03, 9.9477011e-01],
       [4.6578543e-07, 9.9999952e-01],
       ...,
       [4.6887048e-06, 9.9999535e-01],
       [1.6340357e-05, 9.9998367e-01],
       [7.2470340e-07, 9.9999928e-01]], dtype=float32)

# Text Classification Using LSTM (Accuracy = 97.34%)
# (https://arxiv.org/abs/1607.02501)

In [15]:
# Un-comment the below mentioned code to train your model

# Configuring the neural network
model_lstm  = Sequential()
model_lstm.add(Embedding(vocabulary_size, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(2, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("model fitting - LSTM network")
model_lstm.summary()
# Training the model
model_lstm.fit(X_train,y_train,validation_data=(X_test, y_test),epochs=3)
# Saving the model
model_lstm.save("simple_lstm.h5")

model fitting - simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 2,080,602
Trainable params: 2,080,602
Non-trainable params: 0
_________________________________________________________________
Train on 5861 samples, validate on 1953 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
# Loading the pre-trained model
model_lstm = load_model("simple_lstm.h5")

In [17]:
# Checking the accuracy
accuracy_model_lstm = model_lstm.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (accuracy_model_lstm[1]*100))

Accuracy: 97.34%


In [18]:
# Making predictions in terms of probabilities for each class
prediction_simple_cnn = model_lstm.predict(X_test,batch_size=10,verbose=0)

In [19]:
prediction_simple_cnn

array([[4.2485970e-04, 9.9957520e-01],
       [9.0288097e-04, 9.9909711e-01],
       [6.0170150e-04, 9.9939835e-01],
       ...,
       [6.4834143e-04, 9.9935168e-01],
       [2.6833671e-04, 9.9973160e-01],
       [5.4877624e-04, 9.9945122e-01]], dtype=float32)

# Text classification using convolutional layer on top of the LSTM layer(Accuracy = 97.90%)
# (https://arxiv.org/pdf/1511.08630.pdf)

In [None]:
"""
Adding an one-dimensional CNN and max pooling layers after the Embedding layer which is then feed the 
consolidated features to the LSTM unit(to speed up the training proccess)
"""

In [25]:
# Un-comment the below mentioned code to train your model

# Configuring the neural network
"""model_conv = Sequential()
model_conv.add(Embedding(vocabulary_size, 100, input_length=50))
model_conv.add(Dropout(0.2))
model_conv.add(Conv1D(64, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(LSTM(100))
model_conv.add(Dense(2, activation='softmax'))
model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("model fitting - CNN-LSTM convolutional neural network")
model_lstm.summary()
# Training the model
model_conv.fit(X_train, y_train,validation_data=(X_test,y_test),epochs=3)
# Saving the model
model_conv.save("lstm_cnn.h5")"""

model fitting - CNN-LSTM convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 2,080,602
Trainable params: 2,080,602
Non-trainable params: 0
_________________________________________________________________
Train on 5861 samples, validate on 1953 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Loading the pre-trained model
model_conv = load_model("lstm_cnn.h5")

In [33]:
# Checking the accuracy
accuracy_model_conv = model_conv.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (accuracy_model_conv[1]*100))

Accuracy: 97.90%


In [34]:
# Making predictions in terms of probabilities for each class
prediction_model_conv = model_conv.predict(X_test,batch_size=10,verbose=0)

In [35]:
prediction_model_conv

array([[2.7232425e-05, 9.9997282e-01],
       [2.8722723e-05, 9.9997127e-01],
       [4.4859324e-05, 9.9995518e-01],
       ...,
       [2.2324391e-05, 9.9997771e-01],
       [1.6313435e-05, 9.9998367e-01],
       [1.5929132e-04, 9.9984074e-01]], dtype=float32)

# Using pre-trained Glove word embeddings

In [30]:
# Get embeddings from Glove
embeddings_index = dict()
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [31]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

# Using architecture having convolutional layer on top of the LSTM layer along with the globe word embeddings(Accuracy = 93.96%)

In [41]:
# Un-comment the below mentioned code to train your model

# Configuring the neural network
"""model_glove = Sequential()
model_glove.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(2, activation='softmax'))
model_glove.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("model fitting - CNN-LSTM convolutional neural network with globe word embeddings")
model_glove.summary()
# Training the model
model_glove.fit(X_train, y_train,validation_data=(X_test,y_test),epochs=3)
# Saving the model
model_glove.save("lstm_cnn_globe.h5")"""

'model_glove = Sequential()\nmodel_glove.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))\nmodel_glove.add(Dropout(0.2))\nmodel_glove.add(Conv1D(64, 5, activation=\'relu\'))\nmodel_glove.add(MaxPooling1D(pool_size=4))\nmodel_glove.add(LSTM(100))\nmodel_glove.add(Dense(2, activation=\'softmax\'))\nmodel_glove.compile(loss=\'categorical_crossentropy\', optimizer=\'adam\', metrics=[\'accuracy\'])\nprint("model fitting - CNN-LSTM convolutional neural network with globe word embeddings")\nmodel_glove.summary()\n# Training the model\nmodel_glove.fit(X_train, y_train,validation_data=(X_test,y_test),epochs=3)\n# Saving the model\nmodel_glove.save("lstm_cnn_globe.h5")'

In [37]:
# Loading the pre-trained model
model_glove_lstm_cnn = load_model("lstm_cnn_globe.h5")

In [38]:
# Checking the accuracy
accuracy_model_glove_lstm_cnn = model_glove_lstm_cnn.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (accuracy_model_glove[1]*100))

Accuracy: 93.96%


In [39]:
# Making predictions in terms of probabilities for each class
prediction_model_glove_lstm_cnn = model_glove_lstm_cnn.predict(X_test,batch_size=10,verbose=0)

In [40]:
prediction_model_glove_lstm_cnn

array([[3.2281414e-02, 9.6771866e-01],
       [2.1942887e-01, 7.8057110e-01],
       [1.7427020e-01, 8.2572979e-01],
       ...,
       [8.8800455e-04, 9.9911195e-01],
       [1.7128917e-03, 9.9828714e-01],
       [1.6644618e-02, 9.8335534e-01]], dtype=float32)

# Word embedding visialization

In [104]:
# Get embedding weights
cnn_embds = model_cnn.layers[0].get_weights()[0]
lstm_embds = model_lstm.layers[0].get_weights()[0]
conv_embds = model_conv.layers[0].get_weights()[0]
glove_emds = model_glove.layers[0].get_weights()[0]

In [48]:
# Creating word list
word_list = []
for word, i in tokenizer.word_index.items():
    word_list.append(word)

In [50]:
# Scatter plot of first two components of TSNE(t-distributed stochastic neighbor embedding)
def plot_words(data, start, stop, step):
    trace = go.Scatter(
        x = data[start:stop:step,0], 
        y = data[start:stop:step, 1],
        mode = 'markers',
        text= word_list[start:stop:step]
    )
    layout = dict(title= 't-SNE 1 vs t-SNE 2',
                  yaxis = dict(title='t-SNE 2'),
                  xaxis = dict(title='t-SNE 1'),
                  hovermode= 'closest')
    fig = dict(data = [trace], layout= layout)
    py.iplot(fig)

In [105]:
# CNN
cnn_tsne_embds = TSNE(n_components=2).fit_transform(cnn_embds)
plot_words(cnn_tsne_embds, 0, 2000, 1)

In [68]:
# LSTM
lstm_tsne_embds = TSNE(n_components=2).fit_transform(lstm_embds)
plot_words(lstm_tsne_embds, 0, 2000, 1)

In [69]:
# CNN + LSTM
conv_tsne_embds = TSNE(n_components=2).fit_transform(conv_embds)
plot_words(conv_tsne_embds, 0, 2000, 1)

In [70]:
# CNN + LSTM + GLOVE
glove_tsne_embds = TSNE(n_components=2).fit_transform(glove_emds)
plot_words(glove_tsne_embds, 0, 2000, 1)