# **A C-LSTM Neural Network for Text Classification**

Convolutional neural networks (CNNs)
and recurrent neural networks (RNNs) are two
mainstream architectures for sentence and document
modeling tasks, and they adopt totally different ways of
understanding natural language.

In this work,
we combine the strengths of both architectures
and propose a novel and unified model called
C-LSTM for sentence representation and text
classification. 

C-LSTM uses a CNN to extract a sequence of higher-level phrase representations, which are then fed into a long short-term
memory recurrent neural network (LSTM) to
obtain the sentence representation.

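C-LSTM
is able to capture both local features of phrases
and global and temporal sentence semantics. The paper evaluates the proposed architecture on sentiment classification tasks; this notebook applies it to classifying news articles into five categories (business, entertainment, politics, sport, tech).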

# Reference

Zhou et al., "A C-LSTM Neural Network for Text Classification", arXiv:1511.08630 [cs.CL], 2015.

In [1]:
# Colab only: mount Google Drive so the files under /content/drive/MyDrive
# (training CSV, pretrained word vectors) can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import keras
from keras.models import Sequential
import keras.layers
from keras.layers import Dense, Embedding, LSTM, GRU, Input, Reshape, Concatenate, Permute, Activation, multiply, Lambda, Conv2D, Bidirectional, Flatten
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras import Model
import keras.utils.np_utils
import tensorflow as tf
from keras import Model
import tensorflow.keras.backend as K

In [3]:
# Load the training set from the mounted Drive folder into a DataFrame.
data = pd.read_csv("/content/drive/MyDrive/IISc_Assignment/DLNLP_A4/TrainData.csv")

In [4]:
# Preview the first rows; the columns used later are 'Text' and 'Category'.
data.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [5]:
# Look at the raw text of the first article before any preprocessing.
data['Text'][0]



In [6]:
# Number of rows (articles) in the training set.
len(data)

1490

In [4]:
# Category name of every article, as a plain Python list.
category = data['Category'].tolist()

In [8]:
# List the distinct category names present in the data.
np.unique(category)

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype='<U13')

In [5]:
# Map each category name to an integer class id, in alphabetical order.
d = {name: class_id for class_id, name in enumerate(['business', 'entertainment', 'politics', 'sport', 'tech'])}

In [6]:
# Integer class id for every article, looked up through the name -> id mapping `d`.
labels = [d[name] for name in category]

In [7]:
# Integer class ids as a NumPy array; this is the target passed to model.fit().
train_target = np.asarray(labels)

In [11]:
# One-hot encoding is left disabled: the models below are compiled with
# SparseCategoricalCrossentropy, which consumes the integer labels in train_target directly.
# NOTE(review): the "(1490, 5)" output shown for this cell is presumably left over from a run
# made while these lines were still enabled.
# train_target = keras.utils.np_utils.to_categorical(labels)
# train_target.shape

(1490, 5)

In [8]:
# Characters removed from every article: ASCII punctuation and digits.
# Written as a raw string so the backslash is a literal character; in a normal
# string "\," is an invalid escape sequence and triggers a warning on recent Pythons.
punctuations = r'''!()-[]{};:'"\,<>./?@#%^&*_~0123456789'''

# Build the deletion table once; str.translate then strips every listed character
# in a single pass per article instead of one full-string replace() per occurrence.
_strip_table = str.maketrans('', '', punctuations)

# Articles with punctuation and digits removed (characters are deleted, not replaced by spaces).
pre_data = [sentence.translate(_strip_table) for sentence in data['Text']]

In [9]:
# Fetch the WordNet corpus used by the WordNetLemmatizer in the next cell.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
lemmatizer = WordNetLemmatizer()

# Lemmatize every whitespace-separated token of each article and
# re-join the tokens with single spaces.
lematize_data = [
  " ".join(lemmatizer.lemmatize(token) for token in document.split())
  for document in pre_data
]

In [15]:
# Spot-check the first article after punctuation removal and lemmatization.
lematize_data[0]



In [11]:
# Fetch the NLTK stop-word lists used by the filtering step in the next cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Keep only tokens that are not stop words and are longer than two characters.
final_data = [
    " ".join(token for token in document.split() if token not in stop_words and len(token) > 2)
    for document in lematize_data
]

In [18]:
# Spot-check the first article after stop-word and short-token filtering.
final_data[0]



In [13]:
# Token count of the longest cleaned article.
max_len = max(len(document.split()) for document in final_data)

In [20]:
# Display the longest article length, in tokens.
max_len

1604

In [21]:
# Token count of every cleaned article.
len_list = [len(tokens) for tokens in map(str.split, final_data)]

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the models; this replaces the
# longest-article value computed in an earlier cell.
max_len = 500

# Learn the vocabulary from the cleaned articles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data)
train_word_index = tokenizer.word_index
print("number of unique tokens = ", len(train_word_index))

# Encode each article as a list of word indices, then bring every
# sequence to exactly max_len entries (padding is added at the end).
sequences = tokenizer.texts_to_sequences(final_data)
train_padded_sequences = pad_sequences(sequences, maxlen = max_len, padding='post')

number of unique tokens =  22479


In [None]:
# Inspect the padded index sequence of the first article.
train_padded_sequences[0]

In [24]:
# Confirm the tokenizer's word index is a plain dict (word -> integer index).
type(train_word_index)

dict

In [15]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format) from Drive.
from gensim.models import Word2Vec, KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format("/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/GoogleNews-vectors-negative300.bin", binary=True)

In [16]:
# Build the embedding matrix aligned with the tokenizer: row i holds the vector
# of the word whose tokenizer index is i.
# One extra row is allocated (+1); rows that no word index maps to stay all-zero.
unique_words = len(train_word_index) + 1
w2v_embedding_dim = 300
w2v_embedding = np.zeros((unique_words, w2v_embedding_dim))

# Number of vocabulary words missing from the pretrained vectors.
count = 0
for word, i in train_word_index.items():
  # `in` and `[]` on KeyedVectors work on both gensim 3.x and 4.x,
  # unlike the `.vocab` / `.wv` attributes.
  if word in w2v_model:
    w2v_embedding[i] = w2v_model[word]
  else:
    # Out-of-vocabulary word: use a small random vector instead.
    w2v_embedding[i] = np.random.uniform(-0.25, 0.25, w2v_embedding_dim)
    count += 1


# **CNN-LSTM Model**

In [17]:
# C-LSTM: a convolution over k-token windows yields a sequence of phrase features,
# which a bidirectional LSTM condenses into a single vector for classification.
classes = 5   # number of target categories
filters = 16  # feature maps produced by the convolution
k=5           # convolution window length, in tokens


# (batch, max_len) integer word indices
model_input = Input(shape = (max_len,))

# Frozen lookup initialised from the pretrained word2vec matrix -> (batch, max_len, w2v_embedding_dim).
# An identical Embedding layer used to be instantiated just before this one and then
# discarded; that dead statement has been dropped.
embedding = Embedding(unique_words, output_dim = w2v_embedding_dim, weights = [w2v_embedding], trainable = False)(model_input)

# Add a trailing channel axis so Conv2D can be applied -> (batch, max_len, w2v_embedding_dim, 1)
embedding_resize = tf.expand_dims(embedding, -1)

# The kernel spans the full embedding width, so the width axis collapses to 1
# -> (batch, max_len - k + 1, 1, filters)
conv_out = Conv2D(filters, (k, w2v_embedding_dim), activation="relu")(embedding_resize)

# Drop the size-1 width axis -> (batch, max_len - k + 1, filters)
conv_out = tf.squeeze(conv_out, [2])

# Final forward and backward states concatenated -> (batch, 2 * w2v_embedding_dim)
lstm_out = Bidirectional(LSTM(units= w2v_embedding_dim, dropout=0.2))(conv_out)

# No activation: the model outputs raw logits, one per class.
model_out = Dense(classes )(lstm_out)

cnn_lstm_model = Model(model_input, model_out)

In [64]:
# Print the layer-by-layer output shapes and parameter counts of the C-LSTM model.
cnn_lstm_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 500, 300)          6744000   
_________________________________________________________________
tf.expand_dims_7 (TFOpLambda (None, 500, 300, 1)       0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 496, 1, 16)        24016     
_________________________________________________________________
tf.compat.v1.squeeze_7 (TFOp (None, 496, 16)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 600)               760800    
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 3005

In [20]:
# Sparse loss because train_target holds integer class ids; from_logits=True because
# the final Dense layer has no activation and therefore outputs raw logits.
cnn_lstm_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])

In [21]:
# Train for 10 epochs with batches of 64, holding out 20% of the samples for validation.
cnn_lstm_model.fit(train_padded_sequences, train_target, batch_size=64, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f118cfd8790>

# **CNN-LSTM Model with attention**

In [24]:
# Imported from the public keras.layers namespace (the same one the notebook's other
# layers come from) rather than the private keras.layers.core module path.
from keras.layers import RepeatVector

# C-LSTM with attention: same convolution + bidirectional LSTM front end, but the LSTM
# returns its output at every time step and those outputs are combined by a learned
# weighted sum instead of keeping only the final states.
classes = 5   # number of target categories
filters = 16  # feature maps produced by the convolution
k=5           # convolution window length, in tokens


# (batch, max_len) integer word indices
model_input = Input(shape = (max_len,))

# Frozen lookup initialised from the pretrained word2vec matrix -> (batch, max_len, w2v_embedding_dim).
# An identical Embedding layer used to be instantiated just before this one and then
# discarded; that dead statement has been dropped.
embedding = Embedding(unique_words, output_dim = w2v_embedding_dim, weights = [w2v_embedding], trainable = False)(model_input)

# Add a trailing channel axis so Conv2D can be applied -> (batch, max_len, w2v_embedding_dim, 1)
embedding_resize = tf.expand_dims(embedding, -1)

# The kernel spans the full embedding width -> (batch, max_len - k + 1, 1, filters)
conv_out = Conv2D(filters, (k, w2v_embedding_dim), activation="relu")(embedding_resize)

# Drop the size-1 width axis -> (batch, steps, filters), where steps = max_len - k + 1
conv_out = tf.squeeze(conv_out, [2])

# One output per time step -> (batch, steps, 2 * w2v_embedding_dim)
lstm_out = Bidirectional(LSTM(units= w2v_embedding_dim, dropout=0.2, return_sequences = True))(conv_out)

# One scalar score per time step -> (batch, steps, 1) -> (batch, steps)
attention = Dense(1)(lstm_out)

attention = Flatten()(attention)

# Normalise the scores over the time steps so they form attention weights that sum to 1.
# Without this step the "attention" is only an unnormalised linear gate.
attention = Activation('softmax')(attention)

# Broadcast each weight across the feature axis -> (batch, 2 * w2v_embedding_dim, steps)
attention = RepeatVector(w2v_embedding_dim * 2)(attention)

# Swap axes to line up with lstm_out -> (batch, steps, 2 * w2v_embedding_dim)
attention = Permute([2,1])(attention)

# Weight every time step's LSTM output by its attention weight.
attention_embedding = keras.layers.Multiply()([lstm_out, attention])

# Sum over the time axis -> (batch, 2 * w2v_embedding_dim)
attention_embedding = keras.layers.Lambda(lambda xin: K.sum(xin, axis = -2), output_shape = (w2v_embedding_dim * 2,) )(attention_embedding)

# No activation: the model outputs raw logits, one per class.
model_out = Dense(classes )(attention_embedding)

cnn_lstm_attention_model = Model(model_input, model_out)

In [25]:
# Print the layer-by-layer output shapes and parameter counts of the attention model.
cnn_lstm_attention_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 500, 300)     6744000     input_5[0][0]                    
__________________________________________________________________________________________________
tf.expand_dims_4 (TFOpLambda)   (None, 500, 300, 1)  0           embedding_9[0][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 496, 1, 16)   24016       tf.expand_dims_4[0][0]           
____________________________________________________________________________________________

In [26]:
# Sparse loss because train_target holds integer class ids; from_logits=True because
# the final Dense layer has no activation and therefore outputs raw logits.
cnn_lstm_attention_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])

In [27]:
# Train for 10 epochs with batches of 64, holding out 20% of the samples for validation.
cnn_lstm_attention_model.fit(train_padded_sequences, train_target, batch_size=64, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe0c5faabd0>