## Practical 3. Text Classification with Convolutional Neural Network
### Strictly used for internal purpose in Singapore Polytechnic. Do not disclose!

In this notebook we will demonstrate different text classification models trained using the Twitter dataset.

In [None]:
#Make the necessary imports
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, Sequential
from keras.initializers import Constant

### Data Loading

In [None]:
# Load the training CSV into a DataFrame, then show its (rows, columns) shape
# and the first few rows as a quick sanity check.
# The path is relative to the notebook's working directory.
path = "data/Sentiment and Emotion in Text/train_data.csv"
data = pd.read_csv(path)
print(data.shape)
data.head()

In [None]:
# Keep only two sentiment categories and drop the rest.
# labels_index is the single source of truth: its keys are the categories we keep,
# its values are the integer class ids used by the models below.
labels_index = {'sadness': 0, 'happiness': 1}
shortlist = list(labels_index)

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `data`; after the mapping the 'sentiment' column holds
# integers, so re-running the cell requires re-running the loading cell first.
data = data[data['sentiment'].isin(shortlist)].copy()
data['sentiment'] = data['sentiment'].map(labels_index)

# 80/20 train/test split, then 80/20 train/validation split of the remainder.
# Both splits are seeded so that every run sees the same partitions.
X_train, X_test, Y_train, Y_test = train_test_split(data['content'], data['sentiment'], test_size=0.2, random_state=1234)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1234)

In [None]:
# Report how many samples ended up in each split.
for split_name, split in (('Training', X_train), ('Validation', X_val), ('Testing', X_test)):
    print(f'{split_name} sample size: {len(split)}')

### Text pre-processing

In [None]:
# Pre-processing parameters shared by the tokenizer, the padding step and the embedding layers.
MAX_SEQUENCE = 500   # length (in tokens) every sequence is padded/truncated to via pad_sequences(maxlen=...)
MAX_WORDS = 10000   # vocabulary size cap: passed as num_words to the Tokenizer and as the Embedding input dimension

In [None]:
# Turn the raw texts into lists of integer word indexes with the Keras Tokenizer.
# The vocabulary is learned from the training texts only; the same fitted tokenizer
# is then applied to the training, validation and test texts.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train)

train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# word -> integer index mapping learned from the training texts
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Convert the index sequences into fixed-size inputs for the neural network:
# every sequence is padded with leading 0s until it has length MAX_SEQUENCE.
X_train, X_val, X_test = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE)
    for seqs in (train_sequences, val_sequences, test_sequences)
)

# One-hot encode the integer class labels to match the softmax output layer.
Y_train, Y_val, Y_test = (
    to_categorical(np.asarray(targets)) for targets in (Y_train, Y_val, Y_test)
)

## 1D CNN Model with our own trained embedding

In [None]:
# 1D CNN whose embedding layer is learned from scratch together with the rest of the network.
# Stack: embedding -> three Conv1D blocks (max-pooled in between, globally pooled at the end)
# -> dense hidden layer -> softmax over the sentiment classes.
cnnmodel = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_SEQUENCE),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=128, activation='relu'),
    Dense(units=len(labels_index), activation='softmax'),
])

# Adam with its hyper-parameters spelled out explicitly.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

# Labels are one-hot encoded, hence categorical cross-entropy.
cnnmodel.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

### Q&A: Why is Adam a popular choice for NLP problems?

In [None]:
# Print the layer-by-layer summary of the model defined above.
cnnmodel.summary()

In [None]:
%%time
#Train the model. Tune to validation set. 
cnnmodel.fit(X_train, Y_train,
          batch_size=128,
          epochs=10, validation_data=(X_val, Y_val))

#Evaluate on test set:
score, acc = cnnmodel.evaluate(X_test, Y_test)
print('Test accuracy with CNN:', acc)

## 1D CNN Model with pre-trained embedding

### Load pre-trained embedding matrix

In [None]:
print('Loading embedding matrix.')

# Build a lookup table from each word in the GloVe file to its embedding vector.
# Every line of the file is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # first field is the word, the remaining fields are its float coefficients
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print(f'Load {len(embeddings_index)} word vectors in Glove embeddings.')

In [None]:
# Sanity check: show the vector stored in embeddings_index for the word "family".
print(embeddings_index["family"])

In [None]:
# Prepare the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i.
EMBEDDING_DIM = 50  # dimension of the pre-trained word embedding (glove.6B.50d)

# Rows start as all-zeros, so words that are missing from embeddings_index
# (and the padding index 0) keep an all-zero vector.
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    # The matrix only has rows 0 .. MAX_WORDS-1, so index MAX_WORDS itself must be
    # skipped too (the previous `i > MAX_WORDS` check let it through and could raise
    # IndexError). Tokenizer(num_words=MAX_WORDS) never emits indexes >= MAX_WORDS,
    # so no word that can appear in the input sequences is lost here.
    if i >= MAX_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Load these pre-trained word embeddings into an Embedding layer.
# Note that we set trainable=False so as to keep the embeddings fixed during training.
embedding_layer = Embedding(MAX_WORDS,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE,
                            trainable=False)
print("Preparing of embedding matrix is done")

In [None]:
# 1D CNN on top of the frozen pre-trained embedding layer.
# Note: this rebinds `cnnmodel`, replacing the model trained in the previous section.
cnnmodel = Sequential([
    embedding_layer,
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=128, activation='relu'),
    Dense(units=len(labels_index), activation='softmax'),
])

# Adam with the library's default settings; labels are one-hot encoded.
cnnmodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Fit on the training split, monitoring the validation split after every epoch.
cnnmodel.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, Y_val),
)

# Final check on the held-out test split.
score, acc = cnnmodel.evaluate(x=X_test, y=Y_test)
print('Test accuracy with CNN:', acc)