## Practical 4. Text Classification with LSTM Neural Network
### Strictly for internal use at Singapore Polytechnic. Do not disclose!

In this notebook we will demonstrate different text classification models trained using the sentiment and emotion dataset.

In [1]:
#Make the necessary imports
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

### Data Loading

In [2]:
# Read the training CSV into a DataFrame, report its (rows, columns) shape,
# and preview the first five rows as the cell output.
path = "data/Sentiment and Emotion in Text/train_data.csv"
data = pd.read_csv(filepath_or_buffer=path)
print(data.shape)
data.head(n=5)

(30000, 2)


Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [3]:
# Keep only two emotion classes and encode them as integers (binary task).
# NOTE: this cell rebinds `data`; re-running it without re-running the load
# cell first yields an empty frame, because the labels are already integers.
shortlist = ['sadness', "happiness"]
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
data = data[data['sentiment'].isin(shortlist)].copy()
labels_index = {'sadness':0, 'happiness':1}
data['sentiment'] = data['sentiment'].map(labels_index)

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# Both splits are seeded so the partitions (and the vocabulary later fitted on
# the training split) are reproducible after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(data['content'], data['sentiment'], test_size=0.2, random_state=1234)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1234)

In [4]:
# Report how many samples ended up in each split (one line per split).
print(
    f'Training sample size: {len(X_train)}',
    f'Validation sample size: {len(X_val)}',
    f'Testing sample size: {len(X_test)}',
    sep='\n',
)

Training sample size: 5000
Validation sample size: 1251
Testing sample size: 1563


### Text pre-processing

In [5]:
# Hyper-parameters shared by the preprocessing and embedding cells below.

# Length every tokenized text is padded/truncated to before entering the network.
# NOTE(review): the preview rows are short tweets, so 1000 looks generous — confirm.
MAX_SEQUENCE_LENGTH = 1_000

# Vocabulary cap handed to the Keras Tokenizer (num_words).
MAX_NUM_WORDS = 20_000

# Width of each word vector; must match the GloVe file that is loaded (50d).
EMBEDDING_DIM = 50

In [6]:
# Turn each text into a list of integer word indexes with the Keras Tokenizer.
# The vocabulary is learned from the training split only; the same fitted
# tokenizer is then applied to the training, validation and test texts.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 10944 unique tokens.


In [7]:
# Pad every index sequence with 0s up to MAX_SEQUENCE_LENGTH so that all
# samples have the same length and can be fed to the network.
X_train, X_val, X_test = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, val_sequences, test_sequences)
)
# Convert the integer class labels of each split to one-hot rows.
Y_train, Y_val, Y_test = (
    to_categorical(np.asarray(labels)) for labels in (Y_train, Y_val, Y_test)
)

In [8]:
print('Preparing embedding matrix.')

# Parse the GloVe text file into a dict: word -> float32 vector.
# Each line is whitespace-separated: the word first, then its coefficients.
embeddings_index = {}
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.


In [9]:
# Sanity check: display the vector that was loaded from the GloVe file for the word "king".
print(embeddings_index["king"])

[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]


In [10]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i; each column is one embedding dimension.
# The +1 presumably reserves row 0 for the padding index — confirm.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in word_index.items():
    # Indexes beyond the vocabulary cap have no row in the matrix; skip them.
    if idx <= MAX_NUM_WORDS:
        glove_vector = embeddings_index.get(token)
        # Words absent from GloVe keep their all-zero row.
        if glove_vector is not None:
            embedding_matrix[idx] = glove_vector

# Wrap the pre-trained vectors in an Embedding layer.
# trainable=False keeps the embeddings fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
print("Preparing of embedding matrix is done")

Preparing of embedding matrix is done


In [20]:
# Read the output dimension back from the embedding layer and print it;
# the layer was constructed with EMBEDDING_DIM as its output size.
embedding_dimension = embedding_layer.output_dim
print(f'Embedding Dimension: {embedding_dimension}')

Embedding Dimension: 50


## LSTM Model with pre-trained embedding

In [24]:
# Please fill your code here


## LSTM Model with training our own embedding

In [None]:
# Please fill your code here

## Bidirectional LSTM Model

In [25]:
# Please fill your code here
