# AI Apprentice Lab 8 Solution
#### Sentiment Classification for text

1. Apply Stemmer
2. Tokenize and retokenize
3. Pad sequences
4. Separate training and testing data
5. Define Bidirectional LSTM model
6. Train the LSTMmodel on training data
7. Evaluate results


In [1]:
#      IMPORT REQUIRED LIBRARIES
import pandas
import numpy as np
import sklearn
import glob
import os

#       LOAD DATA INTO PYTHON
datalist = []
labellist = []
train_path = "Data/aclImdb/train/"
test_path = "Data/aclImdb/test/"

for subdir, dir, files in os.walk(train_path):
    if subdir == train_path:
        continue
    elif subdir == "Data/aclImdb/train/neg":
        for text_path in glob.glob(subdir + "/*.txt"):
            text_file = open(text_path, "r")
            words = text_file.read().lower()
            words = words.replace(',','')
            words = words.replace('.','')
            words = words.replace('?','')
            words = words.replace('!','')
            datalist.append(words)
            labellist.append([0])
    elif subdir == "Data/aclImdb/train/pos":
        for text_path in glob.glob(subdir + "/*.txt"):
            text_file = open(text_path, "r")
            words = text_file.read().lower()
            words = words.replace(',','')
            words = words.replace('.','')
            words = words.replace('?','')
            words = words.replace('!','')
            datalist.append(words)
            labellist.append([1])
datalist = pandas.DataFrame(datalist)
labellist = pandas.DataFrame(labellist)

#### Let's now load all the necessary objects from Keras in advance

In [2]:
#     LOAD REQUIRED AI LIBRARIES OBJECTS
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,  Dropout, Flatten
from keras.layers import LSTM, Conv1D, Input, MaxPooling1D, Bidirectional
from keras.layers.embeddings import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Here we apply the stemmer to our review data

In [3]:
#Stem
stemmer = SnowballStemmer("english")
#TODO: Apply the Stemmer to datalist sentences
for i in (datalist):
    datalist[i] = datalist[i].apply(lambda x: " ".join(stemmer.stem(p) for p in x.split(" ") if not x.isdigit()))

#### Now we need to tokenize the words from the dataset and return numeric sequences that can be fed into a neural network

In [4]:
#Tokenize words
max_nb_words = 100000
tokenizer = Tokenizer(num_words=max_nb_words)
#TODO: Apply Tokenizer to datalist stems
tokenizer.fit_on_texts(datalist[0])
sequences = tokenizer.texts_to_sequences(datalist[0])
# Retokenize
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
max_nb_words = len(word_index)
#TODO: Retokenize with new amount of words
tokenizer = Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(datalist[0])
sequences = tokenizer.texts_to_sequences(datalist[0])
word_index = tokenizer.word_index
#TODO: Pad obtained Sequences according to maximum post length
max_post_len = np.max([ len(x) for x in sequences])
sequences = sequence.pad_sequences(sequences, maxlen=max_post_len) 

Found 74034 unique tokens.


#### The data is now ready, so we can split it and create the Bidirectional LSTM model|

In [5]:
#TODO: Split Training and Testing data
X_train, X_test, y_train, y_test = train_test_split(sequences, labellist, test_size=0.25, stratify=labellist, random_state=42)

In [6]:
## LSTM Sequential Model

#Parameters
batch_size = 128
epochs = 3
embedding_vecor_length = 32
lstm_size = 16


#Model
model = Sequential()
#TODO: Add Embedding layer with input size of sequence followed by a Dropout layer
model.add(Embedding(max_nb_words, embedding_vecor_length, input_length=max_post_len))
model.add(Dropout(0.25))
#TODO: Add a Bidirectional LSTM layer followed by a Dropout layer
model.add(Bidirectional(LSTM(lstm_size)))
model.add(Dropout(0.25))
#TODO: Add a Dense layer with a single output neuron and sigmoid activation for binary prediction
model.add(Dense(1, activation='sigmoid'))
##############################################################################################################
#################---------            COMPILE THE MODEL         --------######################################
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


#TODO: Train the Model on training data
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split = 0.1);

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2493, 32)          2369088   
_________________________________________________________________
dropout_1 (Dropout)          (None, 2493, 32)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32)                6272      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,375,393
Trainable params: 2,375,393
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16875 samples, validate on 1875 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


#### Once the model is trained we can evaluate on new data it never used for validation

In [7]:
#TODO: Evaluate the model
scores_ts = model.evaluate(X_test, y_test, verbose=1)
print(scores_ts[1])

0.8734400272369385


*Created by Nicholas Stepanov: https://github.com/renowator*