In [1]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf
import re

In [2]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, Activation
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence, hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

In [4]:
pd.set_option('display.max_colwidth', -1)
pd.options.display.max_rows = 999

##### Setting Google News Word2vec model path

In [None]:
google_model_path = '../../../LearnSpace/GoogleNews-vectors-negative300.bin.gz'

##### Loading Google News W2V model

In [None]:
google_w2v = gensim.models.KeyedVectors.load_word2vec_format(google_model_path, binary=True)

##### Regex way to remove stopwords (faster)

In [5]:
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('English')) + r')\b\s*')

#### Loading sentiment analysis dataframes

In [6]:
train_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')

In [7]:
test_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')

In [8]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [9]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine effort .
1,156062,8545,An intermittently pleasing but mostly routine effort
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


#### Getting the values from the dataframe

In [10]:
train_phrases = train_df['Phrase'].values
train_sentiment = train_df['Sentiment'].values

In [11]:
train_sentiment[:10]

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
test_phrases = test_df['Phrase'].values

### Checking number of labels

In [13]:
label_set = np.unique(train_sentiment)
labels = len(label_set)

In [14]:
labels

5

### Checking word tokenization with NLTK

#### Removing stop words

In [15]:
count = 0
for phrase in train_phrases:
    if count > 10:
        break
    count += 1
    phrase = pattern.sub('', phrase)
    print(word_tokenize(phrase))


['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', ',', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story', '.']
['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose']
['A', 'series']
['A']
['series']
['escapades', 'demonstrating', 'adage', 'good', 'goose']
[]
['escapades', 'demonstrating', 'adage', 'good', 'goose']
['escapades']
['demonstrating', 'adage', 'good', 'goose']
['demonstrating', 'adage']


### Checking Tokenization with Keras

##### Setting parameters

In [16]:
max_words = 10000

In [17]:
text_to_word_sequence("this is a test { }", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)

['this', 'is', 'a', 'test']

In [18]:
text_to_word_sequence("The cat is in the hat!!!")

['the', 'cat', 'is', 'in', 'the', 'hat']

In [19]:
sentence = []
count = 0 
for phrase in train_phrases:
    if count > 10:
        break
    count += 1
    phrase = pattern.sub('', phrase.lower())
    words = text_to_word_sequence(phrase, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
#     for word in words:

### Preprocessing part 2

##### Using the hashing trick

In [20]:
type(train_phrases)

numpy.ndarray

### Training data

In [30]:
def pre_process(ndarray):
    '''
    Pre-processing data using the hashing trick
    '''
    sentence = []
    sentence_lengths = []
    input = np.ndarray([])
    max_val = 0
    max_val_list = []
    count = 0 
    for phrase in ndarray:
#         if count > 10:
#             break
        count += 1
        phrase = pattern.sub('', phrase.lower())
        words = text_to_word_sequence(phrase, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
        words = list((map(lambda x: hashing_trick(x, n=1000000, hash_function='md5'), words)))
        hashed_arr = np.array([int(word) for lists in words for word in lists])
        hashed_arr = [int(x) for x in hashed_arr]
        sentence_lengths.append(len(hashed_arr))
        sentence.append(hashed_arr)
        if hashed_arr:
            max_val = max(hashed_arr)
            max_val_list.append(max_val)
    
    return sentence, max_val_list, sentence_lengths

In [31]:
sentence, max_val_list, sentence_lengths = pre_process(train_phrases)

In [32]:
seq_len = np.round((np.mean(sentence_lengths) + 5*np.std(sentence_lengths))).astype(int)
x_train = pad_sequences(sentence, maxlen=seq_len)

In [33]:
seq_len

22

In [34]:
y_train_class = to_categorical(label_set, num_classes=labels, dtype='float32')

### Testing Data

In [35]:
sentence, max_val_list, sentence_lengths = pre_process(train_phrases)

In [36]:
y_train = pad_sequences(sentence, maxlen=seq_len)

#### Input a sentence

In [None]:
test_np = np.vstack((google_w2v['hello'], google_w2v['how'], google_w2v['are'],google_w2v['you']))

In [None]:
# keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', \
#                       embeddings_regularizer=None, activity_regularizer=None, \
#                       embeddings_constraint=None, mask_zero=False, input_length=None)

#### Building the model

In [37]:
max_embed = max(max_val_list)
input_length = x_train.shape[1]

In [38]:
model = Sequential()
# model.add(Embedding(1000, 64, input_length=22))
model.add(Embedding(max_embed+1, 64, input_length=input_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='sigmoid'))
model.add(Dense(labels, activation='sigmoid'))
model.add(Activation('softmax'))

In [39]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 22, 64)            63999488  
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
_________________________________________________________________
activation (Activation)      (None, 5)                 0         
Total params: 64,082,693
Trainable params: 64,082,693
Non-trainable params: 0
____________________________________________

#### Freezing the embedding layer

In [40]:
model.layers[0].trainable = False

In [41]:
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [None]:
output_array = model.predict(x_train)

In [43]:
model.output_shape

(None, 5)

#### Training model

In [44]:
batch_size = 32

In [45]:
to_categorical(train_sentiment[:11]).shape

(11, 3)

In [None]:
print('Train...')
model.fit(x_train, to_categorical(train_sentiment),
          batch_size=batch_size,
          epochs=4)

Train...
Train on 156060 samples
Epoch 1/4

In [None]:
to_categorical(label_set, num_classes=labels, dtype='float32')