In [20]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import SimpleRNN, Dense, Input, Embedding, Activation, Dropout, concatenate, Flatten
import re
import gensim

In [2]:
# read in csv
df = pd.read_csv('dataset.csv')
df = df.sample(400000)

In [3]:
# Splitting the data into Word2Vec training and testing data
df_wv_model, df = train_test_split(df, test_size=0.5, random_state=42)

In [4]:
# Converting all review text to string and preprocessing using gensim
df_wv_model['review_text'] = df_wv_model['review_text'].values.astype('str')
review_text = df_wv_model.review_text.apply(gensim.utils.simple_preprocess)

In [5]:
# Building a Word2Vec model using gensim
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2,
    workers = 4
)

In [6]:
# Building the vocabulary
model.build_vocab(review_text)

In [7]:
# Training the model
model.train(review_text, total_examples = model.corpus_count, epochs = model.epochs)

(39096015, 52539275)

In [8]:
# Tokenizing the data for training and testing on the word2vec custom model
max_words = 40000
max_sequence_length = 100

df['review_text'] = df['review_text'].values.astype('str')
review_text = df['review_text']

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(review_text)
sequences = tokenizer.texts_to_sequences(review_text)
review_text_data = pad_sequences(sequences, maxlen=max_sequence_length)

In [9]:
# Split into train and test data
y = df['review_score'].values.astype('float32')
X_train, X_test, y_train, y_test = train_test_split(review_text_data, y, test_size=0.2, random_state=42)

In [10]:
# Create an an embedding matrix using custom model for the tokenized text
vocab = tokenizer.sequences_to_texts(review_text)

vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items() :
    try : 
        embedding_vector = model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except KeyError:
        pass


In [23]:
# Creating the model
sentiment_model = Sequential()
Embed_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False)

review_input = Input(shape=(max_sequence_length,), dtype='int32', name = 'review_input')
review_embedding = Embed_layer(review_input)
rnn_layer = SimpleRNN(units=32)(review_embedding)
flatten_layer = Flatten()(rnn_layer)
dropout_layer = Dropout(0.2)(flatten_layer)
dense1 = Dense(100, activation='relu')(dropout_layer)
dense2 = Dense(32, activation='relu')(dense1)
predict = Dense(1, activation = 'sigmoid')(dense2)

sentiment_model = Model(inputs = [review_input], outputs = [predict])
sentiment_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(sentiment_model.summary())

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 review_input (InputLayer)   [(None, 100)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 100, 100)          13133900  
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 32)                4256      
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_12 (Dense)            (None, 100)               3300      
                                                                 
 dense_13 (Dense)            (None, 32)                3232

In [24]:
# Fitting the model
sentiment_model.fit(X_train, y_train, epochs = 3, batch_size = 32, validation_data = (X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x220fa2d4590>

In [16]:
# Downloading glove word embeddings
import gensim.downloader as api
embeddings = api.load("glove-wiki-gigaword-100")

In [17]:
# Creating an embedding matrix using the glove model

embedding_matrix_glove = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items() :
    try : 
        embedding_vector = embeddings[word]
        if embedding_vector is not None:
            embedding_matrix_glove[i] = embedding_vector
    except KeyError:
        pass

In [25]:
# Creating the model for glove embedding matrix
sentiment_model_glove = Sequential()
Embed_layer = Embedding(vocab_size, 100, weights=[embedding_matrix_glove], input_length=max_sequence_length, trainable=False)

review_input = Input(shape=(max_sequence_length,), dtype='int32', name = 'review_input')
review_embedding = Embed_layer(review_input)
rnn_layer = SimpleRNN(units=32)(review_embedding)
flatten_layer = Flatten()(rnn_layer)
dropout_layer = Dropout(0.2)(flatten_layer)
dense1 = Dense(100, activation='relu')(dropout_layer)
dense2 = Dense(32, activation='relu')(dense1)
predict = Dense(1, activation = 'sigmoid')(dense2)

sentiment_model_glove = Model(inputs = [review_input], outputs = [predict])
sentiment_model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(sentiment_model_glove.summary())

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 review_input (InputLayer)   [(None, 100)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 100, 100)          13133900  
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 32)                4256      
                                                                 
 flatten_1 (Flatten)         (None, 32)                0         
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                                 
 dense_15 (Dense)            (None, 100)               3300      
                                                                 
 dense_16 (Dense)            (None, 32)                3232

In [26]:
# Fitting the glove model
sentiment_model_glove.fit(X_train, y_train, epochs = 3, batch_size = 32, validation_data = (X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x220fa31f910>