In [4]:
# Import libraries
import re

import gensim
import gensim.downloader as api
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Flatten, Dense, Input, Embedding, Activation, Dropout, concatenate, LSTM, Bidirectional
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [5]:
# Read the review dataset and keep a random subset of at most 400,000 rows
df = pd.read_csv('dataset.csv')
# random_state pins the subset so every later split and metric is repeatable
# after a kernel restart; min() keeps sample() from raising a ValueError when
# the CSV holds fewer rows than requested.
df = df.sample(n=min(400000, len(df)), random_state=42)

In [6]:
# Split the sampled rows in half: the first half (`df_wv_model`) is the corpus
# for the custom Word2Vec model, the second half is kept as `df` for the
# sentiment classifier.
# NOTE(review): `df` is rebound here, so re-running this cell on its own halves
# the classifier data again.
df_wv_model, df = train_test_split(
    df,
    test_size=0.5,
    random_state=42,
)

In [7]:
# Coerce every review in the Word2Vec half to a string, then run gensim's
# simple_preprocess over each one to get a list of tokens per review.
df_wv_model['review_text'] = df_wv_model['review_text'].values.astype('str')
review_text = df_wv_model['review_text'].apply(gensim.utils.simple_preprocess)

In [8]:
# Building a Word2Vec model using gensim.
# No corpus is passed to the constructor, so nothing is trained here; the
# vocabulary is built and the model trained by the explicit build_vocab() and
# train() calls that follow.
# The vector size is not set, so gensim's default applies; the embedding
# matrices built later allocate 100 columns per word and rely on that default.
model = gensim.models.Word2Vec(
    # context window size handed to Word2Vec
    window = 10,
    # minimum word frequency handed to Word2Vec
    min_count = 2,
    # number of worker threads handed to Word2Vec
    workers = 4
)

In [9]:
# Building the vocabulary from the Word2Vec half of the data.
# At this point `review_text` is the Series of token lists produced by
# gensim.utils.simple_preprocess, not the raw review strings.
model.build_vocab(review_text)

In [10]:
# Training the model on the same token lists used to build the vocabulary.
# Both total_examples and epochs are read back from the model object itself,
# so they stay in step with what build_vocab() and the constructor recorded.
# The call returns a tuple, which is what the notebook shows as this cell's output.
model.train(review_text, total_examples = model.corpus_count, epochs = model.epochs)

(38891099, 52286355)

In [10]:
# Tokenizing the classifier half of the data (the rows that were NOT used to
# train the custom Word2Vec model) into fixed-length integer sequences.
max_words = 40000            # vocabulary cap applied when texts are encoded
max_sequence_length = 100    # every review is padded/truncated to this length

# Coerce every review to a string so the tokenizer never receives NaN/float values
df['review_text'] = df['review_text'].values.astype('str')
review_text = df['review_text']

# `num_words` is the supported name for the vocabulary cap; the legacy
# `nb_words` spelling is only honoured through a deprecation warning shim.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(review_text)
sequences = tokenizer.texts_to_sequences(review_text)
review_text_data = pad_sequences(sequences, maxlen=max_sequence_length)



In [16]:
# Hold out 20% of the encoded reviews for evaluation (fixed seed for repeatability).
# NOTE(review): the classifiers built later use a sigmoid output with binary
# cross-entropy, which expects 0/1 targets — confirm `review_score` is coded 0/1.
y = df['review_score']
X_train, X_test, y_train, y_test = train_test_split(
    review_text_data,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Create an embedding matrix for the tokenizer vocabulary from the custom
# Word2Vec model: row i holds the vector of the word whose tokenizer index is i.
# (The earlier `tokenizer.sequences_to_texts(review_text)` call was dropped: it
# was fed raw strings instead of integer sequences and its result was unused.)

# +1 because tokenizer indices start at 1; row 0 is left for the padding value
vocab_size = len(tokenizer.word_index) + 1

# 100 columns to match the Word2Vec vectors and the Embedding layer width
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    # Words the Word2Vec model never learned keep an all-zero row
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]


In [43]:
# Creating the classifier initialised with the custom Word2Vec embeddings.
# Functional-API graph: token ids -> embedding lookup -> flatten -> two ReLU
# dense layers -> one sigmoid unit.
# (No throw-away Sequential() instance: the model is built once, by Model(...).)

# trainable=True lets the pre-trained vectors be fine-tuned while fitting
Embed_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=True)

review_input = Input(shape=(max_sequence_length,), dtype='int32', name = 'review_input')
review_embedding = Embed_layer(review_input)
Flatten_layer = Flatten()
review_flatten = Flatten_layer(review_embedding)
output_size = 1  # width of the final sigmoid layer

dense1 = Dense(100, activation='relu')(review_flatten)
dense2 = Dense(32, activation='relu')(dense1)
predict = Dense(output_size, activation = 'sigmoid')(dense2)

sentiment_model = Model(inputs = [review_input], outputs = [predict])
sentiment_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
sentiment_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 review_input (InputLayer)   [(None, 100)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 100, 100)          13031100  
                                                                 
 flatten_2 (Flatten)         (None, 10000)             0         
                                                                 
 dense_6 (Dense)             (None, 100)               1000100   
                                                                 
 dense_7 (Dense)             (None, 32)                3232      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 14,034,465
Trainable params: 14,034,465
Non-t

In [44]:
# Fitting the Word2Vec-initialised model: 3 epochs, mini-batches of 32.
# NOTE(review): the held-out test split is passed as validation_data, so the
# per-epoch validation metrics are computed on the same rows that serve as the
# test set — there is no separate, untouched evaluation set in this notebook.
# The call is the cell's last expression, so its return value is displayed.
sentiment_model.fit(X_train, y_train, epochs = 3, batch_size = 32, validation_data = (X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c9cf2e7110>

In [2]:
# Loading the pre-trained GloVe word embeddings through gensim's downloader
# (`api` is gensim.downloader, imported with the other libraries at the top of
# the notebook so a fresh Restart & Run All resolves it before this cell).
# The requested model name is the 100-dimensional Wikipedia + Gigaword set,
# which matches the 100-column embedding matrices used by the classifiers.
embeddings = api.load("glove-wiki-gigaword-100")

In [45]:
# Creating an embedding matrix using the GloVe model: row i holds the GloVe
# vector of the word whose tokenizer index is i.

embedding_matrix_glove = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    # Words without a GloVe vector keep an all-zero row
    if word in embeddings:
        # Fill the GloVe matrix itself; writing into `embedding_matrix` would
        # leave this matrix all zeros and overwrite the Word2Vec one.
        embedding_matrix_glove[i] = embeddings[word]

In [46]:
# Creating the classifier initialised with the GloVe embedding matrix.
# Same architecture as the Word2Vec-based classifier so the two embedding
# sources can be compared: token ids -> embedding lookup -> flatten -> two ReLU
# dense layers -> one sigmoid unit.
# (No throw-away Sequential() instance: the model is built once, by Model(...).)

# trainable=True lets the pre-trained vectors be fine-tuned while fitting
Embed_layer = Embedding(vocab_size, 100, weights=[embedding_matrix_glove], input_length=max_sequence_length, trainable=True)

review_input = Input(shape=(max_sequence_length,), dtype='int32', name = 'review_input')
review_embedding = Embed_layer(review_input)
Flatten_layer = Flatten()
review_flatten = Flatten_layer(review_embedding)
output_size = 1  # width of the final sigmoid layer

dense1 = Dense(100, activation='relu')(review_flatten)
dense2 = Dense(32, activation='relu')(dense1)
predict = Dense(output_size, activation = 'sigmoid')(dense2)

sentiment_model_glove = Model(inputs = [review_input], outputs = [predict])
sentiment_model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
sentiment_model_glove.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 review_input (InputLayer)   [(None, 100)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 100, 100)          13031100  
                                                                 
 flatten_3 (Flatten)         (None, 10000)             0         
                                                                 
 dense_9 (Dense)             (None, 100)               1000100   
                                                                 
 dense_10 (Dense)            (None, 32)                3232      
                                                                 
 dense_11 (Dense)            (None, 1)                 33        
                                                                 
Total params: 14,034,465
Trainable params: 14,034,465
Non-t

In [47]:
# Fitting the GloVe-initialised model with the same settings as the
# Word2Vec-initialised one (3 epochs, batch size 32, same train/test arrays),
# so the two runs differ only in the initial embedding weights.
# NOTE(review): as with the first model, the test split is used as
# validation_data, so validation accuracy is the only held-out metric reported.
sentiment_model_glove.fit(X_train, y_train, epochs = 3, batch_size = 32, validation_data = (X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c9d8acbc50>