In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Conv1D, MaxPooling1D, Embedding
from keras.layers.merge import concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.model_selection import train_test_split

from pickle import dump, load

from nltk.corpus import stopwords

from os import listdir

from string import punctuation

In [19]:
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

In [2]:
# check for GPU

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

SystemError: GPU device not found

# Filenames

In [2]:
attribute_name = "OutdoorSeating"
concat_filename = attribute_name+'_>49.csv'
review_filename = attribute_name+'_review.csv'

# Define Functions

In [3]:
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    tokens = ' '.join(tokens)
    return tokens.lower()

In [4]:
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
#     plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [5]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
 
# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])
 
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

# Business-level CNN

In [24]:
# read in dataset
res_concat = pd.read_csv(concat_filename)
print(res_concat.shape)

(3154, 3)


In [25]:
# clean 
cleaned_text = res_concat.text.apply(clean_doc)

# split into train and test
trainLines, testLines, y_train, y_test = train_test_split(
    cleaned_text, res_concat[attribute_name], test_size=0.2, random_state=0)

# preprocess and create X_train and X_test
tokenizer = create_tokenizer(trainLines)
length = max_length(trainLines)
vocab_size = len(tokenizer.word_index) + 1
# encode data
X_train = encode_text(tokenizer, trainLines, length)
X_test = encode_text(tokenizer, testLines, length)
print(X_train.shape, X_test.shape)

model = define_model(length, vocab_size)
# model.load_weights("OutdoorSeating_concat_weights.best.hdf5")

# model.summary()

(2523, 91141) (631, 91141)


In [26]:
# create checkpoints and save the best weights
filepath=attribute_name+"_concat_weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# evaluate model on training dataset
history = model.fit([X_train,X_train,X_train], np.array(y_train), validation_split=0.1,
                    verbose=1, epochs=10, callbacks=callbacks_list)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.43083, saving model to GoodForKids_concat_weights.best.hdf5
Epoch 2/10
Epoch 00002: val_accuracy did not improve from 0.43083
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.43083
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.43083
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.43083
Epoch 6/10
Epoch 00006: val_accuracy did not improve from 0.43083
Epoch 7/10
Epoch 00007: val_accuracy did not improve from 0.43083
Epoch 8/10
Epoch 00008: val_accuracy improved from 0.43083 to 0.56917, saving model to GoodForKids_concat_weights.best.hdf5
Epoch 9/10
Epoch 00009: val_accuracy did not improve from 0.56917
Epoch 10/10
Epoch 00010: val_accuracy did not improve from 0.56917


In [27]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([X_test,X_test,X_test],np.array(y_test), verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 52.773374


# Review-level CNN 

In [7]:
# read in dataset
res_reviews = pd.read_csv(review_filename)
print(res_reviews.shape)

(100000, 2)


In [23]:
# clean 
cleaned_text = res_reviews.text.apply(clean_doc)

# split into train and test
trainLines, testLines, y_train, y_test = train_test_split(
    cleaned_text, res_reviews[attribute_name], test_size=0.3, random_state=seed_value)

# preprocess and create X_train and X_test
tokenizer = create_tokenizer(trainLines)
length = max_length(trainLines)
vocab_size = len(tokenizer.word_index) + 1
# encode data
X_train = encode_text(tokenizer, trainLines, length)
X_test = encode_text(tokenizer, testLines, length)
print(X_train.shape, X_test.shape)

model = define_model(length, vocab_size)
model.summary()

(70000, 540) (30000, 540)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 540)]        0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, 540)]        0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 540)]        0           []                               
                                                                                                  
 embedding_9 (Embedding)        (None, 540, 100)     6670200     ['input_10[0][0]']               
                                                                  

In [24]:
print(vocab_size)

66702


In [None]:
# create checkpoints and save the best weights
filepath="OutdoorSeating_review_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.load_weights("OutdoorSeating_review_model1_weights.hdf5")

# evaluate model on training dataset
history = model.fit([X_train,X_train,X_train], np.array(y_train), validation_split=0.1,
                    verbose=1, epochs=10, callbacks=callbacks_list)
# history = model.fit([X_train,X_train,X_train], np.array(y_train), validation_data=([X_test,X_test,X_test],np.array(y_test)),
#                     verbose=1, epochs=10, callbacks=callbacks_list)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.90171, saving model to OutdoorSeating_review_weights.hdf5
Epoch 2/10
 193/1969 [=>............................] - ETA: 8:29 - loss: 0.1043 - accuracy: 0.9649

In [None]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([X_test,X_test,X_test],np.array(y_test), verbose=0)
print('Test Accuracy: %f' % (acc*100))

In [None]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([X_train,X_train,X_train],np.array(y_train), verbose=0)
print('Train Accuracy: %f' % (acc*100))

In [None]:
# create checkpoints and save the best weights
filepath="OutdoorSeating_review_overfit_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.load_weights("OutdoorSeating_review_weights.hdf5")

# evaluate model on training dataset
history = model.fit([X_train,X_train,X_train], np.array(y_train), validation_data=([X_test,X_test,X_test],np.array(y_test)),
                    verbose=1, epochs=10, callbacks=callbacks_list)

In [None]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([X_train,X_train,X_train],np.array(y_train), verbose=0)
print('Train Accuracy: %f' % (acc*100))
# evaluate model on test dataset dataset
loss, acc = model.evaluate([X_test,X_test,X_test],np.array(y_test), verbose=0)
print('Test Accuracy: %f' % (acc*100))