In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow_hub as hub
import tensorflow as tf
import re

In [2]:
reviews = pd.read_csv("bq-100k-reviews.csv")

In [78]:
# reviews["RestaurantsPriceRange2"].describe()

count    100393.000000
mean          1.866246
std           0.623567
min           1.000000
25%           1.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: RestaurantsPriceRange2, dtype: float64

In [4]:
import re 

replace_puncts = {'`': "'", '′': "'", '“':'"', '”': '"', '‘': "'"}

strip_chars = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '[', ']', '>', '=', '+', '\\', '•',  '~', '@', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

puncts = ['!', '?', '$', '&', '/', '%', '#', '*','£']

def clean_str(x):
    x = str(x)
    
    x = x.lower()
    
    x = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", x)
    
    for k, v in replace_puncts.items():
        x = x.replace(k, f' {v} ')
        
    for punct in strip_chars:
        x = x.replace(punct, ' ') 
    
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
        
    x = x.replace(" '", " ")
    x = x.replace("' ", " ")
        
    return x

In [5]:
reviews['processed'] = reviews['text'].apply(clean_str)


In [6]:
# reviews['RestaurantsPriceRange2']

In [7]:
reviews['l'] = reviews['processed'].apply(lambda x: len(str(x).split(' ')))
print("mean length of sentence: " + str(reviews.l.mean()))
print("max length of sentence: " + str(reviews.l.max()))
print("std dev length of sentence: " + str(reviews.l.std()))

mean length of sentence: 129.15264012431146
max length of sentence: 1268
std dev length of sentence: 119.99864442876662


In [8]:
reviews['l'].describe()

count    100393.000000
mean        129.152640
std         119.998644
min           1.000000
25%          51.000000
50%          91.000000
75%         165.000000
max        1268.000000
Name: l, dtype: float64

In [9]:
sequence_length = 300
max_features = 20000 # this is the number of words we care about

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')
tokenizer.fit_on_texts(reviews['processed'].values)

# this takes our sentences and replaces each word with an integer
X = tokenizer.texts_to_sequences(reviews['processed'].values)

# we then pad the sequences so they're all the same length (sequence_length)
X = tf.keras.preprocessing.sequence.pad_sequences(X, sequence_length)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

y = pd.get_dummies(reviews['RestaurantsPriceRange2']).astype(float).values

# lets keep a couple of thousand samples back as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print("test set size " + str(len(X_test)))


test set size 10040


numpy.int32

In [13]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 87148 unique tokens.


In [14]:
import os 
embeddings_index = {}
f = open(os.path.join('/Users/sudharshan/data/glove.6B', 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:
num_words = min(max_features, len(word_index)) + 1
print(num_words)

embedding_dim = 100

# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_index.items():
    if i > max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

20001


In [18]:
from tensorflow.keras.layers import *
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.initializers import Constant
model = tf.keras.models.Sequential()

model.add(Embedding(num_words,
                    embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=sequence_length,
                    trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.25))
model.add(Dense(units=4, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

W0612 03:35:07.923534 4459541952 deprecation.py:506] From /anaconda2/envs/work/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0612 03:35:07.924489 4459541952 deprecation.py:506] From /anaconda2/envs/work/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0612 03:35:07.925212 4459541952 deprecation.py:506] From /anaconda2/envs/work/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) w

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          2000100   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 300, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 300, 128)          84480     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 260       
Total params: 2,126,056
Trainable params: 2,126,056
Non-trainable params: 0
____________________________________________

In [16]:
# model = tf.keras.Sequential([
# tf.keras.layers.Dense(64, activation='relu', input_shape=(300,)),
# tf.keras.layers.Dense(64, activation='relu'),
# tf.keras.layers.Dense(4, activation='softmax')])
# model.compile(optimizer=tf.train.AdamOptimizer(0.001),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

W0612 03:34:25.094464 4459541952 deprecation.py:506] From /anaconda2/envs/work/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [19]:
batch_size = 128
X_train.shape
# y_train.shape
history = model.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.1)

Train on 81317 samples, validate on 9036 samples


W0612 03:35:30.394325 4459541952 deprecation.py:323] From /anaconda2/envs/work/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=1)




[0.639458717720442, 0.7321713]

test set size 10040
