# Entity Extraction and Sentiment Analysis
## 1. Entity Extraction

## 2. Sentiment Analysis on Yelp data set
This is the training process on the Yelp dataset. If you want to check what we trained on IMDB, please see /Notebook_files/Sentiment_analysis_binary_classifier.ipynb

In [2]:
import numpy as np
import pandas as pd
import json
import re
import os
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, concatenate
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Sequential, Model
from keras.models import model_from_json
from nltk.corpus import stopwords
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

In [3]:
# Load the Yelp reviews (one JSON object per line) and keep only the fields we need.
REVIEW_FIELDS = ['review_id', 'user_id', 'business_id', 'stars', 'text']
review_data = []
# `with` guarantees the handle is closed even if a line fails to parse; the explicit
# encoding avoids depending on the platform default (assumes the dump is UTF-8).
with open('data/review.json', 'r', encoding='utf-8') as review_file:
    for line in review_file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        sample = json.loads(line)
        # One row per review, fields in REVIEW_FIELDS order.
        review_data.append([sample[field] for field in REVIEW_FIELDS])

In [4]:
# Build the frame straight from the list of rows and name the columns at construction time.
# (DataFrame.from_dict is meant for dict input; the plain constructor is the documented
# way to turn a list of row lists into a frame, and it also works when the list is empty.)
df_reviews = pd.DataFrame(
    review_data,
    columns=['review_id', 'user_id', 'business_id', 'stars', 'text'],
)

In [5]:
# Name the columns in the same order the fields were appended to each row of review_data.
df_reviews.columns = ['review_id','user_id','business_id','stars','text']

In [6]:
# Preview the first rows of the reviews before the text is cleaned up.
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
0,v0i_UHJMo_hPBq9bxWvW4w,bv2nCi5Qv5vroFiqKGopiw,0W4lkclzZThpx3V65bVgig,5,"Love the staff, love the meat, love the place...."
1,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,Super simple place but amazing nonetheless. It...
2,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,Small unassuming place that changes their menu...
3,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,Lester's is located in a beautiful neighborhoo...
4,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,Love coming here. Yes the place always needs t...


In [7]:
# Text normalisation: lowercase, strip everything except a-z, drop stopwords.
_NON_LETTERS_RE = re.compile('[^a-z]+')  # compiled once instead of on every call
_ENGLISH_STOPWORDS = None  # filled lazily so defining this cell does not need the NLTK corpus


def _english_stopwords():
    """Return the NLTK English stopword set, reading it from disk only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def cleanup(sentence, stops=None):
    """Normalise one review text.

    The text is lowercased, every run of characters outside a-z is replaced by a
    single space, and the remaining words that appear in ``stops`` are dropped.

    Parameters
    ----------
    sentence : str
        Raw review text.
    stops : collection of str, optional
        Words to remove. Defaults to the NLTK English stopword list, which is
        loaded once and cached (the original reloaded it for every review).

    Returns
    -------
    str
        The kept words joined by single spaces ('' if nothing is left).
    """
    if stops is None:
        stops = _english_stopwords()
    # str.split() with no argument already ignores leading/trailing whitespace.
    words = _NON_LETTERS_RE.sub(' ', sentence.lower()).split()
    return " ".join(word for word in words if word not in stops)


In [8]:
# Clean every review and write the result back into the frame.
# Previously the cleaned strings were only collected in `texts`, so on a fresh
# "Run All" df_reviews['text'] stayed raw and the tokenizer further down would
# have been fitted on uncleaned text.
texts = [cleanup(raw_text) for raw_text in df_reviews['text']]
df_reviews['text'] = texts

In [11]:
# Preview the first rows again; the 'text' column is expected to hold the cleaned text here.
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
0,v0i_UHJMo_hPBq9bxWvW4w,bv2nCi5Qv5vroFiqKGopiw,0W4lkclzZThpx3V65bVgig,5,love staff love meat love place prepare long l...
1,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,super simple place amazing nonetheless around ...
2,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,small unassuming place changes menu every ofte...
3,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,lester located beautiful neighborhood since kn...
4,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,love coming yes place always needs floor swept...


In [31]:
# Persist the reviews frame to CSV (index not written) so it can be reloaded without re-parsing the JSON.
df_reviews.to_csv('data/reviews.csv', index = False)

In [None]:
# Reload the reviews frame from data/reviews.csv.
# NOTE(review): empty text fields are presumably read back as NaN by read_csv — confirm downstream handling.
df_reviews = pd.read_csv('data/reviews.csv', index_col=False)

In [12]:
# use Glove to implement word embedding method
def get_word_embedding(DIR):
    """Load pre-trained word vectors from a GloVe-style text file.

    Each non-empty line is expected to be ``<word> <v1> <v2> ...`` separated by
    whitespace.

    Parameters
    ----------
    DIR : str
        Path to the vectors file (despite the name, a file path, not a directory).

    Returns
    -------
    dict
        Maps each word to a 1-D float32 numpy array of its vector.
    """
    embeddings = {}
    # `with` closes the file even if a line fails to parse; the explicit encoding
    # avoids decode errors on platforms whose default is not UTF-8 (GloVe text
    # files are assumed to be UTF-8).
    with open(DIR, encoding='utf-8') as glove_file:
        for line in glove_file:
            parts = line.split()
            if not parts:
                continue  # skip blank lines instead of raising IndexError
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings
# Path to the GloVe vectors file (the file name suggests 100-d vectors, matching EMBEDDING_DIM = 100).
GLOVE_DIR = "glove.6B/glove.6B.100d.txt"
# word -> vector dictionary, used further down to fill the embedding matrix.
word_embeddings = get_word_embedding(GLOVE_DIR)

In [13]:
# make sure all the texts are strings
# Missing texts (e.g. reviews that were empty when read back from CSV) are NaN;
# replace them with '' first, otherwise astype(str) turns them into the literal
# token 'nan', which the tokenizer would then treat as a real word.
df_reviews['text'] = df_reviews['text'].fillna('').astype(str)

In [27]:
import nltk
# Fetch only the stopword corpus that cleanup() needs. A bare nltk.download()
# opens the interactive downloader, which blocks a non-interactive "Run All".
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [35]:
# get the target values: the 'stars' column is what the model is fitted against
y = df_reviews['stars']

In [15]:
# set some parameters
# Sequence length used by pad_sequences and by the model's Input layer.
MAX_LENGTH = 1000
# Passed to the Tokenizer as num_words.
MAX_NB_WORDS = 20000
# Width of each embedding vector (second dimension of the embedding matrix).
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.1

In [16]:
# Fit a Keras tokenizer on the review texts; MAX_NB_WORDS is passed as its num_words limit.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df_reviews['text'])


In [17]:
# Turn each review into a fixed-length sequence of word indices, then shuffle
# features and targets with the same permutation.
sequences = tokenizer.texts_to_sequences(df_reviews['text'])
word_index = tokenizer.word_index
LSTM_data = pad_sequences(sequences, maxlen=MAX_LENGTH)

# Seeded generator so the train/validation split is reproducible across runs
# (the original used the unseeded global RNG).
SHUFFLE_SEED = 42
indices = np.random.RandomState(SHUFFLE_SEED).permutation(LSTM_data.shape[0])

LSTM_data_shuffled = LSTM_data[indices]
# .iloc selects by position. Plain y[indices] is a label lookup on the Series
# and only lines up with LSTM_data when the index is exactly 0..n-1.
y_shuffled = y.iloc[indices]


In [25]:
# store the tokenizer's word -> index dictionary for later use
import pickle


# Write the mapping to disk.
with open('word_index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read it straight back to check the round trip.
with open('word_index.pickle', 'rb') as handle:
    b = pickle.load(handle)

# Prints True when the reloaded mapping equals the in-memory one.
print (word_index == b)

True


In [40]:
# Hold-out split (not cross-validation): the last VALIDATION_SPLIT fraction of the
# shuffled data becomes the validation set, the rest is used for training.
nb_validation_samples = int(VALIDATION_SPLIT * LSTM_data_shuffled.shape[0])
# Slice at an explicit boundary. With negative slicing, nb_validation_samples == 0
# would make [:-0] an empty training set and [-0:] the entire data set.
split_at = LSTM_data_shuffled.shape[0] - nb_validation_samples
x_train = LSTM_data_shuffled[:split_at]
y_train = y_shuffled[:split_at]
x_val = LSTM_data_shuffled[split_at:]
y_val = y_shuffled[split_at:]

In [45]:
# Write the training and validation features/targets to .npy files under data/.
np.save('data/x_train.npy',x_train)
np.save('data/y_train.npy',y_train)
np.save('data/x_val.npy',x_val)
np.save('data/y_val.npy',y_val)

In [41]:
# Build the Keras embedding layer from the GloVe vectors.
# One row per entry of word_index plus row 0, which word_index never assigns.
vocab_rows = len(word_index) + 1
# Every row starts with uniform random values; rows of words that have a GloVe
# vector are overwritten below. Words missing from GloVe therefore keep their
# random values (they are NOT zeroed).
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = word_embeddings.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector
print ('Length of embedding_matrix:', embedding_matrix.shape[0])
# NOTE(review): the Tokenizer was created with num_words=MAX_NB_WORDS, so this
# matrix is probably larger than the model needs — confirm before shrinking it.
# trainable=False keeps these weights fixed during training.
embedding_layer = Embedding(
    vocab_rows,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    mask_zero=False,
    input_length=MAX_LENGTH,
    trainable=False,
)


Length of embedding_matrix: 668401
Traing and validation set number of positive and negative reviews
17651584
1962548


In [42]:
# define rmse function
import keras.backend as K
def rmse(y_true, y_pred):
    """Root-mean-squared error between the targets and the rounded predictions.

    Both tensors are cast to float32, and the predictions are passed through
    K.round before the error is computed.
    """
    targets = K.cast(y_true, dtype='float32')
    rounded_predictions = K.round(K.cast(y_pred, dtype='float32'))
    squared_error = K.square(targets - rounded_predictions)
    return K.sqrt(K.mean(squared_error))

In [43]:
# Build the network: embedding layer -> bidirectional LSTM -> stack of dense
# layers of decreasing width -> one output unit.
sequence_input = Input(shape=(MAX_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
hidden = Bidirectional(LSTM(128, return_sequences=False))(embedded_sequences)

# (units, activation) of each hidden dense layer, applied in this order.
DENSE_STACK = [(128, 'tanh'), (32, 'relu'), (8, 'relu'), (4, 'relu')]
for units, activation in DENSE_STACK:
    hidden = Dense(units, activation=activation)(hidden)

# Single unit with no activation argument: one real-valued output per review.
prediction = Dense(1)(hidden)
model = Model(sequence_input, prediction)

# Mean-squared-error loss; 'acc' and the custom rmse are tracked as metrics.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['acc', rmse],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         66840100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 36        
__________

In [44]:
# Train on the Yelp dataset. Training is slow, so the run shown in this notebook
# was interrupted; we actually trained our model on a server.
filepath = "model/LSTM.hdf5"
# Create the checkpoint directory up front so the first save cannot fail on a missing folder.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# `monitor` must be the *name* of a logged quantity (a string). Passing the rmse
# function object means the callbacks never find a value in the epoch logs, so no
# checkpoint is saved and early stopping never triggers. With validation data the
# custom metric is logged under 'val_rmse'.
MONITORED_METRIC = 'val_rmse'
checkpoint = ModelCheckpoint(filepath, monitor=MONITORED_METRIC, verbose=1,
                             save_best_only=True, mode='min', period=1)
earlystopping = EarlyStopping(monitor=MONITORED_METRIC, min_delta=0.001, patience=5,
                              verbose=1, mode='min')
callbacks_list = [checkpoint, earlystopping]
print('Train...')
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()
history = model.fit(x=x_train, y=y_train, shuffle=True, validation_data=(x_val, y_val),
                    callbacks=callbacks_list, epochs=20, batch_size=128)

Train...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         66840100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 36        
_

KeyboardInterrupt: 

In [46]:
# store the model architecture (layer configuration only) as JSON
LSTM_model_json = model.to_json()
with open('model/b_lstm_final.json','w') as json_file:
    json_file.write(LSTM_model_json)


In [None]:
# Save the model's weights next to the JSON architecture file in model/.
model.save_weights('model/b_lstm_final.h5')