# Trip Advisor Model Training Notebook
- Roughly 90,000 rows in the balanced training set, 10,000 in validation, and 40,000 in test

### Preprocessing the Trip Advisor (TA) data and splitting it into train and test sets

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

os.chdir('/home/ubuntu/Notebooks/capstone2/src')
from training_data_cleaning import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from functions import embedding_mat
from nltk.stem import WordNetLemmatizer
import nltk
import warnings
import pickle

# Load the raw Trip Advisor reviews and clean them with the project helper
# `ta_data_cleaning` (imported from training_data_cleaning).
os.chdir('/home/ubuntu/Notebooks/data/ta')
ta_data = ta_data_cleaning(pd.read_pickle('ta_data2.pickle'))

# Replace the values 1.0 and 0.0 with 2.0, then draw a reproducible
# 150,000-row sample that becomes the training pool.
ta_data_train = (
    ta_data
    .replace(1.0, 2.0)
    .replace(0.0, 2.0)
    .sample(n=150000, random_state=3)
)

# Every row that was not sampled is written out as the held-out test set.
# A vectorised membership mask replaces the original Python-level list
# comprehension + `.loc[list]`, which was slow and would duplicate rows if an
# index label ever appeared more than once.
# NOTE(review): the test set is taken from `ta_data` *before* the 0/1 -> 2
# replacement, so its values are not collapsed the way the training values
# are — confirm the evaluation code accounts for this.
is_train_row = ta_data.index.isin(ta_data_train.index)
ta_data.loc[~is_train_row].to_pickle('/home/ubuntu/Notebooks/data/ta/ta_test.pickle')

# Release the full frame and the mask; only the training sample is needed from here on.
del ta_data
del is_train_row

# `balance_df_ta` is a project helper; it returns the review texts (X) and
# their labels (y) — presumably class-balanced, verify in training_data_cleaning.
X, y = balance_df_ta(ta_data_train)


Using TensorFlow backend.


## LSTM
- Import the pretrained word vectors (`word_vectors`)
- Preprocess the review text and convert it to sequences of integers
- Pad the sequences
- Build the model
- Train and test

#### Importing word2vec

In [91]:
# Pretrained word2vec embeddings: read the GoogleNews binary file into a
# gensim KeyedVectors object.
from gensim.models.keyedvectors import KeyedVectors

# The vector file path below is relative to this directory.
os.chdir('/home/ubuntu/Notebooks')

word_vectors = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz',
    limit=3000000,  # upper bound on how many vectors are read from the file
    binary=True,
)

#### Preprocess, tokenize, and pad

In [2]:
# Mean number of whitespace-separated tokens per review, truncated to an int.
# Assumes X is a pandas Series of review strings — TODO confirm against balance_df_ta.
tokens_per_review = X.str.split().str.len()
avg_rev_length = int(tokens_per_review.mean())
avg_rev_length

161

In [98]:
# --- Train / validation split ------------------------------------------------
# Hold out 10% of the data for validation. The seed is fixed so the split
# (and hence the reported validation metrics) is reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=.1, random_state=3)

### hyperparams for NN Model
# Sequences are padded/truncated to the mean review length (in whitespace tokens).
avg_rev_length = int(X.apply(lambda x: len(x.split())).mean())
maxlen = avg_rev_length
training_samples = x_train.shape[0]
validation_samples = x_valid.shape[0]
batch_size = 25
epochs = 2
# Presumably matches the 300-d GoogleNews vectors loaded earlier — confirm.
embedding_dim = 300
# Alias kept because the original cell defined both spellings.
embedding_dims = embedding_dim

# --- Tokenize and pad --------------------------------------------------------
# NOTE(review): the vocabulary is fitted on all of X (train + validation).
# Left unchanged because `word_index` is pickled later for reuse; consider
# fitting on x_train only if strict train/validation isolation is required.
token = Tokenizer(char_level=False, lower=True)
token.fit_on_texts(X)

x_train_seq = pad_sequences(token.texts_to_sequences(x_train), maxlen=maxlen)
x_valid_seq = pad_sequences(token.texts_to_sequences(x_valid), maxlen=maxlen)

# --- Embedding matrix --------------------------------------------------------
word_index = token.word_index
# Keras Tokenizer indices start at 1 (0 is the padding value), so the largest
# index equals len(word_index). The Embedding layer's input_dim must be
# "maximum index + 1"; without the +1 the last vocabulary word is out of range.
max_words = len(word_index) + 1
# `embedding_mat` is a project helper; assumed to return an array of shape
# (max_words, embedding_dim), as required by the later set_weights call — confirm.
embedding_matrix = embedding_mat(max_words, embedding_dim, word_index, word_vectors)

In [105]:
# One-hot encode the labels.
# The encoder is fitted on the training labels only and then *reused* for the
# validation labels, so both matrices are guaranteed to share the same columns
# in the same order. (The original re-fitted on the validation labels, which
# misaligns columns if a class is missing from the validation split.)
# categories='auto' derives the columns from the unique label values and
# silences the integer-handling FutureWarning shown in this cell's output.
ohe = OneHotEncoder(categories='auto')
# Both results are sparse matrices; the training cell densifies with .toarray().
y_train_new = ohe.fit_transform(y_train.values.reshape(-1, 1))
y_valid_new = ohe.transform(y_valid.values.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### LSTM Model Building

In [106]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding

# Width of the LSTM hidden state. The original passed `batch_size` here, which
# silently tied the model size to the training batch size; the value (25) is
# unchanged but now has its own name.
lstm_units = 25
# One softmax output per one-hot label column (previously hard-coded as 4).
num_classes = y_train_new.shape[1]

# Embedding -> Dropout -> LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Dropout(rate=.2))
# return_sequences=False: only the final LSTM output feeds the classifier.
model.add(LSTM(lstm_units, return_sequences=False))
model.add(Dense(num_classes, activation='softmax'))
# Initialise the embedding layer with the pretrained matrix built earlier.
# The layer is not frozen here, so these weights are updated during training.
model.layers[0].set_weights([embedding_matrix])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [107]:
# Multi-class setup: softmax outputs trained with categorical cross-entropy,
# tracking accuracy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

# The one-hot label matrices are scipy sparse matrices; densify BOTH of them.
# The original densified only the training labels and passed the validation
# labels as a sparse matrix, which is inconsistent and not accepted by every
# Keras version.
history = model.fit(x_train_seq, y_train_new.toarray(),
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(x_valid_seq, y_valid_new.toarray()))

Instructions for updating:
Use tf.cast instead.
Train on 89025 samples, validate on 9892 samples
Epoch 1/2
Epoch 2/2


In [75]:
# Change the working directory; relative paths used after this cell resolve against it.
# NOTE(review): this cell's execution count (In [75]) is out of sequence with its
# neighbours, so the directory in effect when later cells actually ran may have
# differed — confirm with a Restart & Run All.
os.chdir('/home/ubuntu/Notebooks/data')

In [109]:
# Save the trained Keras model to an .h5 file. The path is relative, so the
# file lands in whatever the current working directory is when this cell runs.
model.save('first_model_ta.h5')

In [110]:
# Persist the tokenizer's word -> index mapping so the same encoding can be
# reapplied to new text later.
os.chdir('/home/ubuntu/Notebooks/capstone2/data')
# `with` guarantees the file handle is closed even if pickle.dump raises
# (the original open()/close() pair leaked the handle on error).
with open("word_dict_ta_new.pkl", "wb") as f:
    pickle.dump(word_index, f)