In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import keras
from keras import Model
from tensorflow.keras.layers import Flatten,LSTM, Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras_preprocessing.text import Tokenizer
from keras.initializers import glorot_uniform
from sklearn import model_selection


In [4]:
#Read in data
with open('/Users/saurmoha/Downloads/archive-3/train.csv', 'r') as file:
    text = file.readlines()

In [5]:
#create empty dataframe
x_train = pd.DataFrame()

In [6]:
# fill in dataframe
word=[]
label=[]
for n in text:
    n=n.split()
    label.append(1) if n[0] =="__label__2" else label.append(0)
    word.append(" ".join(n[1:]))
x_train['consumer_review'] = word
x_train['polarity_label'] = label

In [7]:
#view dataframe
x_train

Unnamed: 0,consumer_review,polarity_label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1
...,...,...
3599995,Don't do it!!: The high chair looks great when...,0
3599996,"Looks nice, low functionality: I have used thi...",0
3599997,"compact, but hard to clean: We have a small ho...",0
3599998,what is it saying?: not sure what this book is...,0


In [8]:
_, x_set,_, y_set = \
    model_selection.train_test_split(x_train['consumer_review'], 
                                     x_train['polarity_label'], test_size=0.02)

In [9]:
#data cleaning function
def data_prep(in_tex):
    # Remove punctuations and numbers
    out_tex = re.sub('[^a-zA-Z]', ' ', in_tex)
    # Convert upper case to lower case
    out_tex="".join(list(map(lambda x:x.lower(),out_tex)))
    # Remove single character
    out_tex= re.sub(r"\s+[a-zA-Z]\s+", ' ', out_tex)
    return out_tex

In [10]:
#create new list with clean data
text_set=[]
for reviews in list(x_set):
    text_set.append(data_prep(reviews))

In [11]:
x_train= pd.DataFrame()
x_train['consumer_review'] = text_set
x_train['polarity_label'] = list(y_set)


In [12]:
#split data into 70% train and 30% test
x_train, x_test, y_train, y_test = \
    model_selection.train_test_split(x_train['consumer_review'], 
                                     x_train['polarity_label'], test_size=0.30)

In [13]:
#convert to array
x_train=np.array(x_train.values.tolist())
x_test=np.array(x_test.values.tolist())
y_train=np.array(y_train.values.tolist())
y_test=np.array(y_test.values.tolist())

In [14]:
#tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
word_index=tokenizer.word_index
total_size = len(word_index)+1

In [15]:
print(total_size)

77547


In [16]:
#text to sequence
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [17]:
#add padding to ensure the same length
max_length = 100
x_train = pad_sequences(x_train, padding='post', maxlen=max_length)
x_test = pad_sequences(x_test, padding='post', maxlen=max_length)


In [18]:
#Create Model
model = Sequential()
model.add(Embedding(total_size, 20, input_length=max_length))
model.add(LSTM(32,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


In [19]:
#compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           1550940   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6784      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 1,557,757
Trainable params: 1,557,757
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
model.fit(x_train, y_train, batch_size=128, epochs=5, verbose=1, validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a7d29520>

In [21]:
model.save("model.h5")

In [22]:
model = keras.models.load_model("model.h5")