In [1]:
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
import gzip
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional
import pickle
from datetime import datetime

# Import Data

In [2]:
data = []
with gzip.open('../data/reviews_Video_Games_5.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
df = pd.DataFrame.from_dict(data).dropna().reset_index(drop=True)
X = df.loc[:,'reviewText']
y = df.loc[:,'overall']

In [3]:
# only work with a subset of the data for the purpose of speed
X = X[:10_000]
y = y[:10_000]

# Clean Data

In [4]:
def fivestar_to_binary(number):
    """Convert 1-5 rating scale to binary 'good' or 'bad' reviews"""
    return [1] if number>3 else [0]

In [5]:
y = y.apply(fivestar_to_binary)

In [6]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [7]:
def remove_stopwords(article):
    for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
    return article

In [8]:
# split into train, test, validation sets (60, 20, 20% respectively)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)

Note that we will not use the test set for anything other than evaluation of our text attacks.

In [9]:
#save test data for later
with open('../data/train.pickle', 'wb') as wf:
    testdata = (X_test,y_test)
    pickle.dump(testdata, wf)

# Build model

In [10]:
vocab_size = 100_000
embedding_dim = 120
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

### Tokenizer

In [11]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [12]:
def preprocess(data, tokenizer):
    new_data = []
    for datapoint in data:
        new_data.append(remove_stopwords(datapoint))
    # turn strings into sequences:
    new_data = tokenizer.texts_to_sequences(new_data)
    # pad sequences
    return pad_sequences(new_data, maxlen=max_length, padding=padding_type, truncating=trunc_type)

### LSTM

In [13]:
# build the model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(120, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(2, activation="sigmoid"))

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 120)         12000000  
_________________________________________________________________
dropout (Dropout)            (None, None, 120)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 240)               231360    
_________________________________________________________________
dense (Dense)                (None, 120)               28920     
_________________________________________________________________
dropout_1 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 242       
Total params: 12,260,522
Trainable params: 12,260,522
Non-trainable params: 0
____________________________________________

In [15]:
# compile the model
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

# Train

In [16]:
#prep train and validation sets
X_train = preprocess(X_train, tokenizer)
X_valid = preprocess(X_val, tokenizer)

In [17]:
type(y_val)

pandas.core.series.Series

In [22]:
y_train

901     [1]
8178    [1]
4055    [0]
862     [1]
6997    [1]
       ... 
1484    [1]
4757    [1]
7339    [1]
1113    [1]
561     [1]
Name: overall, Length: 6000, dtype: object

In [18]:
num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_valid, y_val), verbose=2)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

# Save model 

In [None]:
with open('models/model.{}.pickle'.format(datetime.now().strftime('%Y_%m_%d__%H_%M')), 'wb') as wf:
    pickle.dump(model, wf)

# Prediction
copy the cell below any time you want to use your model on some data X

In [None]:
with open('models/model.2021_04_16__12_34.pickle','r') as rf:
    model = pickle.load(rf)

In [None]:
def predict(txt):
    seq = tokenizer.texts_to_sequences(txt)
    padded = pad_sequences(seq, maxlen=max_length)
    return model.predict(padded)

In [None]:
predict("test text just to make sure it works.")