#### Code for the Medium article "Enriching Sequential LSTM Model with Non-Sequential Features".
https://pub.towardsai.net/enriching-sequential-lstm-model-with-non-sequential-features-7224b5262132

In [None]:
import pandas as pd
import random
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Flatten, Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D, Input, concatenate
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Seed the RNGs used in this notebook so a fresh "Restart & Run All" is repeatable.
# `random` drives the train/validation split below; NumPy's global generator is
# seeded as well so NumPy-based randomness starts from a known state.
# NOTE(review): the deep-learning backend's own generator (weight initialisation)
# is not seeded here — confirm whether fully deterministic training is required.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Location of the reviews CSV, relative to the notebook's working directory.
reviews_csv = "../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(filepath_or_buffer=reviews_csv)

In [None]:
# Peek at the first five rows to check that the file was parsed as expected.
df.head(n=5)

## Preprocessing

In [None]:
# Replace every missing value with an empty string so the text columns can be
# concatenated safely later on.
# NOTE(review): this applies to all columns, not just the text ones — a missing
# numeric value would become "" too; confirm the numeric columns have no gaps.
df = df.fillna(value="")

In [None]:
# Text fed to the sequence model: review title and review body joined by one space.
# Column-wise string concatenation replaces the row-by-row `apply`, avoiding a
# Python lambda call per row while producing the same strings.
# Relies on missing titles/texts having been replaced by "" beforehand.
df["seq_input"] = df["Title"] + " " + df["Review Text"]

In [None]:
def _draw_split(_row):
    """Randomly label one row "train" or "valid" using the global `random` RNG."""
    # randrange(0, 100) is uniform over 0..99; the 11 values 0..10 go to
    # validation, so roughly 11% of rows are held out (not exactly 10%).
    if random.randrange(0, 100) <= 10:
        return "valid"
    return "train"


# One independent draw per row; the row contents themselves are ignored.
df["split"] = df.apply(_draw_split, axis=1)

In [None]:
# Sanity check: how many rows landed in each partition.
df.loc[:, "split"].value_counts()

In [None]:
# Pack the non-sequential features into one list per row: [Age, Rating].
# Selecting both columns and converting them in one go replaces the row-by-row
# `apply`, avoiding a Python lambda call per row while yielding the same lists.
# The explicit index keeps the new column aligned with `df` row for row.
df["nonseq_input"] = pd.Series(
    df[["Age", "Rating"]].values.tolist(), index=df.index
)

In [None]:
# Separate the frame into its two partitions using the "split" label.
train_rows = df["split"].eq("train")
valid_rows = df["split"].eq("valid")
df_train = df.loc[train_rows]
df_val = df.loc[valid_rows]

In [None]:
# Build the vocabulary from the training text only; the validation text is
# never shown to the tokenizer during fitting.
tokenizer = Tokenizer(oov_token="'oov'")
tokenizer.fit_on_texts(texts=df_train["seq_input"])

In [None]:
# Every review is encoded as exactly `maxlen` token ids.
maxlen = 200


def _encode_texts(texts):
    """Convert raw texts to integer id sequences padded/truncated to `maxlen`."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=maxlen)


train_X = _encode_texts(df_train["seq_input"])
val_X = _encode_texts(df_val["seq_input"])

In [None]:
# Prediction target for both models, taken from the same column in each partition.
target_column = "Recommended IND"
train_Y = df_train[target_column]
val_Y = df_val[target_column]

In [None]:
glove_dir = "/kaggle/input/glove-global-vectors-for-word-representation/"

# Maps each token in the GloVe file to its float32 coefficient vector.
embedding_index = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        # Each line is whitespace-separated: the token first, then its coefficients.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
print('Found %s word vectors ' % len(embedding_index))

In [None]:
embedding_dim = 100
# One row per vocabulary entry plus one extra row, so every index in
# tokenizer.word_index is a valid row (presumably row 0 serves the padding id).
max_words = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros(shape=(max_words, embedding_dim))

# Copy the pretrained vector of every vocabulary word found in the GloVe index;
# words without a pretrained vector keep their all-zero row.
for token, row in tokenizer.word_index.items():
    pretrained = embedding_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

## Sequential features only
Predicting whether a customer recommends the product based solely on their text review.

In [None]:
# Text-only baseline: frozen pretrained embeddings -> bidirectional LSTM ->
# small dense head -> single sigmoid unit for the binary recommendation label.
inp = Input(shape=(maxlen,))
# trainable=False keeps the pretrained embedding weights fixed during training.
x = Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(LSTM(32))(x)
x = Dense(32, activation="relu")(x)
out = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=out)
model.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line after the table.
model.summary()

In [None]:
# Train the text-only model; the validation partition is evaluated after each epoch.
model.fit(
    x=train_X,
    y=train_Y,
    batch_size=512,
    epochs=30,
    validation_data=(val_X, val_Y),
)

In [None]:
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) prediction array
# into a 1-D vector of 0/1 integers, so predictions have the same shape and
# label type as val_Y instead of being a list of one-element boolean arrays.
val_pred = (model.predict(val_X) > 0.5).astype(int).ravel()

In [None]:
# Per-class precision / recall / F1 on the validation partition.
# classification_report returns a formatted string, hence the print().
print(classification_report(y_true=val_Y, y_pred=val_pred))

## Sequential + non-sequential features
Predicting whether a customer recommends the product based on their text review, age, and provided rating.

The rating obviously strongly correlates with the recommendation. Normally, we would probably not have such a strong feature.

In [None]:
# Correlate only the numeric columns: the frame also holds text and list-valued
# columns, which DataFrame.corr() rejects in recent pandas releases instead of
# silently dropping them. select_dtypes works the same across pandas versions.
df_train.select_dtypes(include="number").corr()

In [None]:
# Stack the per-row [Age, Rating] lists into 2-D arrays (one row per review)
# so they can be passed to the model as a second input.
train_X_nonseq = np.asarray(df_train["nonseq_input"].tolist())
val_X_nonseq = np.asarray(df_val["nonseq_input"].tolist())

In [None]:
# Two-input model: the text branch (frozen embeddings -> bidirectional LSTM) is
# concatenated with the raw non-sequential feature vector before the dense head.
seq_inp = Input(shape=(maxlen,))
# Width of the second input follows the number of non-sequential features.
nonseq_inp = Input(shape=(train_X_nonseq.shape[1],))
# trainable=False keeps the pretrained embedding weights fixed during training.
x = Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False)(seq_inp)
x = Bidirectional(LSTM(32))(x)
# Append the non-sequential features to the LSTM's output vector.
x = concatenate([x, nonseq_inp])
x = Dense(32, activation="relu")(x)
out = Dense(1, activation="sigmoid")(x)
model = Model(inputs=[seq_inp, nonseq_inp], outputs=out)
model.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line after the table.
model.summary()

In [None]:
# Train the two-input model; inputs are passed in the same order as the model's
# `inputs` list (sequence ids first, non-sequential features second).
model.fit(
    x=[train_X, train_X_nonseq],
    y=train_Y,
    batch_size=512,
    epochs=30,
    validation_data=([val_X, val_X_nonseq], val_Y),
)

In [None]:
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) prediction array
# into a 1-D vector of 0/1 integers, so predictions have the same shape and
# label type as val_Y instead of being a list of one-element boolean arrays.
val_pred = (model.predict([val_X, val_X_nonseq]) > 0.5).astype(int).ravel()

In [None]:
# Per-class precision / recall / F1 of the two-input model on the validation partition.
# classification_report returns a formatted string, hence the print().
print(classification_report(y_true=val_Y, y_pred=val_pred))