## Setup

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Load Dataset
Train / Validation / Test = 7 / 1 / 2

In [None]:
# Read the pre-split train / validation / test CSVs, keeping only the review
# text and its sentiment label.
df_train, df_val, df_test = (
    pd.read_csv(csv_path, usecols=['review', 'sentiment'])
    for csv_path in (
        '../input/imdb-dataset/train.csv',
        '../input/imdb-dataset/val.csv',
        '../input/imdb-dataset/test.csv',
    )
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df_train.info()
# Show a small preview instead of rendering the whole frame.
df_train.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df_val.info()
# Show a small preview instead of rendering the whole frame.
df_val.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df_test.info()
# Show a small preview instead of rendering the whole frame.
df_test.head()

## Metrics

In [None]:
def get_metrics(y_test, y_pred_proba, threshold=0.5):
    """Print accuracy, macro F1, ROC AUC and the confusion matrix.

    Args:
        y_test: True binary labels (1-D array-like).
        y_pred_proba: Predicted positive-class scores. Any array-like is
            accepted (list, Series, ndarray), including an ``(n, 1)`` column
            such as a single-unit output layer produces; it is flattened
            to 1-D before scoring.
        threshold: Scores greater than or equal to this value count as the
            positive class. Defaults to 0.5, the cut-off previously hard-coded.

    Returns:
        None. The four results are printed.
    """
    scores = np.asarray(y_pred_proba, dtype=float).ravel()
    # Threshold once and reuse the hard labels for every label-based metric.
    predicted = scores >= threshold

    print('ACCURACY_SCORE: ', round(accuracy_score(y_test, predicted), 4))
    print('F1_SCORE: ', round(f1_score(y_test, predicted, average="macro"), 4))
    # ROC AUC is threshold-free, so it is computed on the raw scores.
    print('ROC_AUC_SCORE: ', round(roc_auc_score(y_test, scores), 4))
    print('CONFUSION_MATRIX:\n', confusion_matrix(y_test, predicted), '\n')

## Data Cleaning
Ref: https://www.kaggle.com/colearninglounge/nlp-data-preprocessing-and-cleaning

Raw text gives better results than preprocessed text

In [None]:
# Removes punctuation
def remove_punctuations(data):
    """Return ``data`` without any character that is neither a word character
    nor whitespace. Underscores count as word characters and are kept."""
    return re.sub(r'[^\w\s]', r'', data)

# Removes HTML tags
def remove_html(data):
    """Replace every ``<...>`` tag in ``data`` with a single space.

    A space (rather than the empty string) is substituted so that words
    separated only by a tag, e.g. ``"good<br />bad"``, do not fuse into one
    token. Any extra whitespace this leaves behind is harmless for
    whitespace-based tokenisation.
    """
    html_tag = re.compile(r'<.*?>')
    return html_tag.sub(' ', data)

# Removes URLs
def remove_url(data):
    """Strip URLs from ``data``.

    Matches ``http://`` and ``https://`` links as well as bare ``www.``
    addresses, each up to the next whitespace character.
    """
    # `https?` also covers plain http:// links, which `https://` alone missed.
    url_clean = re.compile(r"https?://\S+|www\.\S+")
    return url_clean.sub(r'', data)

# Removes emojis
def remove_emoji(data):
    """Strip emoji and pictographic characters from ``data``.

    Only characters inside the code-point ranges below are removed; URL
    handling belongs to ``remove_url`` and is deliberately not repeated here.

    NOTE(review): the final range (U+24C2 through U+1F251) is very wide and
    also covers non-emoji characters in that interval, such as CJK
    ideographs. It is kept unchanged; narrow it if such text must survive.
    """
    emoji_clean = re.compile("["
                             u"\U0001F600-\U0001F64F"  # emoticons
                             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                             u"\U0001F680-\U0001F6FF"  # transport & map symbols
                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                             u"\U00002702-\U000027B0"
                             u"\U000024C2-\U0001F251"
                             "]+", flags=re.UNICODE)
    return emoji_clean.sub(r'', data)

# Order matters: remove_punctuations deletes '<', '>', '/', ':' and '.', after
# which the tag and URL patterns can no longer match. It therefore runs last,
# once markup, links and emojis have been stripped.
CLEANING_STEPS = (remove_html, remove_url, remove_emoji, remove_punctuations)

# Apply the same steps to every split so train / val / test stay consistent.
for frame in (df_train, df_val, df_test):
    for step in CLEANING_STEPS:
        frame['review'] = frame['review'].apply(step)

In [None]:
# (pattern, replacement) pairs, applied strictly in this order. Patterns are
# regular expressions matched anywhere in the text (no word boundaries), so
# e.g. "he's" also rewrites the tail of "She's". The "\x89Ûª" / "å«" variants
# target mis-encoded apostrophes.
_CONTRACTIONS = (
    (r"he's", "he is"),
    (r"there's", "there is"),
    (r"We're", "We are"),
    (r"That's", "That is"),
    (r"won't", "will not"),
    (r"they're", "they are"),
    (r"Can't", "Cannot"),
    (r"wasn't", "was not"),
    (r"don\x89Ûªt", "do not"),
    (r"aren't", "are not"),
    (r"isn't", "is not"),
    (r"What's", "What is"),
    (r"haven't", "have not"),
    (r"hasn't", "has not"),
    (r"There's", "There is"),
    (r"He's", "He is"),
    (r"It's", "It is"),
    (r"You're", "You are"),
    (r"I'M", "I am"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"i'm", "I am"),
    (r"I\x89Ûªm", "I am"),
    (r"I'm", "I am"),
    (r"Isn't", "is not"),
    (r"Here's", "Here is"),
    (r"you've", "you have"),
    (r"you\x89Ûªve", "you have"),
    (r"we're", "we are"),
    (r"what's", "what is"),
    (r"couldn't", "could not"),
    (r"we've", "we have"),
    (r"it\x89Ûªs", "it is"),
    (r"doesn\x89Ûªt", "does not"),
    (r"It\x89Ûªs", "It is"),
    (r"Here\x89Ûªs", "Here is"),
    (r"who's", "who is"),
    (r"I\x89Ûªve", "I have"),
    (r"y'all", "you all"),
    (r"can\x89Ûªt", "cannot"),
    (r"would've", "would have"),
    (r"it'll", "it will"),
    (r"we'll", "we will"),
    (r"wouldn\x89Ûªt", "would not"),
    (r"We've", "We have"),
    (r"he'll", "he will"),
    (r"Y'all", "You all"),
    (r"Weren't", "Were not"),
    (r"Didn't", "Did not"),
    (r"they'll", "they will"),
    (r"they'd", "they would"),
    (r"DON'T", "DO NOT"),
    (r"That\x89Ûªs", "That is"),
    (r"they've", "they have"),
    (r"i'd", "I would"),
    (r"should've", "should have"),
    (r"You\x89Ûªre", "You are"),
    (r"where's", "where is"),
    (r"Don\x89Ûªt", "Do not"),
    (r"we'd", "we would"),
    (r"i'll", "I will"),
    (r"weren't", "were not"),
    (r"They're", "They are"),
    (r"Can\x89Ûªt", "Cannot"),
    (r"you\x89Ûªll", "you will"),
    (r"I\x89Ûªd", "I would"),
    (r"let's", "let us"),
    (r"it's", "it is"),
    (r"can't", "cannot"),
    (r"don't", "do not"),
    (r"you're", "you are"),
    (r"i've", "I have"),
    (r"that's", "that is"),
    (r"doesn't", "does not"),
    (r"didn't", "did not"),
    (r"ain't", "am not"),
    (r"you'll", "you will"),
    (r"I've", "I have"),
    (r"Don't", "do not"),
    (r"I'll", "I will"),
    (r"I'd", "I would"),
    (r"Let's", "Let us"),
    (r"you'd", "You would"),
    (r"Ain't", "am not"),
    (r"Haven't", "Have not"),
    (r"Could've", "Could have"),
    (r"youve", "you have"),
    (r"donå«t", "do not"),
)


def remove_abb(data):
    """Expand English contractions in ``data`` and return the result.

    Every rule in ``_CONTRACTIONS`` is applied in sequence with ``re.sub``.
    Matching is case-sensitive, which is why several rules exist in both a
    lower-case and a capitalised form.
    """
    for pattern, expansion in _CONTRACTIONS:
        data = re.sub(pattern, expansion, data)
    return data
    
# Expand contractions in every split.
# NOTE(review): the cleaning cell above has already run remove_punctuations on
# these columns, which strips apostrophes, so apostrophe-based patterns cannot
# match any more at this point - only apostrophe-free ones such as "youve" can.
for frame in (df_train, df_val, df_test):
    frame['review'] = frame['review'].apply(remove_abb)

In [None]:
# Sanity check after cleaning: size and first rows of each split, in
# train / val / test order.
for split_frame in (df_train, df_val, df_test):
    print(split_frame.shape)
    print(split_frame.head(5))

## Model

### LSTM

In [None]:
def LSTM_V0(embedding):
    """Placeholder for a plain-LSTM classification head.

    Args:
        embedding: Output tensor of the embedding layer.

    NOTE(review): the body is elided (``#...``). As written, ``outputs`` is
    never assigned, so calling this function raises NameError. It is also not
    registered in ``create_model`` (the ``'LSTM'`` entry there is empty).
    TODO: restore the layer stack before use.
    """
    #...
    return outputs

### BiLSTM

In [None]:
def BiLSTM_V0(embedding):
    """Bidirectional-LSTM head.

    Stack: BiLSTM (32 units per direction, full sequence) -> average over
    time -> Dense(20, relu) -> Dropout(0.5) -> single sigmoid unit.

    Args:
        embedding: Output tensor of the embedding layer.

    Returns:
        Output tensor of the final sigmoid layer (named ``classifier``).
    """
    head_layers = [
        Bidirectional(LSTM(units=32, return_sequences=True)),
        GlobalAveragePooling1D(),
        Dense(20, activation='relu'),
        Dropout(rate=0.5),
        Dense(1, activation='sigmoid', name='classifier'),
    ]
    tensor = embedding
    for layer in head_layers:
        tensor = layer(tensor)
    return tensor

### CNN + LSTM

In [None]:
def CNN_LSTM_V0(embedding):
    """Conv1D -> LSTM head.

    Stack: Dropout(0.3) -> Conv1D(200 filters, width 5, relu) ->
    MaxPooling1D(2) -> LSTM(100) -> Dropout(0.3) -> Dense(16, relu) ->
    single sigmoid unit.

    Args:
        embedding: Output tensor of the embedding layer.

    Returns:
        Output tensor of the final sigmoid layer (named ``classifier``).
    """
    x = Dropout(0.3)(embedding)

    # Convolutional feature extraction, then halve the sequence length.
    x = Conv1D(200, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)

    # Summarise the pooled sequence into a single vector.
    x = LSTM(100)(x)
    x = Dropout(0.3)(x)

    x = Dense(16, activation='relu')(x)
    return Dense(1, activation='sigmoid', name='classifier')(x)

def CNN_LSTM_V1(embedding):
    """Four parallel Conv1D -> LSTM branches merged into one classifier.

    Each branch applies Conv1D(128 filters, relu) with its own kernel width,
    then MaxPooling1D(2), Dropout(0.5), BatchNormalization and LSTM(128).
    The four LSTM outputs are concatenated and passed through
    Dense(100, relu) -> Dropout(0.2) -> a single sigmoid unit.

    Args:
        embedding: Output tensor of the embedding layer.

    Returns:
        Output tensor of the final sigmoid layer (named ``classifier``).

    NOTE(review): the kernel widths are 3*32, 5*32, 7*32 and 9*32 time steps
    (96 to 288). With the default 'valid' padding the input sequence must be
    at least 288 steps long. Confirm the ``* 32`` factor is intentional.
    """
    branch_outputs = []
    for width in (3, 5, 7, 9):
        net = Conv1D(filters=128, kernel_size=width * 32, activation='relu')(embedding)
        net = MaxPooling1D(pool_size=2)(net)
        net = Dropout(0.5)(net)
        net = BatchNormalization()(net)
        branch_outputs.append(LSTM(128)(net))

    merged = concatenate(branch_outputs)
    dense = Dense(100, activation='relu')(merged)
    drop = Dropout(0.2)(dense)
    # Feed the classifier from `drop`; reading `merged` here would leave the
    # Dense(100) and Dropout layers disconnected from the model.
    outputs = Dense(1, activation='sigmoid', name='classifier')(drop)
    return outputs

### LSTM + CNN

In [None]:
def LSTM_CNN_V0(embedding):
    """LSTM -> Conv1D head.

    Stack: LSTM(256, full sequence) -> Conv1D(100 filters, width 2, relu,
    'valid' padding) -> global max pooling -> Dropout(0.2) ->
    Dense(64, relu) -> single sigmoid unit.

    Args:
        embedding: Output tensor of the embedding layer.

    Returns:
        Output tensor of the final sigmoid layer (named ``classifier``).
    """
    sequence = LSTM(256, return_sequences=True)(embedding)
    features = Conv1D(
        filters=100,
        kernel_size=2,
        padding='valid',
        activation='relu',
        strides=1,
    )(sequence)
    pooled = GlobalMaxPooling1D()(features)
    pooled = Dropout(0.2)(pooled)
    hidden = Dense(64, activation='relu')(pooled)
    return Dense(1, activation='sigmoid', name='classifier')(hidden)

def LSTM_CNN_V1(embedding):
    """LSTM followed by a three-stage Conv1D pyramid.

    Stack: LSTM(512, full sequence, dropout 0.25, recurrent dropout 0.1) ->
    three [Conv1D('same', relu) -> MaxPooling1D] stages with
    (filters, width) = (64, 7), (128, 5), (256, 3) -> Flatten ->
    Dense(256, relu) -> Dropout(0.2) -> single sigmoid unit.

    Args:
        embedding: Output tensor of the embedding layer.

    Returns:
        Output tensor of the final sigmoid layer (named ``classifier``).
    """
    x = LSTM(512, return_sequences=True, dropout=0.25, recurrent_dropout=0.1)(embedding)

    # Filters grow while the kernel narrows; each stage is followed by pooling.
    for n_filters, width in ((64, 7), (128, 5), (256, 3)):
        x = Conv1D(filters=n_filters, kernel_size=width, padding='same',
                   activation='relu', strides=1)(x)
        x = MaxPooling1D()(x)

    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    return Dense(1, activation='sigmoid', name='classifier')(x)

### Choose model

In [None]:
def create_model(model_name, model_ver, max_seq_len, max_features, embed_size, embedding_matrix):
    """Build an (uncompiled) model: Input -> Embedding -> selected head.

    Args:
        model_name: Architecture family, one of the keys of the registry
            below ('LSTM', 'BiLSTM', 'CNN+LSTM', 'LSTM+CNN').
        model_ver: Integer version of the head within that family.
        max_seq_len: Length of the padded input sequences.
        max_features: Vocabulary size, used as the Embedding input dimension.
        embed_size: Dimensionality of each embedding vector.
        embedding_matrix: Initial weights for the Embedding layer.

    Returns:
        A ``keras.Model`` mapping token-id sequences to the head's output.

    Raises:
        KeyError: If ``model_name`` is unknown or ``model_ver`` is not
            registered for it. The message lists the valid choices.
    """
    # Registry: architecture name -> {version: head-building function}.
    # 'LSTM' currently has no registered version.
    choose_model = {'LSTM': {},
                    'BiLSTM': {0: BiLSTM_V0},
                    'CNN+LSTM': {0: CNN_LSTM_V0, 1: CNN_LSTM_V1},
                    'LSTM+CNN': {0: LSTM_CNN_V0, 1: LSTM_CNN_V1}}

    # Validate before building any layer so a bad selection fails fast with a
    # message that says what is available.
    if model_name not in choose_model:
        raise KeyError(
            f"Unknown model_name {model_name!r}; available: {sorted(choose_model)}"
        )
    versions = choose_model[model_name]
    if model_ver not in versions:
        raise KeyError(
            f"No version {model_ver!r} registered for {model_name!r}; "
            f"available versions: {sorted(versions)}"
        )
    build_head = versions[model_ver]

    ## Embedding
    inputs = Input(shape=(max_seq_len,))
    embedding = Embedding(max_features, embed_size, weights=[embedding_matrix])(inputs)

    outputs = build_head(embedding)
    return keras.Model(inputs, outputs)

## Tokenize

In [None]:
# Sequence length after padding and vocabulary cap for the tokenizer.
# (embed_size is not set here; it is derived from the embedding file later.)
max_seq_len, max_features = 500, 10000

# Fit the vocabulary on the training reviews only, then encode every split
# with that same vocabulary.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_train['review'].tolist())
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(frame['review'])
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Pad / truncate every encoded review to exactly max_seq_len tokens.
X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=max_seq_len)
    for sequences in (X_train, X_val, X_test)
)

# Targets are the 'sentiment' column of each split.
y_train, y_val, y_test = (
    frame['sentiment'] for frame in (df_train, df_val, df_test)
)

## Embeddings

In [None]:
EMBEDDING_FILE = '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'


def get_coefs(word, *arr):
    """Turn one split embedding-file line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Read with an explicit encoding and a context manager so the file is decoded
# the same way on every platform and the handle is always closed.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip("\n").split(" "))
        for line in embedding_file
        if len(line) > 100  # skip lines too short to hold a full vector
    )

# np.stack needs a real sequence; a dict view is not accepted by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# One row per index the Embedding layer can receive. create_model sizes that
# layer with max_features, so the matrix must have exactly max_features rows
# even when the fitted vocabulary is smaller. Sizing it by len(word_index)
# left no row for the largest word index and raised IndexError below.
nb_words = max_features
# Words missing from the embedding file keep a random vector drawn from the
# same mean / std as the pretrained ones.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Create model

In [None]:
model_name = "CNN+LSTM"
model_ver = 0
LR = 1e-3
# Every registered head ends in Dense(1, activation='sigmoid'), so the model
# already outputs probabilities. from_logits must therefore be False;
# with True the loss would apply a second sigmoid to those probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = Adam(learning_rate=LR)
# Passed as a list, the form compile() documents for `metrics`.
metrics = [tf.metrics.BinaryAccuracy()]

model = create_model(model_name, model_ver, max_seq_len, max_features, embed_size, embedding_matrix)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

In [None]:
# Draw the model's layer graph, annotated with tensor shapes.
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
    dpi=96,
    # to_file='model.png',
)

## Model training

In [None]:
# Checkpointing: keep only the weights of the epoch with the best validation
# binary accuracy.
model_ckpt_path = f"[Glove]{model_name}_V{model_ver}_{max_seq_len}_{max_features}_{embed_size}.hdf5"
checkpoint = ModelCheckpoint(
    model_ckpt_path,
    monitor='val_binary_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)
callbacks_list = [checkpoint]

# Training run
print(f"Training model with [Glove]{model_name}_V{model_ver}_{max_seq_len}_{max_features}_{embed_size}\n")
train_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=callbacks_list,
)

In [None]:
# Plot accuracy and loss
history_dict = train_history.history
print(history_dict.keys())

# Distinct names are used for the curves so the `loss` object passed to
# model.compile() is not overwritten by a list of numbers.
train_acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(train_acc) + 1)
fig, (ax_loss, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))

ax_loss.plot(epochs, train_loss, 'r', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

ax_acc.plot(epochs, train_acc, 'r', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# tight_layout only has an effect once the axes, titles and labels exist, so
# it is called after plotting rather than on the empty figure.
fig.tight_layout()

In [None]:
# Save architecture model
# model.to_json() already returns a JSON-formatted string, so it is written
# verbatim. Passing it through json.dump would wrap it in a second layer of
# quoting (and `json` is not imported in this notebook).
config = model.to_json()
model_config_path = f"[Glove]{model_name}_V{model_ver}_{max_seq_len}_{max_features}_{embed_size}.json"
with open(model_config_path, "w") as outfile:
    outfile.write(config)

## Evaluate

In [None]:
# Restore the checkpointed weights, score the test split and print the metrics.
model.load_weights(model_ckpt_path)
y_pred_proba = model.predict(X_test)
get_metrics(y_test=y_test, y_pred_proba=y_pred_proba)