## Setup

In [None]:
import glob
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.metrics import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load Dataset
Train / Validation / Test = 7 / 1 / 2

In [None]:
# Read the three pre-split CSV files, keeping only the text and label columns.
df_train, df_val, df_test = (
    pd.read_csv(csv_path, usecols=['review', 'sentiment'])
    for csv_path in (
        '../input/imdb-dataset/train.csv',
        '../input/imdb-dataset/val.csv',
        '../input/imdb-dataset/test.csv',
    )
)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df_train.info()
df_train

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df_val.info()
df_val

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df_test.info()
df_test

## Metrics

In [None]:
def get_metrics(y_test, y_pred_proba):
    """Print accuracy, ROC AUC and the confusion matrix for binary predictions.

    Args:
        y_test: ground-truth labels.
        y_pred_proba: predicted scores; values >= 0.5 are counted as the
            positive class for accuracy and the confusion matrix, while the
            raw scores are used for ROC AUC.

    Returns:
        None. The three metrics are printed (scores rounded to 4 decimals).
    """
    # Hard decisions at the 0.5 threshold, shared by accuracy and confusion matrix.
    predicted_labels = y_pred_proba >= 0.5

    accuracy = accuracy_score(y_test, predicted_labels)
    print('ACCURACY_SCORE: ', round(accuracy, 4))

    auc = roc_auc_score(y_test, y_pred_proba)
    print('ROC_AUC_SCORE: ', round(auc, 4))

    matrix = confusion_matrix(y_test, predicted_labels)
    print('CONFUSION_MATRIX:\n', matrix, '\n')

## Data Cleaning
Ref: https://www.kaggle.com/colearninglounge/nlp-data-preprocessing-and-cleaning

Raw text gave better results than preprocessed text, so the reviews are passed to the tokenizer without any cleaning.

## Model

### LSTM

In [None]:
def LSTM_V0(embedding):
    """Plain-LSTM classifier head (body elided in this notebook).

    Args:
        embedding: tensor to build the head on; the other heads in this
            notebook receive the Embedding layer's output here.

    Returns:
        `outputs`, which is expected to be produced by the elided body.
    """
    # NOTE(review): the layers are omitted ("#..."), so `outputs` is never
    # assigned in the visible code and calling this as-is raises NameError.
    # create_model registers no version under 'LSTM', so it is not reachable
    # from there.
    #...
    return outputs

### CNN

In [None]:
def CNN_V0(embedding):
    """Stack three Conv1D + MaxPooling1D stages and a dense sigmoid classifier.

    Args:
        embedding: tensor to convolve over (create_model passes the output of
            its Embedding layer).

    Returns:
        Output tensor of the final single-unit sigmoid layer named 'classifier'.
    """
    # (filters, kernel width) for each convolutional stage, widest kernel first.
    conv_stages = ((128, 7), (256, 5), (512, 3))

    x = embedding
    for n_filters, kernel_width in conv_stages:
        x = Conv1D(n_filters, kernel_width, activation='relu', padding='same')(x)
        x = MaxPooling1D()(x)

    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    return Dense(1, activation='sigmoid', name='classifier')(x)

### BiLSTM

In [None]:
def BiLSTM_V0(embedding):
    """Bidirectional LSTM, average-pooled over time, with a dense sigmoid classifier.

    Args:
        embedding: tensor to run the recurrent layer over (create_model passes
            the output of its Embedding layer).

    Returns:
        Output tensor of the final single-unit sigmoid layer named 'classifier'.
    """
    # return_sequences=True keeps every timestep so they can be averaged below.
    sequence_states = Bidirectional(LSTM(units=32, return_sequences=True))(embedding)
    pooled = GlobalAveragePooling1D()(sequence_states)
    hidden = Dense(20, activation='relu')(pooled)
    hidden = Dropout(rate=0.5)(hidden)
    return Dense(1, activation='sigmoid', name='classifier')(hidden)

### CNN + LSTM

In [None]:
def CNN_LSTM_V0(embedding):
    """Single Conv1D stage feeding an LSTM, followed by a dense sigmoid classifier.

    Args:
        embedding: tensor to build on (create_model passes the output of its
            Embedding layer).

    Returns:
        Output tensor of the final single-unit sigmoid layer named 'classifier'.
    """
    # Layers are listed in the order they are applied.
    pipeline = [
        Dropout(0.3),
        Conv1D(200, 5, activation='relu'),
        MaxPooling1D(pool_size=2),
        LSTM(100),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid', name='classifier'),
    ]

    tensor = embedding
    for layer in pipeline:
        tensor = layer(tensor)
    return tensor

def CNN_LSTM_V1(embedding):
    """Four parallel Conv1D -> LSTM channels merged into a dense sigmoid classifier.

    Each channel convolves the same input with a different kernel size
    (3*32, 5*32, 7*32 and 9*32), then applies max pooling, dropout, batch
    normalization and an LSTM. The four LSTM outputs are concatenated and
    passed through Dense(100) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        embedding: tensor shared by all channels (create_model passes the
            output of its Embedding layer). No padding is applied in the
            convolutions, so the sequence axis must be at least 9*32 long.

    Returns:
        Output tensor of the final single-unit sigmoid layer named 'classifier'.
    """

    def _channel(kernel_size):
        """Build one Conv1D -> pool -> dropout -> batch-norm -> LSTM branch."""
        net = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(embedding)
        net = MaxPooling1D(pool_size=2)(net)
        net = Dropout(0.5)(net)
        net = BatchNormalization()(net)
        return LSTM(128)(net)

    # Channels 1-4 differ only in kernel size.
    branches = [_channel(multiplier * 32) for multiplier in (3, 5, 7, 9)]

    merged = concatenate(branches)
    dense = Dense(100, activation='relu')(merged)
    drop = Dropout(0.2)(dense)
    # Fix: the classifier previously read from `merged`, which left the
    # Dense(100)/Dropout(0.2) layers disconnected from the model.
    outputs = Dense(1, activation='sigmoid', name='classifier')(drop)
    return outputs

### LSTM + CNN

In [None]:
def LSTM_CNN_V0(embedding):
    """Bidirectional LSTM followed by three Conv1D + MaxPooling1D stages and a dense classifier.

    Args:
        embedding: tensor to run the recurrent layer over (create_model passes
            the output of its Embedding layer).

    Returns:
        Output tensor of the final single-unit sigmoid layer named 'classifier'.
    """
    # The recurrent layer returns the full sequence so the convolutions can slide over it.
    x = Bidirectional(LSTM(64, return_sequences=True))(embedding)

    # (filters, kernel width) for each convolutional stage.
    for n_filters, kernel_width in ((128, 7), (256, 5), (512, 3)):
        x = Conv1D(n_filters, kernel_width, activation='relu', padding='same')(x)
        x = MaxPooling1D()(x)

    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    return Dense(1, activation='sigmoid', name='classifier')(x)

### Choose model

In [None]:
def create_model(model_name, model_ver, max_seq_len, max_features, embed_size, embedding_matrix):
    """Build a Keras model: Input -> pretrained Embedding -> selected classifier head.

    Args:
        model_name: architecture family, one of the keys of the registry below
            ('LSTM', 'CNN', 'BiLSTM', 'CNN+LSTM', 'LSTM+CNN').
        model_ver: integer version of the head within that family.
        max_seq_len: length of the (padded) input token sequences.
        max_features: vocabulary size, used as the Embedding input dimension.
        embed_size: dimensionality of the embedding vectors.
        embedding_matrix: initial embedding weights of shape
            (max_features, embed_size).

    Returns:
        An uncompiled `keras.Model` mapping token-id sequences to the head's output.

    Raises:
        KeyError: if no head is registered for (model_name, model_ver).
        ValueError: if embedding_matrix does not have shape (max_features, embed_size).
    """

    ## Create dictionary: architecture family -> {version: head builder}
    choose_model = {'LSTM':{},
                    'CNN':{0: CNN_V0,},
                    'BiLSTM':{0: BiLSTM_V0,},
                    'CNN+LSTM':{0: CNN_LSTM_V0, 1: CNN_LSTM_V1},
                    'LSTM+CNN':{0: LSTM_CNN_V0}}

    # Resolve the head first so an unsupported choice fails with a readable
    # message (still a KeyError, as before) instead of a bare key.
    try:
        build_head = choose_model[model_name][model_ver]
    except KeyError:
        available = {name: sorted(versions) for name, versions in choose_model.items() if versions}
        raise KeyError(
            f"No model registered for model_name={model_name!r}, model_ver={model_ver!r}; "
            f"available: {available}"
        ) from None

    # The Embedding layer's weight matrix is (input_dim, output_dim); check it
    # here so a mismatch is reported clearly rather than from inside Keras.
    expected_shape = (max_features, embed_size)
    actual_shape = tuple(np.shape(embedding_matrix))
    if actual_shape != expected_shape:
        raise ValueError(
            f"embedding_matrix has shape {actual_shape}, expected {expected_shape} "
            "(max_features, embed_size)"
        )

    ## Embedding
    inputs = Input(shape=(max_seq_len,))
    embedding = Embedding(max_features,embed_size,weights=[embedding_matrix])(inputs)

    outputs = build_head(embedding)
    return keras.Model(inputs, outputs)

## Tokenize

In [None]:
max_seq_len = 500
max_features = 20000

# Fit the vocabulary on the training reviews only, then encode every split with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_train['review'].tolist())

X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(split_frame['review'])
    for split_frame in (df_train, df_val, df_test)
)

In [None]:
# Pad/truncate every encoded review to max_seq_len tokens.
X_train, X_val, X_test = (
    pad_sequences(encoded_reviews, maxlen=max_seq_len)
    for encoded_reviews in (X_train, X_val, X_test)
)

# Labels come straight from the 'sentiment' column of each split.
y_train, y_val, y_test = (
    split_frame['sentiment']
    for split_frame in (df_train, df_val, df_test)
)

## Embeddings
Ref: https://www.kaggle.com/colearninglounge/nlp-model-building-transformers-attention-more#Build-a-Static-Semantic-Embedding-Neural-Network(LSTM)-Baseline

In [None]:
EMBEDDING_FILE = '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'

def get_coefs(word, *arr):
    """Turn the fields of one GloVe line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')

# Read the GloVe vectors. The context manager closes the file handle, and the
# explicit encoding avoids depending on the platform default. Lines of 100
# characters or fewer are skipped, as before.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*o.rstrip("\n").split(" ")) for o in glove_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict_values view is rejected by newer NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# The matrix must have exactly max_features rows to match Embedding(max_features, ...).
# Sizing it by len(word_index) raised IndexError for vocabularies smaller than
# max_features (word indices start at 1) and mismatched the Embedding layer.
nb_words = max_features
# Words without a GloVe vector keep a random row drawn from the GloVe mean/std.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

## Create model

In [None]:
model_name = "LSTM+CNN"
model_ver = 0
LR = 1e-3
# Every model head in this notebook ends in a sigmoid, so the network already
# outputs probabilities. from_logits=True would apply a second sigmoid inside
# the loss and distort the gradients, hence from_logits=False.
loss = BinaryCrossentropy(from_logits=False)
optimizer = Adam(learning_rate = LR)
metrics = [BinaryAccuracy(), AUC()]

model = create_model(model_name, model_ver, max_seq_len, max_features, embed_size, embedding_matrix)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

In [None]:
# Render the model graph, including the tensor shapes at every layer
plot_model(model, show_shapes=True, dpi=96)

## Model training

In [None]:
# Checkpointing: write the weights whenever validation accuracy reaches a new best.
# The "{epoch:02d}" placeholder is filled in by Keras with the epoch number.
model_ckpt_path = f"[Glove-200d]{model_name}-V{model_ver}"+"_epoch{epoch:02d}.hdf5"
checkpoint = ModelCheckpoint(
    model_ckpt_path,
    monitor='val_binary_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)
callbacks_list = [checkpoint]

# Fit on the training split, validating on the held-out validation split.
print(f"Training model with [Glove-200d]{model_name}-V{model_ver}\n")
train_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val,y_val),
    epochs=5,
    batch_size=64,
    verbose=1,
    callbacks=callbacks_list,
)

In [None]:
# Plot accuracy and loss
history_dict = train_history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Named train_loss so the Keras loss object bound to `loss` in the compile
# cell is not overwritten with a list of floats.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)
fig, (ax_loss, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))

ax_loss.plot(epochs, train_loss, 'r', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

ax_acc.plot(epochs, acc, 'r', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# tight_layout only has an effect once the axes exist; it was previously
# called on the empty figure, so titles and labels could overlap.
fig.tight_layout()
plt.show()

## Evaluate

In [None]:
ls -d *hdf5

In [None]:
# Evaluate every checkpoint that training actually wrote. With
# save_best_only=True only improving epochs are saved, so a hard-coded list of
# epoch numbers can point at files that do not exist.
ckpt_prefix = f"[Glove-200d]{model_name}-V{model_ver}_epoch"
ckpt_suffix = ".hdf5"
# glob.escape keeps the literal "[Glove-200d]" from being read as a character class.
ckpt_files = sorted(glob.glob(glob.escape(ckpt_prefix) + "*" + ckpt_suffix))
if not ckpt_files:
    raise FileNotFoundError(f"No checkpoint files matching {ckpt_prefix}*{ckpt_suffix}")

# Epoch labels ("01", "02", ...) recovered from the file names.
list_path = [ckpt_file[len(ckpt_prefix):-len(ckpt_suffix)] for ckpt_file in ckpt_files]
for path in list_path:
    print(f"Epoch {path} \n")
    model_ckpt_path = f"[Glove-200d]{model_name}-V{model_ver}_epoch{path}.hdf5"
    model.load_weights(model_ckpt_path)
    # The single-unit output comes back as one column; flatten it to a 1-D
    # score vector for the sklearn metrics.
    y_pred_proba = model.predict(X_test).ravel()
    get_metrics(y_test, y_pred_proba)