In [None]:
# import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Conv1D, MaxPooling1D, Lambda
from tensorflow.keras.preprocessing.sequence  import pad_sequences

import warnings
warnings.filterwarnings("ignore")

In [None]:
# ---- notebook-wide settings ----

# Model-related sizes:
#   VOCAB_SIZE - passed as `num_words` when loading the data set and as the
#                embedding layers' `input_dim`
#   EMBED_DIM  - default embedding width of the feed-forward model
VOCAB_SIZE = 5_000
EMBED_DIM = 128

# Fraction of the combined data set held out as the test split
test_size = 0.2

In [None]:
# function to get the data and do some preprocessing
def get_features_labels(vocab_size = VOCAB_SIZE, n_stopwords_to_skip = 0):
  """Load the Keras IMDB data set and return it as one padded feature matrix plus labels.

  The train/test partitions shipped with the data set are merged so that the
  caller can apply its own split afterwards.

  Args:
    vocab_size: forwarded to `load_data` as `num_words`.
    n_stopwords_to_skip: forwarded to `load_data` as `skip_top`.

  Returns:
    A tuple `(X_padded, y)`: the post-padded review sequences and the labels.

  Raises:
    AssertionError: if the merged features and labels differ in shape.
  """
  # unpack the partitions that ship with the data set straight from the loader
  (reviews_a, labels_a), (reviews_b, labels_b) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocab_size,
      skip_top=n_stopwords_to_skip,
      maxlen=None)

  # merge both partitions into a single pool of reviews / labels
  X = np.concatenate((reviews_a, reviews_b), axis=0)
  y = np.concatenate((labels_a, labels_b), axis=0)

  # sanity check: there must be exactly one label per review
  assert X.shape == y.shape, "Number of features doesn't match the number of labels"

  # pad at the end of each review; no `maxlen` is given, so the padded width
  # is determined by pad_sequences from the data itself
  return (pad_sequences(list(X), padding='post'), y)


# function to do the train and test splits
def train_test_split_data(X, y, test_size = test_size):
  """Split features and labels into train and test partitions.

  Args:
    X: feature array.
    y: label array aligned with `X`.
    test_size: forwarded to scikit-learn's `train_test_split`.

  Returns:
    `((X_train, y_train), (X_test, y_test))`.
  """
  features_train, features_test, labels_train, labels_test = train_test_split(
      X, y, test_size = test_size)

  # regroup so that each partition carries its own features and labels
  train_part = (features_train, labels_train)
  test_part = (features_test, labels_test)
  return (train_part, test_part)

In [None]:
# Load the full padded feature matrix X and the label vector y
# (default vocabulary size, no frequent words skipped).
X, y = get_features_labels()

# Length of each padded word sequence (second axis of X).
# NOTE: the model builders defined further down capture this value as the
# default of their `seq_length` parameter at definition time.
seq_length = X.shape[1]

# Split the data; these module-level names are read directly by
# logistic_regression_scores() and NN_scores().
(X_train, y_train), (X_test, y_test) = train_test_split_data(X, y, test_size = test_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# function to build a logistic regression model and print validation and test accuracy scores

def logistic_regression_scores():
  """Report cross-validated and held-out accuracy of a logistic regression model.

  Reads the module-level `X_train`, `y_train`, `X_test` and `y_test`.
  Prints the mean and standard deviation of the 5-fold cross-validation
  scores, then refits on the whole training split and prints the test score.
  Returns nothing.
  """
  # shuffled 5-fold splitter used for validation
  fold_splitter = KFold(shuffle=True, n_splits=5)

  # the classifier under evaluation
  estimator = LogisticRegression(max_iter= 100)

  # one score per fold
  results = cross_val_score(estimator, X_train, y_train, cv=fold_splitter)
  print(f"The average accuracy of Logistic Regression by kfold validation is { results.mean()*100 }% \nThe std is { results.std() }")

  # refit on the complete training split, then score on the held-out data
  log_test_acc = estimator.fit(X_train, y_train).score(X_test, y_test)
  print(f"The accuracy of Logistic Regression on test data is {log_test_acc*100}% ")


# function to build feed forward neural network
def build_feed_forward_NN_model(embed_dim = EMBED_DIM, vocab_size = VOCAB_SIZE, seq_length = seq_length):
  """Build and compile a feed-forward binary classifier over padded token-id sequences.

  Architecture: Embedding -> Flatten -> Dense(64) -> Dense(32) -> Dense(16)
  -> Dense(1, sigmoid).

  Args:
    embed_dim: width of each embedding vector (`output_dim` of the Embedding layer).
    vocab_size: `input_dim` of the Embedding layer.
    seq_length: number of token ids per input sequence. NOTE: the default is
      bound to the module-level `seq_length` when this `def` is executed, not
      when the function is called.

  Returns:
    A compiled Keras model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
  """
  # `shape` must be a tuple: `(seq_length)` is just an int, the trailing comma
  # makes it a 1-tuple. The variable is named `inputs` so the builtin `input`
  # is not shadowed.
  inputs = Input(shape = (seq_length,))
  x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
  x = Flatten()(x)
  x = Dense(64, activation = 'relu')(x)
  x = Dense(32, activation = 'relu')(x)
  x = Dense(16, activation = 'relu')(x)
  output = Dense(1, activation = 'sigmoid')(x)

  model = tf.keras.models.Model(inputs = inputs, outputs = output)

  # compile the model
  model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

  return model

def build_CNN_model(embed_dim = 32, vocab_size = VOCAB_SIZE, seq_length = seq_length):
  """Build and compile a 1-D convolutional binary classifier over padded token-id sequences.

  Architecture: Embedding -> Conv1D(16, 5) -> MaxPooling1D(2)
  -> Conv1D(32, 5) -> MaxPooling1D(2) -> Flatten -> Dense(1, sigmoid).

  The convolutions run along the token axis of the embedding output and treat
  the embedding dimensions as channels. The earlier version flattened the
  embedding and re-expanded it to a single-channel signal of length
  seq_length * embed_dim, so every kernel slid across individual embedding
  scalars and straddled token boundaries.

  Args:
    embed_dim: width of each embedding vector (`output_dim` of the Embedding layer).
    vocab_size: `input_dim` of the Embedding layer.
    seq_length: number of token ids per input sequence. NOTE: the default is
      bound to the module-level `seq_length` when this `def` is executed, not
      when the function is called.

  Returns:
    A compiled Keras model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
  """
  # `shape` must be a tuple: `(seq_length)` is just an int, the trailing comma
  # makes it a 1-tuple. The variable is named `inputs` so the builtin `input`
  # is not shadowed.
  inputs = Input(shape = (seq_length,))

  # (batch, seq_length, embed_dim): already the 3-D layout Conv1D consumes,
  # so no Flatten / expand_dims round trip is needed
  x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
  x = Conv1D(16, 5, activation = 'relu')(x)
  x = MaxPooling1D(2, padding = 'same')(x)
  x = Conv1D(32, 5, activation = 'relu')(x)
  x = MaxPooling1D(2, padding = 'same')(x)

  # collapse the remaining (steps, filters) axes for the output layer
  x = Flatten()(x)
  output = Dense(1, activation = 'sigmoid')(x)

  model = tf.keras.models.Model(inputs = inputs, outputs = output)

  # compile the model
  model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

  return model


def NN_scores(build_func, model_type):
  """Report cross-validated and held-out accuracy of a Keras model.

  Reads the module-level `X_train`, `y_train`, `X_test` and `y_test`.
  Prints the mean and standard deviation of the 5-fold cross-validation
  scores, then fits on the whole training split and prints the test score.
  Returns nothing.

  Args:
    build_func: callable handed to `KerasClassifier` as `build_fn`.
    model_type: label inserted into the printed messages.
  """
  # shuffled 5-fold splitter used for validation
  fold_splitter = KFold(shuffle=True, n_splits=5)

  # scikit-learn compatible wrapper so cross_val_score can drive the Keras model
  wrapped_model = KerasClassifier(build_fn= build_func, epochs=5, batch_size=128, verbose = 0)

  # one score per fold
  results = cross_val_score(wrapped_model, X_train, y_train, cv=fold_splitter)
  print(f"The average accuracy of {model_type} by kfold validation is { results.mean()*100 }% \nThe std is { results.std() }")

  # fit the wrapped network on the complete training split
  wrapped_model.fit(X_train, y_train)

  # accuracy of the fitted network on the held-out test split
  test_acc = wrapped_model.score(X_test, y_test)
  print(f"The accuracy of {model_type} on test data is {test_acc*100}% ")

In [None]:
# Baseline: print k-fold validation and test accuracy of logistic regression
# on the current module-level train/test split.
logistic_regression_scores()

The average accuracy of Logistic Regression by kfold validation is 50.21% 
The std is 0.0021365860619221477
The accuracy of Logistic Regression on test data is 51.09% 


In [None]:
# Summarize the model: this instance is built only to print its layer table.
build_feed_forward_NN_model().summary()
# NN_scores receives the builder function itself (not the instance above)
# and prints validation / test accuracy.
NN_scores(build_feed_forward_NN_model, "feed_forward_neural_network")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2494)]            0         
                                                                 
 embedding (Embedding)       (None, 2494, 128)         640000    
                                                                 
 flatten (Flatten)           (None, 319232)            0         
                                                                 
 dense (Dense)               (None, 64)                20430912  
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 1)                 17    

In [None]:
# Summarize the model: this instance is built only to print its layer table.
build_CNN_model().summary()
# NN_scores receives the builder function itself (not the instance above)
# and prints validation / test accuracy.
NN_scores(build_CNN_model, "convolutional_neural_network")

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 2494)]            0         
                                                                 
 embedding_7 (Embedding)     (None, 2494, 32)          160000    
                                                                 
 flatten_7 (Flatten)         (None, 79808)             0         
                                                                 
 tf.expand_dims (TFOpLambda)  (None, 79808, 1)         0         
                                                                 
 conv1d (Conv1D)             (None, 79804, 16)         96        
                                                                 
 max_pooling1d (MaxPooling1D  (None, 39902, 16)        0         
 )                                                               
                                                           

In [None]:
# Reload X and y with n_stopwords_to_skip=50, which get_features_labels
# forwards to the loader as `skip_top`.
# NOTE(review): presumably this drops the 50 most frequent tokens as a
# stop-word proxy - confirm against the Keras IMDB loader documentation.
X, y = get_features_labels(n_stopwords_to_skip= 50)

# Split the new data. This rebinds the module-level X_train / y_train /
# X_test / y_test that the scoring functions read, so every later cell runs
# on the filtered data. `seq_length` is not recomputed here.
(X_train, y_train), (X_test, y_test) = train_test_split_data(X, y, test_size = test_size)

In [None]:
# Summarize the model: this instance is built only to print its layer table.
build_CNN_model().summary()
# Same CNN evaluation as before; it reads whatever the module-level
# train/test split currently holds.
NN_scores(build_CNN_model, "convolutional_neural_network")

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 2494)]            0         
                                                                 
 embedding_14 (Embedding)    (None, 2494, 32)          160000    
                                                                 
 flatten_21 (Flatten)        (None, 79808)             0         
                                                                 
 tf.expand_dims_7 (TFOpLambd  (None, 79808, 1)         0         
 a)                                                              
                                                                 
 conv1d_14 (Conv1D)          (None, 79804, 16)         96        
                                                                 
 max_pooling1d_14 (MaxPoolin  (None, 39902, 16)        0         
 g1D)                                                     